From 0c8b999404cb73c4abe88dbd4e3b2b413ba88578 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 30 Nov 2021 00:30:34 -0600 Subject: [PATCH 001/124] add scale api and test (#37683) --- paddle/fluid/operators/scale_op.cc | 10 ++--- paddle/pten/api/include/math.h | 18 +++++--- paddle/pten/api/lib/math.cc | 39 +++++++++++++++++ paddle/pten/include/math.h | 17 +------- paddle/pten/kernels/cpu/math.cc | 37 ++-------------- paddle/pten/kernels/cpu/math.h | 11 +---- paddle/pten/kernels/cuda/math.cu | 38 ++--------------- paddle/pten/kernels/cuda/math.h | 11 +---- paddle/pten/tests/api/CMakeLists.txt | 1 + paddle/pten/tests/api/test_scale_api.cc | 57 +++++++++++++++++++++++++ 10 files changed, 126 insertions(+), 113 deletions(-) create mode 100644 paddle/pten/tests/api/test_scale_api.cc diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index c24f924313fb90..4e9c84ef4c9503 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -75,14 +75,14 @@ class ScaleOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { if (ctx.InputVar("X")->IsType() || ctx.InputVar("X")->IsType()) { + std::string scale_attr; if (ctx.HasInput("ScaleTensor")) { - return framework::KernelSignature("scale.host", {"X", "ScaleTensor"}, - {"bias", "bias_after_scale"}, - {"Out"}); + scale_attr = "ScaleTensor"; } else { - return framework::KernelSignature( - "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"}); + scale_attr = "scale"; } + return framework::KernelSignature( + "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"}); } // TODO(chenweihang): support other cases after selected rows added return framework::KernelSignature("scale.unregistered", {}, {}, {}); diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h index 149500c546dfd3..700af6d2d59116 100644 --- a/paddle/pten/api/include/math.h +++ b/paddle/pten/api/include/math.h @@ -15,16 +15,11 @@ limitations under the License. */ #pragma once #include "paddle/pten/api/include/tensor.h" +#include "paddle/pten/common/scalar.h" namespace paddle { namespace experimental { -// TODO(chenweihang): add scale API -// TODO(chenweihang): move mean API into stat.h/cc -PD_DLL_DECL Tensor mean(const Tensor& x, - const std::vector& axis, - bool keep_dim); - PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y); PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y); @@ -33,10 +28,21 @@ PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y); PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y); +// TODO(chenweihang): move mean API into stat.h/cc +PD_DLL_DECL Tensor mean(const Tensor& x, + const std::vector& axis, + bool keep_dim); + PD_DLL_DECL Tensor sum(const Tensor& x, const std::vector& axis, DataType dtype, bool keep_dim); +// TODO(chenweihang): Follow-up discussion on the handling of `act` argument +PD_DLL_DECL Tensor scale(const Tensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale); + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc index bd2567ddb15063..a97d78b5a9d6f0 100644 --- a/paddle/pten/api/lib/math.cc +++ b/paddle/pten/api/lib/math.cc @@ -274,6 +274,45 @@ PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y) { return out; } + +PD_DLL_DECL Tensor scale(const Tensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale) { + // 1. 
Get kernel signature and kernel + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( + "scale", kernel_key); + + // 2. Get Device Context + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + auto kernel_context = pten::KernelContext(dev_ctx); + + // 3. Auto data transform + auto dense_x = std::dynamic_pointer_cast(x.impl()); + kernel_context.EmplaceBackInput(dense_x); + kernel_context.EmplaceBackAttr(pten::Scalar(scale)); + kernel_context.EmplaceBackAttr(bias); + kernel_context.EmplaceBackAttr(bias_after_scale); + + // 4. InferMeta + auto out_meta = UnchangedInferMeta(dense_x->meta()); + + // 5. Prepare outputs + Tensor out; + const auto allocator = std::make_shared( + pten::TransToFluidPlace(kernel_key.backend())); + auto dense_out = std::make_shared(allocator, out_meta); + kernel_context.EmplaceBackOutput(dense_out); + out.set_impl(dense_out); + + // 6. Call kernel + kernel(&kernel_context); + + return out; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h index c6528d85c27cc0..c2b9f75bda0449 100644 --- a/paddle/pten/include/math.h +++ b/paddle/pten/include/math.h @@ -78,7 +78,7 @@ DenseTensor Sum(const ContextT& dev_ctx, template DenseTensor Scale(const ContextT& dev_ctx, const DenseTensor& x, - float scale, + const Scalar& scale, float bias, bool bias_after_scale) { auto out_meta = UnchangedInferMeta(x.meta()); @@ -90,21 +90,6 @@ DenseTensor Scale(const ContextT& dev_ctx, return dense_out; } -template -DenseTensor Scale(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& scale, - float bias, - bool bias_after_scale) { - auto out_meta = UnchangedInferMeta(x.meta()); - const auto allocator = - std::make_shared( - dev_ctx.GetPlace()); - pten::DenseTensor dense_out(allocator, out_meta); - ScaleHost(dev_ctx, x, scale, bias, bias_after_scale, &dense_out); - return dense_out; -} - template DenseTensor Add(const ContextT& dev_ctx, const DenseTensor& x, diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc index 634b5231da266d..05ca7a3ae52446 100644 --- a/paddle/pten/kernels/cpu/math.cc +++ b/paddle/pten/kernels/cpu/math.cc @@ -50,28 +50,12 @@ void Mean(const CPUContext& dev_ctx, template void Scale(const CPUContext& dev_ctx, const DenseTensor& x, - float scale, + const Scalar& scale, float bias, bool bias_after_scale, DenseTensor* out) { - eigen::Scale(dev_ctx, x, scale, bias, bias_after_scale, out); -} - -// TODO(chenweihang): now the ScaleTensor's dtype are same as x, so we cannot -// register its dtype def -template -void ScaleHost(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { - eigen::Scale(dev_ctx, - x, - static_cast(*scale.data()), - bias, - bias_after_scale, - out); + eigen::Scale( + dev_ctx, x, scale.to(), bias, bias_after_scale, out); } template @@ -145,20 +129,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} -PT_REGISTER_KERNEL("scale.host", - CPU, - ANY, - pten::ScaleHost, - float, - double, - paddle::platform::bfloat16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(1).SetBackend(pten::Backend::CPU); -} + PT_REGISTER_KERNEL("elementwise_add", CPU, ANY, diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h index c06d40e57799fb..31532f38f6e49f 100644 --- 
a/paddle/pten/kernels/cpu/math.h +++ b/paddle/pten/kernels/cpu/math.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/pten/common/scalar.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" @@ -40,19 +41,11 @@ void Mean(const CPUContext& dev_ctx, template void Scale(const CPUContext& dev_ctx, const DenseTensor& x, - float scale, + const Scalar& scale, float bias, bool bias_after_scale, DenseTensor* out); -template -void ScaleHost(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& scale, - float bias, - bool bias_after_scale, - DenseTensor* out); - template void ElementwiseAdd(const CPUContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index bc5582926a4006..8d6abc92855305 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -79,30 +79,12 @@ void Mean(const CUDAContext& dev_ctx, template void Scale(const CUDAContext& dev_ctx, const DenseTensor& x, - float scale, + const Scalar& scale, float bias, bool bias_after_scale, DenseTensor* out) { - eigen::Scale(dev_ctx, x, scale, bias, bias_after_scale, out); -} - -template -void ScaleHost(const CUDAContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { - PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(scale.place()), - false, - paddle::platform::errors::InvalidArgument( - "Scale argument isn't a host tensor.")); - eigen::Scale(dev_ctx, - x, - static_cast(*scale.data()), - bias, - bias_after_scale, - out); + eigen::Scale( + dev_ctx, x, scale.to(), bias, bias_after_scale, out); } // Create the definition of ElementwiseAdd @@ -150,20 +132,6 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} -PT_REGISTER_KERNEL("scale.host", - CUDA, - ANY, - pten::ScaleHost, - float, - double, - float16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) { - kernel->InputAt(1).SetBackend(pten::Backend::CPU); -} PT_REGISTER_KERNEL("elementwise_add", CUDA, ANY, diff --git a/paddle/pten/kernels/cuda/math.h b/paddle/pten/kernels/cuda/math.h index dcee649d7d82d5..0ac55f1f879507 100644 --- a/paddle/pten/kernels/cuda/math.h +++ b/paddle/pten/kernels/cuda/math.h @@ -17,6 +17,7 @@ limitations under the License. */ // CUDA and HIP use same api #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/pten/common/scalar.h" #include "paddle/pten/core/dense_tensor.h" // See Note [ Why still include the fluid headers? 
] @@ -42,19 +43,11 @@ void Mean(const CUDAContext& dev_ctx, template void Scale(const CUDAContext& dev_ctx, const DenseTensor& x, - float scale, + const Scalar& scale, float bias, bool bias_after_scale, DenseTensor* out); -template -void ScaleHost(const CUDAContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& scale, - float bias, - bool bias_after_scale, - DenseTensor* out); - template void ElementwiseAdd(const CUDAContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index 207d8f35b4c453..c670d094810198 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -20,3 +20,4 @@ cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS pten_tensor pten_api pten cc_test(test_to_api SRCS test_to_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_slice_api SRCS test_slice_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_sum_api SRCS test_sum_api.cc DEPS pten_tensor pten_api pten_api_utils) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/test_scale_api.cc b/paddle/pten/tests/api/test_scale_api.cc new file mode 100644 index 00000000000000..2c0cd5cc71d8ee --- /dev/null +++ b/paddle/pten/tests/api/test_scale_api.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/api/include/creation.h" +#include "paddle/pten/api/include/math.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace paddle { +namespace tests { + +namespace framework = paddle::framework; +using DDim = paddle::framework::DDim; + +void CheckScaleResult(Tensor* out) { + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 3); + ASSERT_EQ(out->dims()[1], 4); + ASSERT_EQ(out->numel(), 12); + ASSERT_EQ(out->is_cpu(), true); + ASSERT_EQ(out->type(), pten::DataType::FLOAT32); + ASSERT_EQ(out->layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out->initialized(), true); + for (int64_t i = 0; i < out->numel(); ++i) { + ASSERT_EQ(out->mutable_data()[i], 3.0); + } +} + +TEST(API, scale) { + // 1. check `scale` is float value + auto x = experimental::full({3, 4}, 1.0, pten::DataType::FLOAT32); + auto out1 = experimental::scale(x, 2.0, 1.0, true); + CheckScaleResult(&out1); + + // 2. 
check `scale` is Tensor with shape [1] + auto scale = experimental::full({1}, 2.0, pten::DataType::FLOAT32); + auto out2 = experimental::scale(x, scale, 1.0, true); + CheckScaleResult(&out2); +} + +} // namespace tests +} // namespace paddle From 2f4c089bbb995ddd02967663d0a9f25481c69531 Mon Sep 17 00:00:00 2001 From: andyjpaddle <87074272+andyjpaddle@users.noreply.github.com> Date: Tue, 30 Nov 2021 14:37:27 +0800 Subject: [PATCH 002/124] Add diff op (#37441) * add diff op, test=develop * rm some notes, test=develop * update diff doc * update sample code * fix diff api params and example code, test=develop --- python/paddle/__init__.py | 4 +- .../fluid/tests/unittests/test_diff_op.py | 214 ++++++++++++++++++ python/paddle/tensor/__init__.py | 4 +- python/paddle/tensor/math.py | 163 +++++++++++++ 4 files changed, 383 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_diff_op.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 5823cf460ee9f1..af42cbfc88eb1a 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -223,6 +223,7 @@ from .tensor.math import digamma # noqa: F401 from .tensor.math import neg # noqa: F401 from .tensor.math import lgamma # noqa: F401 +from .tensor.math import diff # noqa: F401 from .tensor.random import multinomial # noqa: F401 from .tensor.random import standard_normal # noqa: F401 @@ -531,5 +532,6 @@ 'broadcast_tensors', 'einsum', 'set_flags', - 'get_flags' + 'get_flags', + 'diff' ] diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py new file mode 100644 index 00000000000000..345dad54132bc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -0,0 +1,214 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core + + +class TestDiffOp(unittest.TestCase): + def set_args(self): + self.input = np.array([1, 4, 5, 2]).astype('float32') + self.n = 1 + self.axis = -1 + self.prepend = None + self.append = None + + def get_output(self): + if self.prepend is not None and self.append is not None: + self.output = np.diff( + self.input, + n=self.n, + axis=self.axis, + prepend=self.prepend, + append=self.append) + elif self.prepend is not None: + self.output = np.diff( + self.input, n=self.n, axis=self.axis, prepend=self.prepend) + elif self.append is not None: + self.output = np.diff( + self.input, n=self.n, axis=self.axis, append=self.append) + else: + self.output = np.diff(self.input, n=self.n, axis=self.axis) + + def setUp(self): + self.set_args() + self.get_output() + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self.input, place=place) + if self.prepend is not None: + self.prepend = paddle.to_tensor(self.prepend, place=place) + if self.append is not None: + self.append = paddle.to_tensor(self.append, place=place) + out = paddle.diff( + x, + n=self.n, + axis=self.axis, + prepend=self.prepend, + append=self.append) + self.assertTrue((out.numpy() == self.output).all(), True) + + def test_static(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = paddle.fluid.data( + name="input", + shape=self.input.shape, + dtype=self.input.dtype) + has_pend = False + prepend = None + append = None + if self.prepend is not None: + has_pend = True + prepend = paddle.fluid.data( + name="prepend", + shape=self.prepend.shape, + dtype=self.prepend.dtype) + if self.append is not None: + has_pend = True + append = paddle.fluid.data( + name="append", + shape=self.append.shape, + dtype=self.append.dtype) + + exe = fluid.Executor(place) + out = paddle.diff( + x, n=self.n, axis=self.axis, prepend=prepend, append=append) + fetches = exe.run(fluid.default_main_program(), + feed={ + "input": self.input, + "prepend": self.prepend, + "append": self.append + }, + fetch_list=[out]) + self.assertTrue((fetches[0] == self.output).all(), True) + + def test_grad(self): + for place in self.places: + x = paddle.to_tensor(self.input, place=place, stop_gradient=False) + if self.prepend is not None: + self.prepend = paddle.to_tensor(self.prepend, place=place) + if self.append is not None: + self.append = paddle.to_tensor(self.append, place=place) + out = paddle.diff( + x, + n=self.n, + axis=self.axis, + prepend=self.prepend, + append=self.append) + try: + out.backward() + x_grad = x.grad + except: + raise RuntimeError("Check Diff Gradient Failed") + + +class TestDiffOpAxis(TestDiffOp): + def set_args(self): + self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32') + self.n = 1 + self.axis = 0 + self.prepend = None + self.append = None + + +class TestDiffOpNDim(TestDiffOp): + def set_args(self): + self.input = np.random.rand(10, 10).astype('float32') + self.n = 1 + self.axis = -1 + self.prepend = None + self.append = None + + +class TestDiffOpBool(TestDiffOp): + def set_args(self): + self.input = np.array([0, 1, 1, 0, 
1, 0]).astype('bool') + self.n = 1 + self.axis = -1 + self.prepend = None + self.append = None + + +class TestDiffOpPrepend(TestDiffOp): + def set_args(self): + self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32') + self.n = 1 + self.axis = -1 + self.prepend = np.array([[2, 3, 4], [1, 3, 5]]).astype('float32') + self.append = None + + +class TestDiffOpPrependAxis(TestDiffOp): + def set_args(self): + self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32') + self.n = 1 + self.axis = 0 + self.prepend = np.array( + [[0, 2, 3, 4], [1, 3, 5, 7], [2, 5, 8, 0]]).astype('float32') + self.append = None + + +class TestDiffOpAppend(TestDiffOp): + def set_args(self): + self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32') + self.n = 1 + self.axis = -1 + self.prepend = None + self.append = np.array([[2, 3, 4], [1, 3, 5]]).astype('float32') + + +class TestDiffOpAppendAxis(TestDiffOp): + def set_args(self): + self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32') + self.n = 1 + self.axis = 0 + self.prepend = None + self.append = np.array([[2, 3, 4, 1]]).astype('float32') + + +class TestDiffOpPreAppend(TestDiffOp): + def set_args(self): + self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32') + self.n = 1 + self.axis = -1 + self.prepend = np.array([[0, 4], [5, 9]]).astype('float32') + self.append = np.array([[2, 3, 4], [1, 3, 5]]).astype('float32') + + +class TestDiffOpPreAppendAxis(TestDiffOp): + def set_args(self): + self.input = np.array([[1, 4, 5, 2], [1, 5, 4, 2]]).astype('float32') + self.n = 1 + self.axis = 0 + self.prepend = np.array([[0, 4, 5, 9], [5, 9, 2, 3]]).astype('float32') + self.append = np.array([[2, 3, 4, 7], [1, 3, 5, 6]]).astype('float32') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 21d1dd1793b2cb..23e956b6590d4c 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -189,6 +189,7 @@ from .math import neg # noqa: F401 from .math import lgamma # noqa: F401 from .math import diagonal # noqa: F401 +from .math import diff # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -400,7 +401,8 @@ 'uniform_', 'multi_dot', 'solve', - 'triangular_solve' + 'triangular_solve', + 'diff' ] #this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index f5f0b5ed0873c1..0a5930d91adbba 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2611,3 +2611,166 @@ def atan2(x, y, name=None): helper.append_op( type='atan2', inputs=inputs, outputs={'Out': out}) return out + + +def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): + r""" + Computes the n-th forward difference along the given axis. + The first-order differences is computed by using the following formula: + + .. math:: + + out[i] = x[i+1] - x[i] + + Higher-order differences are computed by using paddle.diff() recursively. + Only n=1 is currently supported. + + Args: + x(Tensor): The input tensor to compute the forward difference on + n(int, optional): The number of times to recursively compute the difference. + Only support n=1. Default:1 + axis(int, optional): The axis to compute the difference along. Default:-1 + prepend(Tensor, optional): The tensor to prepend to input along axis before computing the difference. 
+ It's dimensions must be equivalent to that of x, + and its shapes must match x's shape except on axis. + append(Tensor, optional): The tensor to append to input along axis before computing the difference, + It's dimensions must be equivalent to that of x, + and its shapes must match x's shape except on axis. + name(str|None): A name for this layer(optional). If set None, + the layer will be named automatically. + + Returns: + Tensor: The output tensor with same dtype with x. + + Examples: + .. code-block:: python + + import paddle + x = paddle.to_tensor([1, 4, 5, 2]) + out = paddle.diff(x) + print(out) + # out: + # [3, 1, -3] + + y = paddle.to_tensor([7, 9]) + out = paddle.diff(x, append=y) + print(out) + # out: + # [3, 1, -3, 5, 2] + + z = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + out = paddle.diff(z, axis=0) + print(out) + # out: + # [[3, 3, 3]] + out = paddle.diff(z, axis=1) + print(out) + # out: + # [[1, 1], [1, 1]] + + + """ + + if axis < 0: + axis = axis + len(x.shape) + if axis > len(x.shape): + axis = len(x.shape) + if axis < 0: + axis = 0 + dtype = x.dtype + axes = [axis] + infer_flags = list(1 for i in range(len(axes))) + if in_dygraph_mode(): + has_pend = False + input_list = [] + if prepend is not None and append is not None: + input_list = [prepend, x, append] + has_pend = True + elif prepend is not None: + input_list = [prepend, x] + has_pend = True + elif append is not None: + input_list = [x, append] + has_pend = True + if has_pend: + new_input = _C_ops.concat(input_list, 'axis', axis) + else: + new_input = x + + attrs_1 = () + attrs_2 = () + + dim_len = new_input.shape[axis] + + starts_1 = [0] + attrs_1 += ('starts', starts_1) + ends_1 = [dim_len - 1] + attrs_1 += ('ends', ends_1) + input_front = _C_ops.slice(new_input, None, None, 'axes', axes, \ + 'infer_flags', infer_flags, *attrs_1) + starts_2 = [1] + attrs_2 += ('starts', starts_2) + ends_2 = [dim_len] + attrs_2 += ('ends', ends_2) + input_back = _C_ops.slice(new_input, None, None, 'axes', axes, \ + 'infer_flags', infer_flags, *attrs_2) + + if x.dtype == paddle.bool: + op = getattr(_C_ops, "logical_xor") + out = op(input_back, input_front) + else: + out = layers.elementwise_sub(input_back, input_front, axis=axis) + return out + else: + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'bool', 'int32', 'int64'], 'diff') + check_type(axis, 'axis', (int), 'diff') + helper = LayerHelper('diff', **locals()) + has_pend = False + input_list = [] + if prepend is not None and append is not None: + input_list = [prepend, x, append] + has_pend = True + elif prepend is not None: + input_list = [prepend, x] + has_pend = True + elif append is not None: + input_list = [x, append] + has_pend = True + + if has_pend: + new_input = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='concat', inputs={'X': input_list}, outputs={'Out': [new_input]}, attrs={'axis': axis} + ) + else: + new_input = x + + dim_len = new_input.shape[axis] + attrs_1 = {'axes': axes} + starts_1 = [0] + ends_1 = [dim_len - 1] + attrs_1['starts'] = starts_1 + attrs_1['ends'] = ends_1 + input_front = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='slice', inputs={'Input': new_input}, attrs=attrs_1, outputs={'Out': input_front} + ) + attrs_2 = {'axes': axes} + starts_2 = [1] + ends_2 = [dim_len] + attrs_2['starts'] = starts_2 + attrs_2['ends'] = ends_2 + input_back = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='slice', inputs={'Input': new_input}, attrs=attrs_2, 
outputs={'Out': input_back} + ) + + if dtype == paddle.bool: + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='logical_xor', inputs={"X": input_back, "Y": input_front}, outputs={"Out": out} + ) + else: + out = layers.elementwise_sub(input_back, input_front, axis=axis) + + return out From 1514eec6e6d0202846ef5e6781cbfa73aa9cf283 Mon Sep 17 00:00:00 2001 From: zhaocaibei123 <48509226+zhaocaibei123@users.noreply.github.com> Date: Tue, 30 Nov 2021 14:37:55 +0800 Subject: [PATCH 003/124] pscore global shuffle&default accessor config (#37626) --- .../framework/distributed_strategy.proto | 81 +++++--- .../fleet/base/distributed_strategy.py | 24 ++- .../distributed/fleet/base/fleet_base.py | 2 +- .../distributed/fleet/runtime/the_one_ps.py | 184 ++++++++++++------ python/paddle/fluid/dataset.py | 30 ++- .../fluid/tests/unittests/dist_fleet_ctr.py | 53 ++++- .../tests/unittests/test_dist_fleet_ctr.py | 35 ++++ .../test_fleet_distributed_strategy.py | 9 +- 8 files changed, 323 insertions(+), 95 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 5aef43263575ec..ae5c9504ecb6ee 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -181,7 +181,7 @@ enum TableType { message TableParameter { optional uint64 table_id = 1; optional string table_class = 2; - optional uint64 shard_num = 3; + optional uint64 shard_num = 3 [ default = 1000 ]; optional TableType type = 4; optional TableAccessorParameter accessor = 5; } @@ -190,42 +190,73 @@ message TableAccessorParameter { optional string accessor_class = 1; optional SGDParameter embed_sgd_param = 2; optional SGDParameter embedx_sgd_param = 3; - optional uint32 fea_dim = 4; // for sparse table, this means field size of one - // value; for dense table, this means total value - // num - optional uint32 embedx_dim = 5; // embedx feature size - optional uint32 embedx_threshold = 6; // embedx feature create threshold + optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value + optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size + optional uint32 embedx_threshold = 6 + [ default = 10 ]; // embedx feature create threshold optional CtrAccessorParameter ctr_accessor_param = 7; + repeated TableAccessorSaveParameter table_accessor_save_param = 8; } // TODO(guanqun): add NaiveSGD/Adam... 
message SGDParameter { optional string name = 1; - optional SGDRuleParameter adagrad = 2; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; } -message SGDRuleParameter { - optional double learning_rate = 1; - optional double initial_g2sum = 2; - optional double initial_range = 3 [ default = 0 ]; +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; repeated float weight_bounds = 4; } +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} + message CtrAccessorParameter { - optional float nonclk_coeff = 1; // to calculate show_click_score - optional float click_coeff = 2; // to calculate show_click_score - optional float base_threshold = - 3; // show_click_score > base_threshold, this feature can be saved - optional float delta_threshold = - 4; // delta_score > delta_threshold, this feature can be saved - optional float delta_keep_days = - 5; // unseen_day < delta_keep_days, this feature can be saved - optional float show_click_decay_rate = 6; // show/click will update to - // show/click * - // show_click_decay_rate after a day - optional float delete_threshold = 7; // threshold to shrink a feasign - optional float delete_after_unseen_days = 8; - optional int32 ssd_unseenday_threshold = 9; + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 + [ default = 0.98 ]; // show/click will update to + // show/click * + // show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 [ default = 30 ]; + optional int32 ssd_unseenday_threshold = 9 [ default = 1 ]; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; } message FsClientParameter { diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index cdbc7bd0cd7440..cc0a5de233c382 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -470,12 +470,22 @@ def sparse_table_configs(self, configs): from google.protobuf.descriptor import FieldDescriptor 
table_param = self.strategy.downpour_table_param - def set_table_config(msg, config_name, configs): + def set_table_config(msg, config_name, configs, index=0): for field in msg.DESCRIPTOR.fields: name = config_name + "." + field.name if field.type == FieldDescriptor.TYPE_MESSAGE: print("message:", name) - set_table_config(getattr(msg, field.name), name, configs) + if field.label == FieldDescriptor.LABEL_REPEATED: + if name + ".num" not in configs: + continue + num = configs[name + ".num"] + print("message num:", name, num) + for i in range(num): + data = getattr(msg, field.name).add() + set_table_config(data, name, configs, i) + else: + set_table_config( + getattr(msg, field.name), name, configs) else: print("not message:", name) if name not in configs: @@ -483,9 +493,15 @@ def set_table_config(msg, config_name, configs): if field.label == FieldDescriptor.LABEL_REPEATED: getattr(msg, field.name).extend(configs[name]) else: - setattr(msg, field.name, configs[name]) + if type(configs[name]) == list: + setattr(msg, field.name, configs[name][index]) + else: + setattr(msg, field.name, configs[name]) - set_table_config(table_param, "table_parameters", configs) + if not configs: + print("table configs is empty") + else: + set_table_config(table_param, "table_parameters", configs) @property def amp(self): diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 57199b8a1e8cc4..a1e5ef2ba799fc 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -823,7 +823,7 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): self._runtime_handle._save_persistables(executor, dirname, main_program, mode) - def shrink(self, threshold): + def shrink(self, threshold=None): self._runtime_handle._shrink(threshold) def distributed_optimizer(self, optimizer, strategy=None): diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index 81613cc1efdfb0..1c51e833f53f60 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -24,7 +24,6 @@ from paddle.fluid.framework import Variable, Parameter from .runtime_base import RuntimeBase from ..base.private_helper_function import wait_server_ready -import paddle.distributed.fleet as fleet __all__ = [] @@ -53,6 +52,70 @@ def parse_table_class(varname, o_main_program): return "MemorySparseTable" +def get_default_accessor_proto(accessor, varname, o_main_program): + embedding_dim = 0 + for var in o_main_program.list_vars(): + if var.name == varname: + print("var:", var) + print("var.shape:", var.shape) + embedding_dim = var.shape[1] + print("sparse dim:", embedding_dim) + break + + accessor.accessor_class = "CtrCommonAccessor" + accessor.fea_dim = embedding_dim + 2 + accessor.embedx_dim = embedding_dim - 1 + accessor.embedx_threshold = 0 + + ctr_accessor_param = accessor.ctr_accessor_param + ctr_accessor_param.nonclk_coeff = 0.1 + ctr_accessor_param.click_coeff = 1.0 + ctr_accessor_param.base_threshold = 0 + ctr_accessor_param.delta_threshold = 0 + ctr_accessor_param.delta_keep_days = 16 + ctr_accessor_param.show_click_decay_rate = 1 + ctr_accessor_param.delete_threshold = 0 + ctr_accessor_param.delete_after_unseen_days = 30 + ctr_accessor_param.ssd_unseenday_threshold = 1 + + embed_sgd_param = accessor.embed_sgd_param + embed_sgd_param.name = "SparseAdaGradSGDRule" + 
embed_sgd_param.adagrad.learning_rate = 0.05 + embed_sgd_param.adagrad.initial_g2sum = 3.0 + embed_sgd_param.adagrad.initial_range = 0.0001 + embed_sgd_param.adagrad.weight_bounds.append(-10.0) + embed_sgd_param.adagrad.weight_bounds.append(10.0) + + embedx_sgd_param = accessor.embedx_sgd_param + embedx_sgd_param.name = "SparseAdaGradSGDRule" + embedx_sgd_param.adagrad.learning_rate = 0.05 + embedx_sgd_param.adagrad.initial_g2sum = 3.0 + embedx_sgd_param.adagrad.initial_range = 0.0001 + embedx_sgd_param.adagrad.weight_bounds.append(-10.0) + embedx_sgd_param.adagrad.weight_bounds.append(10.0) + + +def check_embedding_dim(accessor, varname, o_main_program): + embedding_dim = 0 + for var in o_main_program.list_vars(): + if var.name == varname: + print("var:", var) + print("var.shape:", var.shape) + embedding_dim = var.shape[1] + print("sparse dim:", embedding_dim) + break + fea_dim = accessor.fea_dim + if fea_dim != embedding_dim + 2: + raise ValueError( + "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}". + format(embedding_dim + 2, fea_dim)) + embedx_dim = accessor.embedx_dim + if embedx_dim != embedding_dim - 1: + raise ValueError( + "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}". + format(embedding_dim - 1, embedx_dim)) + + class Accessor: def __init__(self): self.accessor_class = "" @@ -344,6 +407,11 @@ def __init__(self): self.accessor_proto = None def to_string(self, indent): + # if self.id == 1: + # proto_txt = '' + # with open('./sparse_table.prototxt') as f: + # proto_txt = f.read() + # return proto_txt table_str = "{}downpour_table_param {{{}\n{}}}" attrs = "" @@ -586,6 +654,8 @@ def sync_strategy_envs(): return kwargs proto_txt = str(worker) + "\n" + str(server) + with open('proto_txt', 'w') as f: + f.write(proto_txt) debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) @@ -847,54 +917,54 @@ def _get_tables(): if self.compiled_strategy.is_geo_mode(): table.table_class = "SparseGeoTable" else: - table.table_class = parse_table_class( - common.table_name, self.origin_main_program) - table_proto = self.context[ - "user_defined_strategy"].sparse_table_configs - table.shard_num = table_proto.shard_num + import copy + table_proto = copy.deepcopy(self.context[ + "user_defined_strategy"].sparse_table_configs) + print('table proto:', table_proto) + print('table_class:', table_proto.table_class) + print('shard_num:', table_proto.shard_num) + print('table_proto.accessor:', table_proto.accessor) + print('accessor.IsInitialized', + table_proto.accessor.IsInitialized()) + print('accessor.ByteSize', + table_proto.accessor.ByteSize()) + if table_proto.table_class: + print('table_proto.table_class is true') + table.table_class = table_proto.table_class + else: + table.table_class = parse_table_class( + common.table_name, self.origin_main_program) + if table.table_class != 'MemorySparseTable': + table.table_class = 'MemorySparseTable' + warnings.warn( + "The PS mode must use MemorySparseTable.") + + if table_proto.shard_num: + print('table_proto.shard_num is true') + table.shard_num = table_proto.shard_num + else: + table.shard_num = 1000 + warnings.warn( + "The shard_num of sparse table is not set, use default value 1000." + ) + + if table_proto.accessor.ByteSize() == 0: + print('table_proto.accessor is false') + get_default_accessor_proto(table_proto.accessor, + common.table_name, + self.origin_main_program) + warnings.warn( + "The accessor of sparse table is not set, use default value." 
+ ) + check_embedding_dim(table_proto.accessor, + common.table_name, + self.origin_main_program) + print('accessor.ByteSize', + table_proto.accessor.ByteSize()) from google.protobuf import text_format table.accessor_proto = text_format.MessageToString( table_proto.accessor) - - print('table proto:', table_proto) - if table.table_class == 'MemorySparseTable' and table.accessor_proto == '': - emb_dim = ctx.sections()[1] - table.shard_num = 1950 - table.accessor_proto = 'accessor_class: "CtrCommonAccessor"\n' \ - 'embed_sgd_param {\n' \ - ' name: "SparseAdaGradSGDRule"\n' \ - ' adagrad {\n' \ - ' learning_rate: 0.05\n' \ - ' initial_g2sum: 3.0\n' \ - ' initial_range: 0.0001\n' \ - ' weight_bounds: -10.0\n' \ - ' weight_bounds: 10.0\n' \ - ' }\n' \ - '}\n' \ - 'embedx_sgd_param {\n' \ - ' name: "SparseAdaGradSGDRule"\n' \ - ' adagrad {\n' \ - ' learning_rate: 0.05\n' \ - ' initial_g2sum: 3.0\n' \ - ' initial_range: 0.0001\n' \ - ' weight_bounds: -10.0\n' \ - ' weight_bounds: 10.0\n' \ - ' }\n' \ - '}\n' \ - 'fea_dim: ' + str(emb_dim+2) + '\n' \ - 'embedx_dim: ' + str(emb_dim-1) + '\n' \ - 'embedx_threshold: 10\n' \ - 'ctr_accessor_param {\n' \ - ' nonclk_coeff: 0.1\n' \ - ' click_coeff: 1.0\n' \ - ' base_threshold: 1.5\n' \ - ' delta_threshold: 0.25\n' \ - ' delta_keep_days: 16.0\n' \ - ' show_click_decay_rate: 0.98\n' \ - ' delete_threshold: 0.8\n' \ - ' delete_after_unseen_days: 30.0\n' \ - ' ssd_unseenday_threshold: 1\n' \ - '}' + print("the_one_ps table_proto:", table.accessor_proto) else: table.type = "PS_DENSE_TABLE" table.table_class = "CommonDenseTable" @@ -916,7 +986,6 @@ def _get_tables(): common.sync = "true" else: common.sync = "false" - table.common = common if table.table_class != 'MemorySparseTable': @@ -1108,8 +1177,6 @@ def _save_distributed_persistables(self, TheOnePSRuntime.__exclude_vars(saved_varnames), main_program.list_vars())) - self._communicator.pull_dense(denses) - import paddle for var in remaining_vars: # if var.name not in recv_dense_varnames: @@ -1209,9 +1276,8 @@ def _ps_inference_save_inference_model(self, split_dense_table=self.role_maker._is_heter_parameter_server_mode, use_origin_program=True) print("the one ps sparses:", sparses) - sparse_names = [] - for id, name in sparses.items(): - sparse_names.extend(name) + sparse_names = self._save_sparse_params(executor, dirname, sparses, + main_program, mode) print("the one ps sparse names:", sparse_names) denses = self.compiled_strategy.get_the_one_recv_context( @@ -1225,7 +1291,7 @@ def _ps_inference_save_inference_model(self, generate_vars = [var for var in generate_vars] remaining_vars = list( filter( - TheOnePSRuntime.__exclude_vars(generate_vars + sparse_names), + TheOnePSRuntime.__exclude_vars(sparse_names), infer_program.list_vars())) print("remain_vars:", [var.name for var in remaining_vars]) for var in remaining_vars: @@ -1235,9 +1301,6 @@ def _ps_inference_save_inference_model(self, os.path.join(model_path, var.name), use_binary_format=True) - self._ps_inference_save_persistables(executor, dirname, infer_program, - mode) - def _save_inference_model(self, *args, **kwargs): self._ps_inference_save_inference_model(*args, **kwargs) @@ -1314,8 +1377,15 @@ def load_model(self, path, mode): self._load_distributed_persistables(path, mode) else: self._ps_inference_load_inference_model(path, mode) + # self._load_distributed_persistables(path, mode=mode) - def _shrink(self, threshold): + def _shrink(self, threshold=None): + if threshold is not None: + warnings.warn( + "The param threshold is not used in 
MemorySparseTable, if you need to shrink, please set the config of accessor" + ) + else: + threshold = 0 import paddle.distributed.fleet as fleet fleet.util.barrier() if self.role_maker._is_first_worker(): diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 9f698134eee174..0e291648b37544 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -862,8 +862,12 @@ def global_shuffle(self, fleet=None, thread_num=12): thread_num(int): shuffle thread num. Default is 12. """ + from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib if fleet is not None: - fleet._role_maker.barrier_worker() + if not isinstance(fleet, PSLib): + fleet.barrier_worker() + else: + fleet._role_maker.barrier_worker() if self.trainer_num == -1: self.trainer_num = fleet.worker_num() if self.fleet_send_batch_size is None: @@ -875,14 +879,23 @@ def global_shuffle(self, fleet=None, thread_num=12): self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size) self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds) if fleet is not None: - fleet._role_maker.barrier_worker() + if not isinstance(fleet, PSLib): + fleet.barrier_worker() + else: + fleet._role_maker.barrier_worker() self.dataset.global_shuffle(thread_num) if fleet is not None: - fleet._role_maker.barrier_worker() + if not isinstance(fleet, PSLib): + fleet.barrier_worker() + else: + fleet._role_maker.barrier_worker() if self.merge_by_lineid: self.dataset.merge_by_lineid() if fleet is not None: - fleet._role_maker.barrier_worker() + if not isinstance(fleet, PSLib): + fleet.barrier_worker() + else: + fleet._role_maker.barrier_worker() @deprecated( since="2.0.0", @@ -1011,10 +1024,15 @@ def get_shuffle_data_size(self, fleet=None): import numpy as np local_data_size = self.dataset.get_shuffle_data_size() local_data_size = np.array([local_data_size]) + print('global shuffle local_data_size: ', local_data_size) if fleet is not None: + from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib global_data_size = local_data_size * 0 - fleet._role_maker.all_reduce_worker(local_data_size, - global_data_size) + if not isinstance(fleet, PSLib): + global_data_size = fleet.util.all_reduce(local_data_size) + else: + fleet._role_maker.all_reduce_worker(local_data_size, + global_data_size) return global_data_size[0] return local_data_size[0] diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 65c8a7500f246c..2bd397b0ef3f53 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -241,7 +241,7 @@ def do_pyreader_training(self, fleet): self.check_model_right(model_dir) shutil.rmtree(model_dir) - def do_dataset_training(self, fleet): + def do_dataset_training_queuedataset(self, fleet): train_file_list = ctr_dataset_reader.prepare_fake_data() exe = self.get_executor() @@ -288,5 +288,56 @@ def do_dataset_training(self, fleet): if dirname: fleet.save_persistables(exe, dirname=dirname) + def do_dataset_training(self, fleet): + train_file_list = ctr_dataset_reader.prepare_fake_data() + + exe = self.get_executor() + exe.run(fluid.default_startup_program()) + fleet.init_worker() + + thread_num = 2 + batch_size = 128 + filelist = train_file_list + + # config dataset + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_use_var(self.feeds) + dataset.set_batch_size(128) + dataset.set_thread(2) + 
dataset.set_filelist(filelist) + dataset.set_pipe_command('python ctr_dataset_reader.py') + dataset.load_into_memory() + + dataset.global_shuffle(fleet, 12) ##TODO: thread configure + shuffle_data_size = dataset.get_shuffle_data_size(fleet) + local_data_size = dataset.get_shuffle_data_size() + data_size_list = fleet.util.all_gather(local_data_size) + print('after global_shuffle data_size_list: ', data_size_list) + print('after global_shuffle data_size: ', shuffle_data_size) + + for epoch_id in range(1): + pass_start = time.time() + exe.train_from_dataset( + program=fluid.default_main_program(), + dataset=dataset, + fetch_list=[self.avg_cost], + fetch_info=["cost"], + print_period=2, + debug=int(os.getenv("Debug", "0"))) + pass_time = time.time() - pass_start + dataset.release_memory() + + if os.getenv("SAVE_MODEL") == "1": + model_dir = tempfile.mkdtemp() + fleet.save_inference_model(exe, model_dir, + [feed.name for feed in self.feeds], + self.avg_cost) + self.check_model_right(model_dir) + shutil.rmtree(model_dir) + + dirname = os.getenv("SAVE_DIRNAME", None) + if dirname: + fleet.save_persistables(exe, dirname=dirname) + if __name__ == "__main__": runtime_main(TestDistCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index 3beb1d3dfe0331..59d196fdf55e57 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -20,6 +20,41 @@ from test_dist_fleet_base import TestFleetBase +class TestDistMnistAsyncInMemoryDataset2x2(TestFleetBase): + def _setup_config(self): + self._mode = "async" + #self._reader = "pyreader" + self._reader = "dataset" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "2", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) + + class TestDistMnistAsync2x2(TestFleetBase): def _setup_config(self): self._mode = "async" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 9cf3eb251b3962..a9193c0abdfc18 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -259,11 +259,18 @@ def test_sparse_table_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() configs = { "table_parameters.accessor.embed_sgd_param.adagrad.learning_rate": - 0.05 + 0.05, + "table_parameters.accessor.table_accessor_save_param.num": 2, + "table_parameters.accessor.table_accessor_save_param.param": + [1, 2] } strategy.sparse_table_configs = configs self.assertEqual(strategy.sparse_table_configs.accessor.embed_sgd_param. 
adagrad.learning_rate, 0.05) + self.assertEqual( + strategy.sparse_table_configs.accessor.table_accessor_save_param[ + 0].param, 1) + strategy.adam_d2sum = True self.assertEqual(strategy.adam_d2sum, True) strategy.fs_client_param = { From eb9e330548bb306ad272355550bf08cb61cd5ac6 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 30 Nov 2021 14:51:36 +0800 Subject: [PATCH 004/124] Enabled performance benchmark tests for Eager Dygraph (#37653) * Enabled performance benchmark tests for Eager Dygraph * Protected CUDA tests with macro * Fixed dependency issues for windows-ci --- paddle/fluid/eager/CMakeLists.txt | 2 +- .../fluid/eager/api/generated/CMakeLists.txt | 2 +- .../eager/auto_code_generator/CMakeLists.txt | 33 +++++++++++++++++-- paddle/fluid/eager/tests/CMakeLists.txt | 4 +++ .../tests/performance_tests/CMakeLists.txt | 7 ++++ .../performance_tests/benchmark_eager_cpu.cc | 20 +++++------ .../performance_tests/benchmark_eager_cuda.cc | 24 ++++++++------ .../performance_tests/benchmark_fluid_cpu.cc | 8 ++--- .../performance_tests/benchmark_fluid_cuda.cc | 12 ++++--- .../performance_tests/benchmark_utils.cc | 28 +++++++++------- .../eager/tests/task_tests/CMakeLists.txt | 2 +- 11 files changed, 97 insertions(+), 45 deletions(-) create mode 100644 paddle/fluid/eager/tests/performance_tests/CMakeLists.txt diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index e8cb55b7afeb9a..d5abf639c83db4 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -2,7 +2,7 @@ set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) message("Performing Eager Dygraph Auto Code Generation") add_subdirectory(auto_code_generator) endif() diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index 407a8d69e52dae..ebbef286f79230 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) add_subdirectory(fluid_generated) endif() diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 5d31c9139baa81..03cec80b682b11 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -17,9 +17,38 @@ execute_process( ) if(WIN32) + set(EAGER_CODEGEN_DEPS eager_generator) + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}") + else() + set(eager_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") + endif() + + if(${CBLAS_PROVIDER} STREQUAL MKLML) + message("Copied libiomp5md.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${eager_generator_path} + DEPENDS mklml) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll) + else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) + message("Copied openblas.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/openblas.dll + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${eager_generator_path} + DEPENDS extern_openblas) + list(APPEND EAGER_CODEGEN_DEPS 
${eager_generator_path}/openblas.dll) + endif() + + if(WITH_MKLDNN) + message("Copied mkldnn.dll for Eager AutoCodeGen") + ADD_CUSTOM_COMMAND(OUTPUT ${eager_generator_path}/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${eager_generator_path} + DEPENDS mkldnn) + list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/mkldnn.dll) + endif() + add_custom_target(eager_codegen - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - DEPENDS eager_generator + COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + DEPENDS ${EAGER_CODEGEN_DEPS} VERBATIM) else() add_custom_target(eager_codegen diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt index 289f24dfa63675..c1506d8139b432 100644 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/CMakeLists.txt @@ -1,2 +1,6 @@ add_subdirectory(data_structure_tests) add_subdirectory(task_tests) + +if(NOT ON_INFER) + add_subdirectory(performance_tests) +endif() diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt new file mode 100644 index 00000000000000..8811aa8ad38a5e --- /dev/null +++ b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt @@ -0,0 +1,7 @@ +cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op) + +cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) + +cc_test(test_egr_performance_benchmark_eager_cuda SRCS benchmark_eager_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) +cc_test(test_egr_performance_benchmark_fluid_cuda SRCS benchmark_fluid_cuda.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 0a84f3b523aeed..0637ff2bb23d39 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -25,7 +25,7 @@ #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #ifdef WITH_GPERFTOOLS @@ -42,11 +42,11 @@ TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } TEST(Benchmark, EagerScaleCPU) { // Prepare Device Contexts - egr::InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + egr::EagerTensor tensor = CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0, true); RetainGradForTensor(tensor); @@ -78,20 +78,20 @@ TEST(Benchmark, EagerScaleCPU) { TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + 
eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0, true); RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + egr::EagerTensor Y = CreateTensorWithValue( ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); RetainGradForTensor(Y); @@ -122,7 +122,7 @@ TEST(Benchmark, EagerIntermediateMatmulCPU) { TEST(Benchmark, EagerIntermediateMLPCPU) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); @@ -130,7 +130,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { for (const std::string& mode : {"Accuracy", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({MLP_M, MLP_N}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_X_VAL, true); RetainGradForTensor(X); @@ -140,13 +140,13 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { paddle::framework::DDim ddimW = paddle::framework::make_ddim({MLP_N, MLP_K}); - egr::EagerTensor W = EagerUtils::CreateTensorWithValue( + egr::EagerTensor W = CreateTensorWithValue( ddimW, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_W_VAL, true); RetainGradForTensor(W); paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); - egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + egr::EagerTensor B = CreateTensorWithValue( ddimB, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_B_VAL, true); RetainGradForTensor(B); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index b373802c79eb45..96dff14184f40c 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #ifdef WITH_GPERFTOOLS @@ -38,12 +38,14 @@ DECLARE_bool(run_pten_kernel); TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + TEST(Benchmark, EagerScaleCUDA) { - egr::InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - egr::EagerTensor tensor = EagerUtils::CreateTensorWithValue( + egr::EagerTensor tensor = CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); 
RetainGradForTensor(tensor); @@ -77,7 +79,7 @@ TEST(Benchmark, EagerScaleCUDA) { TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); auto tracer = std::make_shared(); tracer->SetExpectedPlace(place); @@ -85,13 +87,13 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0, true); RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({2, 2}); - egr::EagerTensor Y = EagerUtils::CreateTensorWithValue( + egr::EagerTensor Y = CreateTensorWithValue( ddimY, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); RetainGradForTensor(Y); @@ -125,7 +127,7 @@ TEST(Benchmark, EagerIntermediateMatmulCUDA) { TEST(Benchmark, EagerIntermediateMLPCUDA) { paddle::platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); auto tracer = std::make_shared(); tracer->SetExpectedPlace(place); @@ -134,7 +136,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::framework::DDim ddimX = paddle::framework::make_ddim({MLP_M, MLP_N}); - egr::EagerTensor X = EagerUtils::CreateTensorWithValue( + egr::EagerTensor X = CreateTensorWithValue( ddimX, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_X_VAL, true); RetainGradForTensor(X); @@ -144,13 +146,13 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { paddle::framework::DDim ddimW = paddle::framework::make_ddim({MLP_N, MLP_K}); - egr::EagerTensor W = EagerUtils::CreateTensorWithValue( + egr::EagerTensor W = CreateTensorWithValue( ddimW, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_W_VAL, true); RetainGradForTensor(W); paddle::framework::DDim ddimB = paddle::framework::make_ddim({MLP_K}); - egr::EagerTensor B = EagerUtils::CreateTensorWithValue( + egr::EagerTensor B = CreateTensorWithValue( ddimB, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, MLP_B_VAL, true); RetainGradForTensor(B); @@ -185,3 +187,5 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { } } } + +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 20844055e300d6..d98000b71fd2a3 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" @@ -45,7 +45,7 @@ namespace imperative { TEST(Benchmark, FluidScaleCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, 
"X")); @@ -88,7 +88,7 @@ TEST(Benchmark, FluidScaleCPU) { TEST(Benchmark, FluidMatmulCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -141,7 +141,7 @@ TEST(Benchmark, FluidMatmulCPU) { TEST(Benchmark, FluidMLPCPU) { // Prepare Device Contexts platform::CPUPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "Performance"}) { std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 620a4d1cd128d4..918ebadba0a4c8 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" +#include "paddle/fluid/eager/tests/performance_tests/benchmark_utils.h" #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/tracer.h" @@ -39,13 +39,15 @@ DECLARE_bool(run_pten_kernel); TEST(Benchmark, Init) { FLAGS_run_pten_kernel = false; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + namespace paddle { namespace imperative { TEST(Benchmark, FluidScaleCUDA) { // Prepare Device Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -98,7 +100,7 @@ TEST(Benchmark, FluidScaleCUDA) { TEST(Benchmark, FluidMatmulCUDA) { // Prepare Device Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { std::shared_ptr X(new imperative::VarBase(true, "X")); @@ -161,7 +163,7 @@ TEST(Benchmark, FluidMatmulCUDA) { TEST(Benchmark, FluidMLPCUDA) { // Prepare Device Contexts platform::CUDAPlace place; - egr::InitEnv(place); + eager_test::InitEnv(place); for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::platform::DeviceContextPool& pool = @@ -252,3 +254,5 @@ USE_OP(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); + +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index ae5d02c1e943f1..baa99dc93c2dd3 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -36,10 +36,6 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/eager/tests/benchmark/benchmark_utils.h" - -#include "paddle/pten/core/kernel_registry.h" - static size_t max_num_benchmark_runs = 5000; namespace egr { @@ -64,9 +60,9 @@ void benchmark_eager_scale(const EagerTensor& tensor, bool accuracy_check) { if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) - CompareTensorWithValue(input_tensor, 8189.0); + eager_test::CompareTensorWithValue(input_tensor, 8189.0); // Examine Backward Grad (w.r.t max_num_runs = 10) - CompareGradTensorWithValue(tensor, 1024.0); + 
eager_test::CompareGradTensorWithValue(tensor, 1024.0); } } @@ -89,10 +85,10 @@ void benchmark_eager_intermediate_matmul(const EagerTensor& X, if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) - CompareVariableWithValue(input_tensor0, 16); + eager_test::CompareVariableWithValue(input_tensor0, 16); // Examine Backward Grad (w.r.t max_num_runs = 2) - CompareGradVariableWithValue(X, 16); - CompareGradVariableWithValue(Y, 16); + eager_test::CompareGradVariableWithValue(X, 16); + eager_test::CompareGradVariableWithValue(Y, 16); } } @@ -122,11 +118,11 @@ void benchmark_eager_intermediate_mlp(const EagerTensor& X, compute_mlp_expected_results(); // Examine Forward Grad (w.r.t max_num_runs = 2) - CompareVariableWithValue(Out, result["Out"]); + eager_test::CompareVariableWithValue(Out, result["Out"]); // Examine Backward Grad (w.r.t max_num_runs = 2) - CompareGradVariableWithValue(X, result["GradX"]); - CompareGradVariableWithValue(Ws[0], result["GradW"]); + eager_test::CompareGradVariableWithValue(X, result["GradX"]); + eager_test::CompareGradVariableWithValue(Ws[0], result["GradW"]); } } @@ -141,6 +137,8 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, auto* tensor = X->MutableVar()->GetMutable(); float* t_ptr = tensor->mutable_data(place); std::vector host_data(tensor->numel()); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); @@ -153,6 +151,8 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, sizeof(float) * tensor->numel(), stream); t_ptr = host_data.data(); } +#endif + VLOG(6) << "Tensor Value: " << t_ptr[0] << ", Expected Value: " << value; PADDLE_ENFORCE( t_ptr[0] == value, @@ -166,6 +166,8 @@ static void FluidCheckGradTensorValue( auto* grad_tensor = X->MutableGradVar()->GetMutable(); float* g_ptr = grad_tensor->mutable_data(place); std::vector g_host_data(grad_tensor->numel()); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); @@ -178,6 +180,8 @@ static void FluidCheckGradTensorValue( sizeof(float) * grad_tensor->numel(), stream); g_ptr = g_host_data.data(); } +#endif + VLOG(6) << "Tensor Value: " << g_ptr[0] << ", Expected Value: " << value; PADDLE_ENFORCE( g_ptr[0] == value, diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 3921ce5b69cd7d..c03db1a1575dfa 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -6,6 +6,6 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -if(NOT DEFINED ON_INFER) +if(NOT ON_INFER) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() From 3d2ec707185c351943446f35a45db425e3be1b53 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 30 Nov 2021 15:12:34 +0800 Subject: [PATCH 005/124] Eager dygraph egr_utils_api namespace refactor (#37654) * Refactored eager legacy namespace * Fixed namespace issues --- 
paddle/fluid/eager/api/utils/hook_utils.cc | 2 + paddle/fluid/eager/api/utils/hook_utils.h | 2 + paddle/fluid/eager/api/utils/tensor_utils.cc | 2 + paddle/fluid/eager/api/utils/tensor_utils.h | 2 + .../performance_tests/benchmark_eager_cpu.cc | 4 +- .../performance_tests/benchmark_eager_cuda.cc | 4 +- .../eager/tests/task_tests/backward_test.cc | 46 ++++---- .../cross_batch_accumulation_test.cc | 22 ++-- .../tests/task_tests/eager_utils_test.cc | 14 +-- .../tests/task_tests/forward_autograd_test.cc | 31 +++--- .../tests/task_tests/fwd_bwd_joint_test.cc | 101 +++++++++--------- .../eager/tests/task_tests/generated_test.cc | 33 +++--- .../fluid/eager/tests/task_tests/hook_test.cc | 46 ++++---- .../tests/task_tests/tensor_utils_test.cc | 21 ++-- 14 files changed, 158 insertions(+), 172 deletions(-) diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 7f85d014fa8425..85ff6687e0dbea 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -20,6 +20,7 @@ #include "paddle/pten/core/dense_tensor.h" namespace egr { +namespace egr_utils_api { void RegisterGradientHookForTensor( const egr::EagerTensor& tensor, @@ -90,4 +91,5 @@ void RetainGradForTensor(const egr::EagerTensor& tensor) { } } +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/api/utils/hook_utils.h b/paddle/fluid/eager/api/utils/hook_utils.h index bf320f0b15d4a1..7e4faa5a2c701e 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.h +++ b/paddle/fluid/eager/api/utils/hook_utils.h @@ -18,6 +18,7 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/pten/api/all.h" namespace egr { +namespace egr_utils_api { void RegisterGradientHookForTensor( const egr::EagerTensor& tensor, @@ -27,4 +28,5 @@ void RegisterReduceHookForTensor(const egr::EagerTensor& tensor, const std::function& hook); void RetainGradForTensor(const egr::EagerTensor& tensor); +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 9dbb308a2c9069..ad6c34b7cf86cd 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -26,6 +26,7 @@ #include "paddle/fluid/framework/variable.h" namespace egr { +namespace egr_utils_api { bool IsLeafTensor(const egr::EagerTensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); @@ -58,4 +59,5 @@ egr::EagerTensor CreateTensorWithValue(const pten::DDim& ddim, return out; } +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/api/utils/tensor_utils.h b/paddle/fluid/eager/api/utils/tensor_utils.h index a0d8caf3cb307e..b3c4b596823208 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.h +++ b/paddle/fluid/eager/api/utils/tensor_utils.h @@ -18,6 +18,7 @@ #include "paddle/pten/api/all.h" namespace egr { +namespace egr_utils_api { // If and only if the tensor holds an AccumulationNode // Then it's treated as a leaf tensor @@ -29,4 +30,5 @@ egr::EagerTensor CreateTensorWithValue(const pten::DDim& ddim, const pten::DataLayout& layout, float value, bool is_leaf = true); +} // namespace egr_utils_api } // namespace egr diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 0637ff2bb23d39..83185dff9b7812 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ 
b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -32,8 +32,8 @@ #include "gperftools/profiler.h" #endif -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT +using namespace egr; // NOLINT +using namespace egr_utils_api; // NOLINT // Disable pten path DECLARE_bool(run_pten_kernel); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 96dff14184f40c..9fbed054183029 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -31,8 +31,8 @@ #include "gperftools/profiler.h" #endif -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT +using namespace egr; // NOLINT +using namespace egr_utils_api; // NOLINT DECLARE_bool(run_pten_kernel); diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index d63cff23ba9c8e..0ec86b7cc360c7 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,19 +30,17 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/tensor_meta.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Backward, SingleNodeEmptyGrad) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor target_tensor = CreateTensorWithValue( + egr::EagerTensor target_tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); @@ -67,7 +65,7 @@ TEST(Backward, SingleNodeEmptyGrad) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); @@ -80,26 +78,26 @@ TEST(Backward, SingleNodeEmptyGrad) { RunBackward(outs, {}); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 5.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); } TEST(Backward, SingleNodeCustomGrad) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); std::vector grad_tensors; // Create Grad Tensor - egr::EagerTensor grad_tensor = CreateTensorWithValue( + egr::EagerTensor grad_tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); grad_tensors.emplace_back(std::move(grad_tensor)); @@ -128,7 +126,7 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + 
egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node0 -> AccumulationNode via Edge auto meta = egr::AutogradMeta(); @@ -141,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { RunBackward(target_tensors, grad_tensors); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 50.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); } /* @@ -153,14 +151,14 @@ Node0 */ TEST(Backward, LinearNodes) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -202,7 +200,7 @@ TEST(Backward, LinearNodes) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node1 -> AccumulationNode via Edge auto meta1 = egr::AutogradMeta(); @@ -215,7 +213,7 @@ TEST(Backward, LinearNodes) { RunBackward(target_tensors, {}); // Check Output Value - CompareGradTensorWithValue(leaf_tensor, 50.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); } /* @@ -227,17 +225,17 @@ Node0 Node1 */ TEST(Backward, WithAccumulation) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor std::vector target_tensors; - egr::EagerTensor tensor0 = CreateTensorWithValue( + egr::EagerTensor tensor0 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); - egr::EagerTensor tensor1 = CreateTensorWithValue( + egr::EagerTensor tensor1 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor0)); @@ -245,10 +243,10 @@ TEST(Backward, WithAccumulation) { // Create Grad Tensor std::vector grad_tensors; - egr::EagerTensor grad_tensor0 = CreateTensorWithValue( + egr::EagerTensor grad_tensor0 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); - egr::EagerTensor grad_tensor1 = CreateTensorWithValue( + egr::EagerTensor grad_tensor1 = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); grad_tensors.emplace_back(std::move(grad_tensor0)); @@ -303,7 +301,7 @@ TEST(Backward, WithAccumulation) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); - egr::RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); // Connect Node2 -> AccumulationNode via Edge auto meta2 = egr::AutogradMeta(); @@ -314,7 +312,7 @@ TEST(Backward, WithAccumulation) { RunBackward(target_tensors, grad_tensors); - CompareGradTensorWithValue(leaf_tensor, 2500.0); + 
eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index e1e138cdee8ba5..52e10b2b1b8a09 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,17 +31,15 @@ #include "paddle/fluid/eager/tests/test_utils.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -60,7 +58,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto_grad_meta->SetGradNode( std::dynamic_pointer_cast(scale_node_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - RetainGradForTensor(target_tensor); // result: 1.0 + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 auto meta = AutogradMeta(); meta.SetSingleOutRankWithSlot(0, 0); @@ -71,18 +69,18 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - RetainGradForTensor(leaf_tensor); + egr_utils_api::RetainGradForTensor(leaf_tensor); } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 5.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 10.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 4d93f0188a746b..c7c27dcc1d1508 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -24,10 +24,7 @@ #include "paddle/pten/api/lib/utils/allocator.h" -// TODO(jiabin): remove nolint here!!! 
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(EagerUtils, AutoGradMeta) { // Construct Eager Tensor @@ -167,7 +164,7 @@ TEST(EagerUtils, PassStopGradient) { TEST(EagerUtils, SyncToVarsSingle) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - auto tensor = eager_test::CreateTestCPUTensor(5.0f, ddim); + auto tensor = CreateTestCPUTensor(5.0f, ddim); std::vector> var_bases = egr::EagerUtils::SyncToVars(tensor); @@ -185,9 +182,8 @@ TEST(EagerUtils, SyncToVarsSingle) { TEST(EagerUtils, SyncToVarsMultiple) { paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); - std::vector tensors = { - eager_test::CreateTestCPUTensor(1.0f, ddim), - eager_test::CreateTestCPUTensor(2.0f, ddim)}; + std::vector tensors = {CreateTestCPUTensor(1.0f, ddim), + CreateTestCPUTensor(2.0f, ddim)}; std::vector> var_bases = egr::EagerUtils::SyncToVars(tensors); @@ -280,4 +276,4 @@ TEST(EagerUtils, ConstructDuplicableOutput) { CHECK(outs[0]->initialized() == false); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 6e23226cde432a..205f231eceeed5 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,21 +27,18 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/tensor_meta.h" -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Forward, SingleNode) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -55,7 +52,7 @@ TEST(Forward, SingleNode) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); // Examine GradNode { @@ -80,14 +77,14 @@ Node1 out */ TEST(Forward, LinearNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -108,10 +105,10 @@ TEST(Forward, LinearNodes) { out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine GradNode { @@ -156,14 +153,14 @@ TEST(Forward, LinearNodes) { out1 out2 */ TEST(Forward, BranchedNodes) { - InitEnv(paddle::platform::CPUPlace()); + 
eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(t)); @@ -190,13 +187,13 @@ TEST(Forward, BranchedNodes) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 - CompareTensorWithValue(out2, 150.0); + eager_test::CompareTensorWithValue(out2, 150.0); // Examine GradNode { @@ -248,4 +245,4 @@ TEST(Forward, BranchedNodes) { } } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 751e95487659cb..e292844c8ee586 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -29,10 +29,7 @@ #include "paddle/fluid/eager/tests/test_utils.h" -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { egr::EagerTensor hook_function(const egr::EagerTensor& t) { auto t_dense = std::dynamic_pointer_cast(t.impl()); @@ -61,14 +58,14 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { } TEST(FwdBwdJoint, SingleNode) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward float scale = 2.0; @@ -77,7 +74,7 @@ TEST(FwdBwdJoint, SingleNode) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); std::vector outs = {out}; // 4. Run Backward @@ -88,7 +85,7 @@ TEST(FwdBwdJoint, SingleNode) { EagerUtils::unsafe_autograd_meta(tensor)->Grad().impl()) ->data()[0]; // Examine Backward Grad - CompareGradTensorWithValue(tensor, 2.0); + eager_test::CompareGradTensorWithValue(tensor, 2.0); } /* @@ -101,14 +98,14 @@ Node1 out */ TEST(FwdBwdJoint, LinearNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. 
Run Forward // Run Forward Node 0 @@ -125,17 +122,17 @@ TEST(FwdBwdJoint, LinearNodes) { out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); std::vector outs = {out1}; // 4. Run Backward RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 10.0); + eager_test::CompareGradTensorWithValue(tensor, 10.0); } /* @@ -149,14 +146,14 @@ TEST(FwdBwdJoint, LinearNodes) { out1 out2 */ TEST(FwdBwdJoint, BranchedNodes) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -179,10 +176,10 @@ TEST(FwdBwdJoint, BranchedNodes) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 { @@ -201,7 +198,7 @@ TEST(FwdBwdJoint, BranchedNodes) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); } /* @@ -215,14 +212,14 @@ TEST(FwdBwdJoint, BranchedNodes) { out1 out2 */ TEST(FwdBwdJoint, GradientHook) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. 
Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); std::function hook = &hook_function; @@ -234,24 +231,24 @@ TEST(FwdBwdJoint, GradientHook) { egr::EagerTensor out0 = egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out0); // hook: +5 - RegisterGradientHookForTensor(out0, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out0); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5 // Run Forward Node 1 float scale1 = 5.0; float bias1 = 10.0; egr::EagerTensor out1 = egr::scale( out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out1); // hook: +5 - RegisterGradientHookForTensor(out1, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out1); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5 // Run Forward Node 2 float scale2 = 10.0; float bias2 = 20.0; egr::EagerTensor out2 = egr::scale( out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); - RetainGradForTensor(out2); // hook: +5 - RegisterGradientHookForTensor(out2, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out2); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5 // 4. Run Backward std::vector outs = {out1, out2}; @@ -259,16 +256,16 @@ TEST(FwdBwdJoint, GradientHook) { // Examine Backward Grad // leaf grad - CompareGradTensorWithValue(tensor, 190.0); + eager_test::CompareGradTensorWithValue(tensor, 190.0); // out0 grad - CompareGradTensorWithValue(out0, 90.0); + eager_test::CompareGradTensorWithValue(out0, 90.0); // out1 grad - CompareGradTensorWithValue(out1, 1.0); + eager_test::CompareGradTensorWithValue(out1, 1.0); // out2 grad - CompareGradTensorWithValue(out2, 1.0); + eager_test::CompareGradTensorWithValue(out2, 1.0); } /* @@ -282,14 +279,14 @@ TEST(FwdBwdJoint, GradientHook) { out1 out2 */ TEST(FwdBwdJoint, CrossBatchAccumulation) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. 
Run Forward // Run Forward Node 0 @@ -316,13 +313,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 60.0); + eager_test::CompareGradTensorWithValue(tensor, 60.0); } /* ---------------------------------------------------- */ @@ -331,14 +328,14 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(FwdBwdJoint, SingleNodeCUDA) { - InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward float scale = 2.0; @@ -347,14 +344,14 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { tensor, scale, bias, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output - CompareTensorWithValue(out, 13.0); + eager_test::CompareTensorWithValue(out, 13.0); std::vector outs = {out}; // 4. Run Backward RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 2.0); + eager_test::CompareGradTensorWithValue(tensor, 2.0); } /* @@ -368,14 +365,14 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { out1 out2 */ TEST(FwdBwdJoint, BranchedNodesCUDA) { - InitEnv(paddle::platform::CUDAPlace()); + eager_test::InitEnv(paddle::platform::CUDAPlace()); // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CUDAPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); // 3. Run Forward // Run Forward Node 0 @@ -398,11 +395,11 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); // Examine Forward Output 0 - CompareTensorWithValue(out0, 13.0); + eager_test::CompareTensorWithValue(out0, 13.0); // Examine Forward Output 1 - CompareTensorWithValue(out1, 75.0); + eager_test::CompareTensorWithValue(out1, 75.0); // Examine Forward Output 2 - CompareTensorWithValue(out2, 150.0); + eager_test::CompareTensorWithValue(out2, 150.0); // TODO(jiabin): fix this with add functor // 4. 
Run Backward @@ -410,8 +407,8 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { RunBackward(outs, {}); // Examine Backward Grad - CompareGradTensorWithValue(tensor, 30.0); + eager_test::CompareGradTensorWithValue(tensor, 30.0); } #endif -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index eb8d1e517eaf3b..9d6e3310678345 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,66 +30,63 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/pten/core/kernel_registry.h" -// TODO(jiabin): remove nolint here!!! -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(Generated, Sigmoid) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); VLOG(6) << "Init Env"; // 1. Prepare Input paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 4, 4, 4}); VLOG(6) << "Make Dim"; - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 0.0, true); VLOG(6) << "Make EagerTensor"; - RetainGradForTensor(tensor); + egr_utils_api::RetainGradForTensor(tensor); VLOG(6) << "Retain Grad for Tensor"; auto output_tensor = sigmoid_dygraph_function(tensor, {}); VLOG(6) << "Run Backward"; - CompareVariableWithValue(output_tensor, 0.5); + eager_test::CompareVariableWithValue(output_tensor, 0.5); std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - CompareGradVariableWithValue(tensor, 0.25); + eager_test::CompareGradVariableWithValue(tensor, 0.25); } TEST(Generated, Matmul_v2) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); auto tracer = std::make_shared(); paddle::imperative::SetCurrentTracer(tracer); // 1. 
Prepare Input paddle::framework::DDim ddimX = paddle::framework::make_ddim({4, 16}); - egr::EagerTensor X = CreateTensorWithValue( + egr::EagerTensor X = egr_utils_api::CreateTensorWithValue( ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 3.0, true); - RetainGradForTensor(X); + egr_utils_api::RetainGradForTensor(X); paddle::framework::DDim ddimY = paddle::framework::make_ddim({16, 20}); - egr::EagerTensor Y = CreateTensorWithValue( + egr::EagerTensor Y = egr_utils_api::CreateTensorWithValue( ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 2.0, true); - RetainGradForTensor(Y); + egr_utils_api::RetainGradForTensor(Y); auto output_tensor = matmul_v2_dygraph_function( X, Y, {{"trans_x", false}, {"trans_y", false}}); - CompareVariableWithValue(output_tensor, 96); + eager_test::CompareVariableWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; RunBackward(target_tensors, {}); - CompareGradVariableWithValue(X, 2.0 * 20); - CompareGradVariableWithValue(Y, 3.0 * 4); + eager_test::CompareGradVariableWithValue(X, 2.0 * 20); + eager_test::CompareGradVariableWithValue(Y, 3.0 * 4); } -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 326240d0cb7b97..32b28d8efd21b8 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -30,9 +30,7 @@ #include "paddle/fluid/eager/tests/test_utils.h" -using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { egr::EagerTensor hook_function(const egr::EagerTensor& t) { auto t_dense = std::dynamic_pointer_cast(t.impl()); @@ -61,14 +59,14 @@ egr::EagerTensor hook_function(const egr::EagerTensor& t) { } TEST(RetainGrad, HookBeforeRetainGrad) { - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -99,8 +97,9 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RegisterGradientHookForTensor(target_tensor, hook); - RetainGradForTensor(target_tensor); // result: 1.0 + 3.0 = 4.0 + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 } // Connect ScaleNode -> AccumulationNode via Edge @@ -126,25 +125,26 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RegisterGradientHookForTensor(leaf_tensor, hook); - RetainGradForTensor(leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 + egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RetainGradForTensor( + leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 4.0); - CompareGradTensorWithValue(leaf_tensor, 23.0); + eager_test::CompareGradTensorWithValue(target_tensor, 4.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } TEST(RetainGrad, HookAfterRetainGrad) { - InitEnv(paddle::platform::CPUPlace()); 
+ eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor tensor = CreateTensorWithValue( + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); target_tensors.emplace_back(std::move(tensor)); @@ -173,8 +173,8 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RetainGradForTensor(target_tensor); // result: 1.0 - RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); } // Connect ScaleNode -> AccumulationNode via Edge @@ -200,15 +200,15 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - RetainGradForTensor(leaf_tensor); // RetainGrad for leaf tensor gets - // postponed, result: 4.0*5.0 + 3.0 = - // 23.0 - RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RetainGradForTensor( + leaf_tensor); // RetainGrad for leaf tensor gets + // postponed, result: 4.0*5.0 + 3.0 = + // 23.0 + egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); } RunBackward(target_tensors, {}); - CompareGradTensorWithValue(target_tensor, 1.0); - CompareGradTensorWithValue(leaf_tensor, 23.0); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } - -} // namespace eager_test +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index 5b96c726b22285..5e86cac83a285f 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,39 +23,34 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/core/kernel_registry.h" - -// TODO(jiabin): remove nolint here!!! 
-using namespace egr; // NOLINT - -namespace eager_test { +namespace egr { TEST(TensorUtils, Test) { // Prepare Device Contexts - InitEnv(paddle::platform::CPUPlace()); + eager_test::InitEnv(paddle::platform::CPUPlace()); // Prepare Inputs std::vector target_tensors; paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); // Create Target Tensor - egr::EagerTensor t = CreateTensorWithValue( + egr::EagerTensor t = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); - egr::EagerTensor t_grad = CreateTensorWithValue( + egr::EagerTensor t_grad = egr_utils_api::CreateTensorWithValue( ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); - CHECK_EQ(IsLeafTensor(t), true); + CHECK_EQ(egr_utils_api::IsLeafTensor(t), true); // Test Utils - CompareTensorWithValue(t, 5.0); + eager_test::CompareTensorWithValue(t, 5.0); egr::AutogradMeta* meta = egr::EagerUtils::autograd_meta(&t); *meta->MutableGrad() = t_grad; - CompareGradTensorWithValue(t, 1.0); + eager_test::CompareGradTensorWithValue(t, 1.0); } -} // namespace eager_test +} // namespace egr From 5440d2f93fc9297125ebde95d0f3d65f2540c44c Mon Sep 17 00:00:00 2001 From: xiayanming <41795079@qq.com> Date: Tue, 30 Nov 2021 18:57:06 +0800 Subject: [PATCH 006/124] [Auto Parallel] elastic support auto parallel re-launch (#37523) * [Auto Parallel] elastic support auto parallel re-launch * [Auto Parallel] elastic support auto parallel re-launch * fix ci issue * fix ci issue * fix rank mapping unittest * fix rank mapping unittest * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue * fix ci issue --- .../distributed/fleet/elastic/__init__.py | 10 +- .../distributed/fleet/elastic/collective.py | 44 +------ .../distributed/fleet/elastic/manager.py | 17 ++- python/paddle/distributed/fleet/launch.py | 20 +++- .../test_fleet_elastic_collective.py | 111 ++++++++++++++++++ .../unittests/test_fleet_elastic_manager.py | 43 +++++++ 6 files changed, 199 insertions(+), 46 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py diff --git a/python/paddle/distributed/fleet/elastic/__init__.py b/python/paddle/distributed/fleet/elastic/__init__.py index 127aba7d93717c..503d2966a80e7f 100644 --- a/python/paddle/distributed/fleet/elastic/__init__.py +++ b/python/paddle/distributed/fleet/elastic/__init__.py @@ -18,14 +18,20 @@ from .manager import ElasticManager from .manager import ElasticStatus from .manager import ELASTIC_EXIT_CODE +from .manager import ElasticLevel from .collective import CollectiveLauncher from paddle.distributed.fleet.launch_utils import DistributeMode def enable_elastic(args, distribute_mode): - if distribute_mode != DistributeMode.COLLECTIVE: - return False + #elastic_level = os.getenv('PADDLE_ELASTIC_FAULT_TOLERANC_LEVEL') + #if not elastic_level and (elastic_level != ElasticLevel.FAULT_TOLERANCE and + # elastic_level != ElasticLevel.ELASTIC): + # return False + + #if distribute_mode != DistributeMode.COLLECTIVE: + # return False if not args.elastic_server and not os.getenv('PADDLE_ELASTIC_SERVER'): return False diff --git a/python/paddle/distributed/fleet/elastic/collective.py b/python/paddle/distributed/fleet/elastic/collective.py index d9c2735c4bd019..82055314b0dc8c 100644 --- 
a/python/paddle/distributed/fleet/elastic/collective.py +++ b/python/paddle/distributed/fleet/elastic/collective.py @@ -30,42 +30,10 @@ def __init__(self, args): def launch(self): logger.info("collective lauchner launch ...") args = self.args - # parse arguments, used for cloud-single-machine and local - (device_mode, - devices_per_proc) = launch_utils.get_device_proc_info(args) - trainers_num = cloud_utils.get_trainers_num() - logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}". - format(trainers_num, device_mode, devices_per_proc)) - - cluster = None - pod = None - - start_port = 6170 - if os.environ.get('FLAGS_START_PORT') is not None: - start_port = os.environ.get('FLAGS_START_PORT') - if cloud_utils.use_paddlecloud() and trainers_num != 1: - cluster, pod = cloud_utils.get_cloud_cluster( - args.ips, device_mode, devices_per_proc, start_port) - logger.debug("get cluster from cloud:{}".format(cluster)) - elif device_mode == DeviceMode.ASCEND_NPU: - # for ascend - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=os.getenv("RANK_TABLE_FILE", None), - device_mode=device_mode, - start_port=start_port) - else: - # trainers_num = 1 or not use paddlecloud ips="a,b" - cluster, pod = paddle.distributed.fleet.launch.get_cluster_from_args( - args, device_mode, devices_per_proc) - logger.debug("get cluster from args:{}".format(cluster)) - - global_envs = copy.copy(os.environ.copy()) - self.gloo_rendezvous_dir = tempfile.mkdtemp() - # add gloo env - global_envs["PADDLE_WITH_GLOO"] = str( - os.getenv("PADDLE_WITH_GLOO", "0")) - global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" - global_envs["PADDLE_GLOO_FS_PATH"] = self.gloo_rendezvous_dir + self.tmp_dir = tempfile.mkdtemp() + global_envs = paddle.distributed.fleet.launch.get_global_envs( + args, self.tmp_dir) + cluster, pod = paddle.distributed.fleet.launch.get_cluster_info(args) self.procs = start_local_trainers( cluster, @@ -82,8 +50,8 @@ def stop(self): logger.info("collective lauchner stop ...") if not self._terminate_procs(): logger.error("kill process failed") - if os.path.exists(self.gloo_rendezvous_dir): - shutil.rmtree(self.gloo_rendezvous_dir) + if os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) def watch(self): logger.debug("collective lauchner watch ...") diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py index 391c5631663180..1716e332c82861 100644 --- a/python/paddle/distributed/fleet/elastic/manager.py +++ b/python/paddle/distributed/fleet/elastic/manager.py @@ -35,6 +35,7 @@ logger.addHandler(ch) ELASTIC_EXIT_CODE = 101 +ELASTIC_AUTO_PARALLEL_EXIT_CODE = 102 # wait for timeout, unit: seconds ELASTIC_TIMEOUT = 2 * 60 @@ -103,6 +104,9 @@ def _check_procs(self): if ret is None: alive = True elif ret != 0: + if ret == ELASTIC_AUTO_PARALLEL_EXIT_CODE: + logger.info("return form elastic auto parallel re-launch") + return ret logger.error("ABORT!!! ABORT!!! ABORT!!!") logger.error( "ERROR rank {} error with exit code {}, check log for detail.". 
@@ -232,6 +236,7 @@ def host_call_back(event): six.ensure_str(i[0]) for i in self.etcd.get_prefix(self.node_prefix) ] + self.hosts = list(set(self.hosts)) if self.hosts else self.hosts logger.info( f"host_call_back curr_host={self.curr_host}, hosts:{self.hosts}") self.need_sync = True @@ -251,6 +256,7 @@ def lease_heartbeat(): six.ensure_str(i[0]) for i in self.etcd.get_prefix(self.node_prefix) ] + hosts = list(set(hosts)) if hosts else hosts logger.info( f"[lease_heartbeat] curr_host={self.curr_host}, hosts={hosts}" ) @@ -335,6 +341,7 @@ def pre_hook(self): if not self.args.elastic_pre_hook: logger.info("skip pre_hook") return + logger.info("execute pre_hook...") current_env = copy.copy(os.environ.copy()) out, err = subprocess.Popen( self.args.elastic_pre_hook, @@ -391,6 +398,7 @@ def _match(self, host_list: list=None): six.ensure_str(i[0]) for i in self.etcd.get_prefix(self.node_prefix) ] + self.hosts = list(set(self.hosts)) if self.hosts else self.hosts if self.elastic_level == ElasticLevel.FAULT_TOLERANCE: if len(self.hosts) == self.np: @@ -430,6 +438,9 @@ def _update_endpoint(self, endpoints, hosts): def _update_fault_tolrance(self): rank = int(os.getenv('PADDLE_TRAINER_ID', -1)) + logger.debug( + f"self.curr_host={self.curr_host}, self.dist_endpoints={self.dist_endpoints}" + ) if self.curr_host in self.dist_endpoints: os.environ['DISTRIBUTED_TRAINER_ENDPOINTS'] = self.dist_endpoints os.environ['PADDLE_TRAINERS'] = self.trainers @@ -550,7 +561,6 @@ def wait(self): self.hosts)) idx += 1 time.sleep(2) - return def run(self, launcher): @@ -571,6 +581,11 @@ def watch(self): if ret is not None: # self terminated logger.info('job exit with code {}'.format(ret)) + if ret == ELASTIC_AUTO_PARALLEL_EXIT_CODE: + logger.info('job re-launch for auto parallel') + self.launcher.stop() + return ElasticStatus.HOLD + # process is completed if ret >= 0 or error else completed = True if ret == 0 else False self.exit(completed=completed) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 2595512789bb6b..0aae3331793ca7 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -65,6 +65,7 @@ import time import six import copy +import pathlib import argparse from argparse import ArgumentParser, REMAINDER import paddle @@ -283,7 +284,7 @@ def cpuonly_check(args): return True -def launch_collective(args): +def get_cluster_info(args): # parse arguments, used for cloud-single-machine and local if args.backend == 'gloo': cpuonly_check(args) (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args) @@ -316,14 +317,23 @@ def launch_collective(args): cluster, pod = get_cluster_from_args(args, device_mode, devices_per_proc) logger.debug("get cluster from args:{}".format(cluster)) + return cluster, pod + +def get_global_envs(args, tmp_dir): global_envs = copy.copy(os.environ.copy()) - gloo_rendezvous_dir = tempfile.mkdtemp() # add gloo env global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0")) global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" - global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir + global_envs["PADDLE_GLOO_FS_PATH"] = tmp_dir global_envs["PADDLE_DISTRI_BACKEND"] = args.backend + return global_envs + + +def launch_collective(args): + tmp_dir = tempfile.mkdtemp() + cluster, pod = get_cluster_info(args) + global_envs = get_global_envs(args, tmp_dir) procs = start_local_trainers( cluster, @@ -352,8 +362,8 @@ def launch_collective(args): terminate_local_procs(procs) exit(1) 
- if os.path.exists(gloo_rendezvous_dir): - shutil.rmtree(gloo_rendezvous_dir) + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) def launch_ps(args, distribute_mode): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py b/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py new file mode 100644 index 00000000000000..2d2f019c5ed09d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import time +import json +import unittest +import argparse +import tempfile +import traceback +from warnings import catch_warnings + +from paddle.distributed.fleet.elastic.collective import CollectiveLauncher +from paddle.distributed.fleet.launch import launch_collective + +fake_python_code = """ +print("test") +""" + + +class TestCollectiveLauncher(unittest.TestCase): + def setUp(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + + self.code_path = os.path.join(file_dir, "fake_python_for_elastic.py") + with open(self.code_path, "w") as f: + f.write(fake_python_code) + + def test_launch(self): + class Argument: + elastic_server = "127.0.0.1:2379" + job_id = "test_job_id_123" + np = "1" + gpus = "0" + nproc_per_node = 1 + host = None + curr_host = None + ips = "127.0.0.1" + scale = None + force = None + backend = 'gloo' + enable_auto_mapping = False + run_mode = "cpuonly" + servers = None + rank_mapping_path = None + training_script = "fake_python_for_elastic.py" + training_script_args = ["--use_amp false"] + log_dir = None + + args = Argument() + + launch = CollectiveLauncher(args) + + try: + args.backend = "gloo" + launch.launch() + launch.stop() + except Exception as e: + pass + + try: + args.backend = "gloo" + launch_collective(args) + except Exception as e: + pass + + def test_stop(self): + class Argument: + elastic_server = "127.0.0.1:2379" + job_id = "test_job_id_123" + np = "1" + gpus = "0" + nproc_per_node = 1 + host = None + curr_host = None + ips = "127.0.0.1" + scale = None + force = None + backend = 'gloo' + enable_auto_mapping = False + run_mode = "cpuonly" + servers = None + rank_mapping_path = None + training_script = "fake_python_for_elastic.py" + training_script_args = ["--use_amp false"] + log_dir = None + + args = Argument() + try: + launch = CollectiveLauncher(args) + launch.tmp_dir = tempfile.mkdtemp() + launch.stop() + except Exception as e: + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py b/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py index 149304e505c123..6dc9f69d03f7ce 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_elastic_manager.py @@ -20,7 +20,9 @@ import argparse from paddle.distributed.fleet.elastic.manager import 
ElasticManager +from paddle.distributed.fleet.elastic.manager import LauncherInterface from paddle.distributed.fleet.elastic.manager import ELASTIC_TIMEOUT +from paddle.distributed.fleet.elastic.manager import ELASTIC_AUTO_PARALLEL_EXIT_CODE class MockLease(): @@ -347,6 +349,47 @@ class Argument: args.elastic_pre_hook = "hostname" elastic.pre_hook() + def test_watch(self): + class Argument: + elastic_server = "127.0.0.1:2379" + job_id = "test_job_id_123" + np = "2" + gpus = "0" + nproc_per_node = 1 + host = None + curr_host = None + ips = None + scale = None + force = None + backend = 'gloo' + elastic_pre_hook = None + + class ElasticLauncher: + def watch(self): + return ELASTIC_AUTO_PARALLEL_EXIT_CODE + + def stop(self): + pass + + args = Argument() + elastic = ElasticManager(args, self.etcd_client) + elastic.stopped = False + elastic.launcher = ElasticLauncher() + elastic.watch() + + def test_launcher_interface_check_procs(self): + class Proc: + def poll(self): + return ELASTIC_AUTO_PARALLEL_EXIT_CODE + + class ProcList: + def __init__(self): + self.proc = Proc() + + launch = LauncherInterface(None) + launch.procs = [ProcList()] + launch._check_procs() + if __name__ == "__main__": unittest.main() From c8ffdecbf4f165d48f0b3adbd1f869eba6723331 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 30 Nov 2021 19:31:10 +0800 Subject: [PATCH 007/124] [opt] Add regularation and Nesterov for mergerd_momentum op (#37527) * add regularation and Nesterov for mergerd_momentum * refine unittest for use_nesterov attr * refine op check * refine code * fix bug * refine code of regularization_flag * delete useless code --- .../optimizers/merged_momentum_op.cc | 15 +- .../operators/optimizers/merged_momentum_op.h | 174 +++++++++++++--- paddle/fluid/pybind/op_function.h | 2 +- .../unittests/test_merged_momentum_op.py | 197 ++++++++++++++++++ 4 files changed, 360 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc index 6c63376b5eb425..1733150f271289 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -50,7 +50,8 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddInput("LearningRate", "(Tensor, default Tensor) " - "Input learning rate"); + "Input learning rate") + .AsDuplicable(); AddInput("MasterParam", "FP32 master weight for AMP.") .AsDispensable() .AsDuplicable(); @@ -68,6 +69,18 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable() .AsDuplicable(); AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("use_nesterov", + "(bool, default false) " + "Use Nesterov Momentum or not.") + .SetDefault(false); + AddAttr>( + "regularization_method", + "(string) regularization_method, right now only " + "support l2decay or none") + .SetDefault({}); + AddAttr>("regularization_coeff", + "(float) regularization_coeff") + .SetDefault({}); AddAttr("multi_precision", "(bool, default false) " "Whether to use multi-precision during weight updating.") diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.h b/paddle/fluid/operators/optimizers/merged_momentum_op.h index 4dfaa4de3ad447..7560b4fd8e5f91 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.h +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/operator.h" 
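// Reviewer note (not part of the original patch): the per-parameter fallback path
// added below reuses the single-tensor momentum functors from momentum_op.h, so for
// each parameter the update it performs should be, as I read those functors:
//
//   g   = rescale_grad * grad
//   g'  = g + regularization_coeff[i] * param      (only when regularization_method[i]
//                                                    is "l2_decay"; otherwise g' = g)
//   v'  = mu * velocity + g'
//   param -= lr * (g' + mu * v')                    (use_nesterov == true)
//   param -= lr * v'                                (use_nesterov == false)
//
// with a per-parameter learning rate lr = LearningRate[i] when more than one
// LearningRate input is passed, and LearningRate[0] otherwise.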
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/macros.h" @@ -85,33 +86,43 @@ class MergedMomentumOpKernel : public framework::OpKernel { auto params = ctx.MultiInput("Param"); auto params_out = ctx.MultiOutput("ParamOut"); size_t n = params.size(); - PADDLE_ENFORCE_EQ( - n, params_out.size(), - platform::errors::InvalidArgument( - "Output(ParamOut) number must be equal to Input(Param) number.")); + PADDLE_ENFORCE_EQ(n, params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), n)); for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ( - params[i], params_out[i], - platform::errors::InvalidArgument( - "Input(Param) and Output(ParamOut) must be the same Tensors.")); + PADDLE_ENFORCE_EQ(params[i], params_out[i], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); } auto grads = ctx.MultiInput("Grad"); PADDLE_ENFORCE_EQ( n, grads.size(), platform::errors::InvalidArgument( - "Input(Grad) number must be equal to Input(Param) number.")); + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), n)); auto velocitys = ctx.MultiInput("Velocity"); PADDLE_ENFORCE_EQ(n, velocitys.size(), platform::errors::InvalidArgument( - "Input(Velocity) number and Input(Param) number.")); + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), n)); auto velocitys_out = ctx.MultiOutput("VelocityOut"); PADDLE_ENFORCE_EQ( n, velocitys_out.size(), - platform::errors::InvalidArgument("Output(VelocityOut) number must be " - "equal to Input(Param) number.")); + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), n)); for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], platform::errors::InvalidArgument( @@ -126,12 +137,18 @@ class MergedMomentumOpKernel : public framework::OpKernel { if (multi_precision) { PADDLE_ENFORCE_EQ( n, master_params.size(), - platform::errors::InvalidArgument("Input(MasterParam) number must be " - "equal to Input(Param) number.")); - PADDLE_ENFORCE_EQ(n, master_params_out.size(), - platform::errors::InvalidArgument( - "Output(MasterParamOut) number must be equal to " - "Input(MasterParam) number.")); + platform::errors::InvalidArgument( + "The size of Input(MasterParam) must be " + "equal to Input(Param), but got the size of Input(MasterParam) " + "is %d, the size of Input(Param) is %d.", + master_params.size(), n)); + PADDLE_ENFORCE_EQ( + n, master_params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(MasterParamOut) must be equal to " + "Input(MasterParam), but got the size of Output(MasterParamOut) " + "is %d, the size of Input(Param) is %d.", + master_params_out.size(), n)); for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_EQ(master_params[i], master_params_out[i], platform::errors::InvalidArgument( @@ -147,20 +164,61 @@ class MergedMomentumOpKernel : 
public framework::OpKernel { master_params_out.clear(); } - auto lr = ctx.Input("LearningRate"); auto mu = ctx.Attr("mu"); auto rescale_grad = ctx.Attr("rescale_grad"); + auto lrs = ctx.MultiInput("LearningRate"); + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, lrs.size(), + platform::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), n)); + } + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_methods = + ctx.Attr>("regularization_method"); + auto regularization_coeffs = + ctx.Attr>("regularization_coeff"); + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, regularization_methods.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), n)); + PADDLE_ENFORCE_EQ( + n, regularization_coeffs.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + using MPType = typename operators::details::MPTypeTrait::Type; auto &dev_ctx = ctx.template device_context(); + if (lrs.size() == 1 && use_nesterov == false && + regularization_methods.size() == 0) { #define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ MergedMomentumKernelParam kernel_params; \ constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ kernel_params.mu = static_cast(mu); \ kernel_params.rescale_grad = static_cast(rescale_grad); \ - kernel_params.lr = lr->data(); \ + kernel_params.lr = lrs[0]->data(); \ for (size_t i = 0; i < kernel_num; ++i) { \ size_t start = i * kMaxMergedNum; \ size_t end = std::min((i + 1) * kMaxMergedNum, n); \ @@ -182,14 +240,78 @@ class MergedMomentumOpKernel : public framework::OpKernel { VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ << kernel_params.param_num; \ } - - if (multi_precision) { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + if (multi_precision) { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + } else { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); + } +#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL } else { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); - } + for (size_t idx = 0; idx < n; idx++) { + RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? RegularizationType::kL2DECAY + : RegularizationType::kNONE; -#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL + MPType regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = + static_cast(regularization_coeffs[idx]); + } + auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0]; + + const MPType *master_in_data = + multi_precision ? master_params[idx]->data() : nullptr; + MPType *master_out_data = + multi_precision ? 
master_params_out[idx]->data() : nullptr; + if (platform::is_cpu_place(ctx.GetPlace())) { + CPUDenseMomentumFunctor functor; + functor(params[idx], grads[idx], velocitys[idx], lr_temp, mu, + use_nesterov, regularization_flag, regularization_coeff, + params_out[idx], velocitys_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; + } else if (platform::is_gpu_place(ctx.GetPlace())) { + platform::ForRange for_range( + static_cast(ctx.device_context()), + params[idx]->numel()); +#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \ + DenseMomentumFunctor functor( \ + params[idx]->data(), grads[idx]->data(), \ + velocitys[idx]->data(), lr_temp->data(), master_in_data, \ + mu, rescale_grad, params[idx]->numel(), regularization_coeff, \ + params_out[idx]->data(), velocitys_out[idx]->data(), \ + master_out_data); \ + for_range(functor); + if (use_nesterov) { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + UseNesterov, RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(UseNesterov, + RegularizationType::kNONE); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kNONE."; + } + } else { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + NoNesterov, RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(NoNesterov, + RegularizationType::kNONE); + VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE."; + } + } + } + } + VLOG(10) + << "Launch MergedMomentum kernel with multi_lr and regularization."; + } } }; diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 997cb610fafca5..324cd4b1b161f5 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -827,7 +827,7 @@ GetVarBaseListFromArgs(const std::string& op_type, const std::string& arg_name, bool dispensable = false) { PyObject* list = PyTuple_GET_ITEM(args, arg_idx); - if (list == nullptr) { + if (list == nullptr || list == Py_None) { if (!dispensable) { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument '%s' (position %d) must be list of Tensor, but got " diff --git a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py index 96e458795a3c08..9bc3bb7ad341f0 100644 --- a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py @@ -130,6 +130,130 @@ def run_momentum_op(params, return exe.run(main, feed=feed_dict, fetch_list=fetch_list) +def run_momentum_op2(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False, + use_nesterov=True): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g 
in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': 'l2_decay', + 'regularization_coeff': 2.0, + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': + ['l2_decay' for i in range(len(param_vars))], + 'regularization_coeff': [2.0 for i in range(len(param_vars))], + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + class TestMergedMomentum(unittest.TestCase): def setUp(self): paddle.enable_static() @@ -193,5 +317,78 @@ def test_main(self): self.check_with_place(place, multi_precision) +class TestMergedMomentum2(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float16 if multi_precision and isinstance( + place, paddle.CUDAPlace) else np.float32 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + 
else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_nesterov, use_merged): + # FIXME(zengjinle): CPU Momentum Op does not support rescale_grad + rescale_grad = 1.0 if isinstance(place, paddle.CPUPlace) else 0.01 + return run_momentum_op2( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged, + use_nesterov=use_nesterov) + + outs1 = run_op(use_nesterov=True, use_merged=True) + outs2 = run_op(use_nesterov=True, use_merged=False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + if isinstance(place, paddle.CUDAPlace): + self.assertTrue(np.array_equal(out1, out2)) + else: + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + outs3 = run_op(use_nesterov=False, use_merged=True) + outs4 = run_op(use_nesterov=False, use_merged=False) + self.assertEqual(len(outs3), len(outs4)) + for j, (out3, out4) in enumerate(zip(outs3, outs4)): + if isinstance(place, paddle.CUDAPlace): + self.assertTrue(np.array_equal(out3, out4)) + else: + self.assertTrue(np.allclose(out3, out4, atol=1e-7)) + + def get_places(self): + places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + return places + + def test_main(self): + for multi_precision in [False, True]: + for place in self.get_places(): + self.check_with_place(place, multi_precision) + + if __name__ == "__main__": unittest.main() From d93ee063ce6473da487b45feba1899c3efe31744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 30 Nov 2021 13:50:20 +0100 Subject: [PATCH 008/124] Add new unittests for gIOHW format in conv_transpose_mkldnn_op (#37344) * Add new unittests * Replace I with O channel for filter groups * Undo changes affecting other operators * Fix oneDNN namespace typo * Fix code format error --- .../mkldnn/conv_transpose_mkldnn_op.cc | 10 ++++----- .../mkldnn/test_conv2d_transpose_mkldnn_op.py | 21 +++++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 35e35eb4bcb55d..4a3d1f455bd265 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -26,14 +26,12 @@ using Tensor = framework::Tensor; using framework::DataLayout; inline dnnl::memory::dims GetWeightsTz(const Tensor* filter, const int groups) { - auto iohw_weights_tz = framework::vectorize(filter->dims()); - auto weights_tz = iohw_weights_tz; - - // IOHW -> OIHW - weights_tz[0] = iohw_weights_tz[1]; - weights_tz[1] = iohw_weights_tz[0]; + auto weights_tz = framework::vectorize(filter->dims()); int g = std::max(groups, 1); + int g_dim = (g > 1) ? 
1 : 0; platform::GetGroupConvWeightsTz(weights_tz, g); + // gIOHW -> gOIHW || IOHW -> OIHW + std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); return weights_tz; } diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py index 86609f015a2605..a36fc28013bb4c 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py @@ -154,6 +154,27 @@ def init_test_case(self): self.padding_algorithm = "EXPLICIT" +class TestMKLDNNWithGroups(TestConv2DTransposeMKLDNNOp): + def init_test_case(self): + TestConv2DTransposeMKLDNNOp.init_test_case(self) + self.pad = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + +class TestMKLDNNWithGroups_NHWC(TestConv2DTransposeMKLDNNOp): + def init_test_case(self): + TestConv2DTransposeMKLDNNOp.init_test_case(self) + self.pad = [1, 1] + self.groups = 2 + self.input_size = [2, 5, 5, 4] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3] + self.data_format = 'NHWC' + + if __name__ == '__main__': enable_static() unittest.main() From 5747fd1e6b6ee1b1eebd5d3e79602fb98289b251 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 30 Nov 2021 21:13:37 +0800 Subject: [PATCH 009/124] add pten_transpose dependence device_context (#37705) --- paddle/pten/kernels/math/cpu/CMakeLists.txt | 2 +- paddle/pten/kernels/math/cuda/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/pten/kernels/math/cpu/CMakeLists.txt b/paddle/pten/kernels/math/cpu/CMakeLists.txt index b30d6d96bdd2d1..235a49a5e4af51 100644 --- a/paddle/pten/kernels/math/cpu/CMakeLists.txt +++ b/paddle/pten/kernels/math/cpu/CMakeLists.txt @@ -1 +1 @@ -cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor) +cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor device_context) diff --git a/paddle/pten/kernels/math/cuda/CMakeLists.txt b/paddle/pten/kernels/math/cuda/CMakeLists.txt index 5e14f759a6dfcc..b0be23bb092416 100644 --- a/paddle/pten/kernels/math/cuda/CMakeLists.txt +++ b/paddle/pten/kernels/math/cuda/CMakeLists.txt @@ -1,5 +1,5 @@ if(WITH_GPU) - nv_library(pten_transpose_cuda SRCS transpose.cu DEPS dense_tensor malloc) + nv_library(pten_transpose_cuda SRCS transpose.cu DEPS dense_tensor malloc device_context) elseif(WITH_ROCM) - hip_library(pten_transpose_cuda SRCS transpose.cu DEPS dense_tensor malloc) + hip_library(pten_transpose_cuda SRCS transpose.cu DEPS dense_tensor malloc device_context) endif() From fab92824b5994cbcec5d7282fb9369bba5596419 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Tue, 30 Nov 2021 14:41:32 +0100 Subject: [PATCH 010/124] refactoring matmul_v2 mkldnn hierarchy (#37622) * refactoring matmul hierarchy * review fix * review fix * review_FIX-part2 --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 135 +++++++++--------- 1 file changed, 71 insertions(+), 64 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 0457aeed616fa3..0266edac75d1ef 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -25,9 +25,9 @@ using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::to_void_cast; using 
Tensor = paddle::framework::Tensor; -using paddle::framework::vectorize; -using paddle::framework::make_ddim; using paddle::framework::GradVarName; +using paddle::framework::make_ddim; +using paddle::framework::vectorize; template class MatMulV2MKLDNNHandler @@ -123,45 +123,58 @@ class MatMulV2MKLDNNHandler } }; -template -class MatMulV2MKLDNNKernel - : public paddle::operators::MatMulGradMKLDNNKernel { - public: - void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } +bool IsOutputFused(const ExecutionContext& ctx) { + auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); + auto& fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); + return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); +} + +float ComputeOutputScale(const ExecutionContext& ctx) { + float scale_x = ctx.Attr("Scale_x"); + float scale_y = ctx.Attr("Scale_y"); + bool force_fp32_out = ctx.Attr("force_fp32_output"); + float scale_out = force_fp32_out ? 1.f : ctx.Attr("Scale_out"); + return scale_out / (scale_x * scale_y); +} - protected: - void ExecuteMatMul(const ExecutionContext& ctx, +template +void ExecuteMatMulV2(const ExecutionContext& ctx, const MKLDNNDeviceContext& dev_ctx, const dnnl::engine onednn_engine, paddle::platform::Place cpu_place, const Tensor* x, std::vector& x_dims, bool trans_x, const Tensor* y, std::vector& y_dims, bool trans_y, Tensor* out, std::vector& out_dims, - int execution_number = 0) const { - MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, - trans_x, y_dims, trans_y, - IsOutputFused(ctx)); + int execution_number = 0) { + MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, + trans_x, y_dims, trans_y, + IsOutputFused(ctx)); - const auto src_memory_p = handler.AcquireSrcMemory(x); - const auto weights_memory_p = handler.AcquireWeightsMemory(y); - const auto dst_memory_p = handler.AcquireDstMemory(out); + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); - auto matmul_p = handler.AcquireForwardPrimitive(); + auto matmul_p = handler.AcquireForwardPrimitive(); - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; - auto& astream = MKLDNNDeviceContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); + auto& astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); - auto format = paddle::platform::MKLDNNFormatForSize( - out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_layout(paddle::framework::DataLayout::kMKLDNN); - out->set_format(format); - } + auto format = paddle::platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw); + out->set_layout(paddle::framework::DataLayout::kMKLDNN); + out->set_format(format); +} + +template +class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } private: void CalculateMatrixDims(const ExecutionContext& ctx, @@ -207,13 +220,6 @@ class MatMulV2MKLDNNKernel } } - bool IsOutputFused(const ExecutionContext& ctx) const { - auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); - auto& fused_transpose_Out = - 
ctx.Attr>("fused_transpose_Out"); - return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); - } - void RunKernel(const ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); @@ -237,13 +243,14 @@ class MatMulV2MKLDNNKernel CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, out); - ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_bd_dims, - trans_x, y, y_bd_dims, trans_y, out, out_dims); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, + x_bd_dims, trans_x, y, y_bd_dims, trans_y, out, + out_dims); } }; template -class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { +class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { public: void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } @@ -316,7 +323,7 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { // if no broadcasting is needed, we can simply use matmul's grad and avoid // using reduce_sum if (!is_broadcast) { - paddle::operators::MatMulGradMKLDNNKernel::Compute(ctx); + matmul_v1_grad_mkldnn_kernel.Compute(ctx); return; } @@ -342,33 +349,29 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { dy_bd_dims); if (trans_x && trans_y) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, - y_dims, true, dout, dout_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims, - 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims, + true, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, true, &dy_tmp, dy_bd_dims, + 2); } else if (trans_x) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, - y_dims, false, dout, dout_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, - x_dims, false, dout, dout_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), y, y_dims, + false, dout, dout_dims, true, &dx_tmp, dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims, + false, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2); } else if (trans_y) { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, false, y, y_dims, false, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, true, x, x_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, false, &dx_tmp, + dx_bd_dims, 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, true, x, x_dims, false, &dy_tmp, dy_bd_dims, + 2); } else { - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, - dout_dims, false, y, y_dims, true, &dx_tmp, - dx_bd_dims, 1); - this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, - x_dims, true, dout, dout_dims, false, &dy_tmp, - dy_bd_dims, 2); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), dout, + dout_dims, false, y, y_dims, true, &dx_tmp, dx_bd_dims, + 1); + ExecuteMatMulV2(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), x, x_dims, + true, dout, dout_dims, false, &dy_tmp, dy_bd_dims, 2); } if (x_dims != dx_bd_dims) { @@ 
-389,8 +392,12 @@ class MatMulV2GradMKLDNNKernel : public MatMulV2MKLDNNKernel { dy->set_layout(paddle::framework::DataLayout::kMKLDNN); dy->set_format(y->format()); } + + private: + paddle::operators::MatMulGradMKLDNNKernel matmul_v1_grad_mkldnn_kernel; }; } // anonymous namespace + namespace ops = paddle::operators; REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace, From 82b55961cdeb683a59870837c844b8d1171d48e4 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Tue, 30 Nov 2021 14:49:14 +0100 Subject: [PATCH 011/124] add matmul_v2_transpose_reshape_fuse_pass to quant2_int8_mkldnn_pass.py (#37619) --- .../fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 7930923668c7d7..bc97e5cf6c9586 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -419,6 +419,7 @@ def _optimize_fp32_graph(self, graph): if self._is_fc_quantized(graph): graph = self._apply_pass(graph, 'fc_mkldnn_pass') graph = self._apply_pass(graph, 'matmul_transpose_reshape_fuse_pass') + graph = self._apply_pass(graph, 'matmul_v2_transpose_reshape_fuse_pass') # the following pass should be the last one since it will work on all fused ops. graph = self._apply_pass(graph, 'runtime_context_cache_pass') return graph From 8a4460f522e21be9060890882ac1365613950585 Mon Sep 17 00:00:00 2001 From: WangXi Date: Tue, 30 Nov 2021 22:25:39 +0800 Subject: [PATCH 012/124] [fleet_executor] interceptor run from python interface (#37693) --- .../distributed/fleet_executor/carrier.cc | 43 ++++++++++++------- .../distributed/fleet_executor/carrier.h | 3 ++ .../fleet_executor/compute_interceptor.cc | 24 +++++------ .../distributed/fleet_executor/interceptor.cc | 14 ++++-- .../distributed/fleet_executor/message_bus.cc | 3 ++ .../fleet_executor/runtime_graph.cc | 1 + 6 files changed, 58 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 108a21b92fdfd2..8a4f10473e3d27 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -92,19 +92,22 @@ Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) { } void Carrier::Start() { - // TODO(fleet_executor dev): this start is a faked one, need replace - for (const auto& pair : interceptor_idx_to_interceptor_) { - VLOG(3) << "Fake run is sending start to interceptor " << pair.first << "."; - InterceptorMessage tmp_msg; - tmp_msg.set_src_id(pair.first); - tmp_msg.set_dst_id(pair.first); - tmp_msg.set_message_type(DATA_IS_READY); - MessageBus& message_bus_instance = MessageBus::Instance(); - PADDLE_ENFORCE_EQ(message_bus_instance.IsInit(), true, - platform::errors::PreconditionNotMet( - "Message bus has not been initialized.")); - message_bus_instance.Send(tmp_msg); + MessageBus& msg_bus = MessageBus::Instance(); + PADDLE_ENFORCE_EQ(msg_bus.IsInit(), true, + platform::errors::PreconditionNotMet( + "Message bus has not been initialized.")); + + for (int64_t id : source_interceptor_ids_) { + VLOG(3) << "Carrier Start is sending start to source interceptor " << id + << "."; + InterceptorMessage start_msg; + // source node data_is_ready is send by carrier, so set src_id=-1 + start_msg.set_src_id(-1); + 
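    // Note: -1 is not a real interceptor id; it marks the message as originating
    // from the carrier itself. MessageBus::IsSameRank (changed later in this same
    // patch) maps src_id == -1 onto the destination id, so this DATA_IS_READY
    // message is always treated as a local, same-rank send to the source interceptor.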
start_msg.set_dst_id(id); + start_msg.set_message_type(DATA_IS_READY); + msg_bus.Send(start_msg); } + std::unique_lock lock(running_mutex_); cond_var_.wait(lock); dev_ctx_->Wait(); @@ -164,16 +167,26 @@ void Carrier::CreateInterceptors() { int64_t interceptor_id = item.first; TaskNode* task_node = item.second; - // TODO(wangxi): use node_type to select different Interceptor - auto interceptor = - std::make_unique(interceptor_id, task_node); + std::unique_ptr interceptor; + if (task_node->type().empty()) { + // TODO(wangxi): delete this in future + interceptor.reset(new Interceptor(interceptor_id, task_node)); + } else { + interceptor = InterceptorFactory::Create(task_node->type(), + interceptor_id, task_node); + } interceptor->SetPlace(place_); interceptor->SetMiniBatchScope(minibatch_scope_); interceptor->SetMicroBatchScope(microbatch_scopes_); interceptor->SetRootScope(root_scope_); + SetInterceptor(interceptor_id, std::move(interceptor)); VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id << "."; + + if (task_node->upstream().empty()) { + source_interceptor_ids_.emplace_back(interceptor_id); + } } // The carrier will be always waiting for outside initializer // since there is no interceptor has been created during auto init diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index c4c6a418464747..b5976b297f9139 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -90,6 +91,8 @@ class Carrier final { std::unordered_map> interceptor_idx_to_interceptor_; + std::vector source_interceptor_ids_; + std::vector message_tmp_{}; std::mutex tmp_message_mutex_; bool creating_interceptors_{true}; diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 3008c83069942c..fd55aa2aa1c465 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -154,18 +154,6 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { } void ComputeInterceptor::Run() { - // If there is no limit, source interceptor can be executed - // an unlimited number of times. - // Now source node can only run - if (ShouldReset()) { - for (auto& out_buff : out_buffs_) { - // buffer is using - if (out_buff.second.second != 0) return; - } - step_ = 0; // reset - return; - } - while (IsInputReady() && CanWriteOutput() && !ShouldReset()) { VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running"; @@ -181,6 +169,18 @@ void ComputeInterceptor::Run() { // reply to upstream and decrease ready data ReplyCompletedToUpStream(); } + + // If there is no limit, source interceptor can be executed + // an unlimited number of times. + // Now source node can only run max_run_times. 
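  // (This block was moved here from the top of Run() by this patch: checking
  // ShouldReset() only after the compute loop lets the interceptor first consume any
  // inputs that are already ready, and step_ is reset only once every out_buff has
  // been fully acknowledged downstream, i.e. no output buffer is still in use.)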
+ if (ShouldReset()) { + for (auto& out_buff : out_buffs_) { + // buffer is using + if (out_buff.second.second != 0) return; + } + step_ = 0; // reset + return; + } } void ComputeInterceptor::ReceivedStop(int64_t up_id) { diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index 40429502825c9c..63c2bb3fc6eecb 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -46,11 +46,19 @@ void Interceptor::Handle(const InterceptorMessage& msg) { VLOG(3) << "Interceptor is using default message handler. This handler is " "only used for test purpose. Check whether you init interceptor " "in the proper way."; + if (msg.message_type() == DATA_IS_READY) { + if (node_->role() != 2) { + VLOG(3) << "Fake handler is sending DATA_IS_READY message to: " + << interceptor_id_ + 1 << "."; + InterceptorMessage data_is_ready_msg; + data_is_ready_msg.set_message_type(DATA_IS_READY); + Send(interceptor_id_ + 1, data_is_ready_msg); + } VLOG(3) << "Fake handler is sending stop message to it self."; - InterceptorMessage msg; - msg.set_message_type(STOP); - Send(interceptor_id_, msg); + InterceptorMessage stop_msg; + stop_msg.set_message_type(STOP); + Send(interceptor_id_, stop_msg); } else if (msg.message_type() == STOP) { stop_ = true; StopCarrier(); diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 2071477372c9e7..de2171e68e19e2 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -136,6 +136,9 @@ void MessageBus::ListenPort() { } bool MessageBus::IsSameRank(int64_t src_id, int64_t dst_id) { + // -1 is sent by carrier to source interceptor + if (src_id == -1) src_id = dst_id; + // check whether the dst is the same rank or different rank with src const auto& src_rank = interceptor_id_to_rank_.find(src_id); const auto& dst_rank = interceptor_id_to_rank_.find(dst_id); diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 3a76bd43f9d55b..b32db6c2294b80 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -112,6 +112,7 @@ void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) { for (const auto& op_desc : program.Block(0).AllOps()) { ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); } + std::unordered_map> role_to_ops; for (const auto& op : ops_) { int32_t op_role = op->Attr("op_role"); From 661dbdbee5c1112a88d22fd0a76d4de06d2ca705 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 1 Dec 2021 09:43:25 +0800 Subject: [PATCH 013/124] Modify ShareTensorWithCinnBuffer by callback to save memory (#37493) Modify ShareTensorWithCinnBuffer by callback to save memory --- .../framework/paddle2cinn/cinn_compiler.cc | 1 + paddle/fluid/operators/cinn_launch_op.cc | 82 +++++++++++-------- paddle/fluid/operators/cinn_launch_op.h | 28 +++---- paddle/fluid/operators/cinn_launch_op_test.cc | 54 ++++-------- 4 files changed, 78 insertions(+), 87 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 3f1b6c78d8417d..360c9270782083 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -209,6 
+209,7 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; + options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/operators/cinn_launch_op.cc b/paddle/fluid/operators/cinn_launch_op.cc index e70a51d8805165..f0ad5b3c3bf996 100644 --- a/paddle/fluid/operators/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn_launch_op.cc @@ -13,7 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/cinn_launch_op.h" + +#include #include + #include "paddle/fluid/string/string_helper.h" DECLARE_bool(cudnn_deterministic); @@ -108,33 +111,9 @@ std::unordered_set CinnLaunchContext::GetInternalVariableNames() { return all_parameters; } -void CinnLaunchContext::MutableTensorData(const std::string& var_name, - const platform::Place& place, - LoDTensor* paddle_tensor, - bool is_internal_var) { - auto cinn_name = var_name; - if (!is_internal_var) { - PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Paddle variable(%s) not used by cinn", var_name)); - cinn_name = paddle2cinn_varmap_.at(var_name); - } - - auto cinn_tensor = GetCinnTensor(cinn_name); - // TODO(CtfGo): support mutable corresponding c++ type after CINN ready - VLOG(4) << "Only support float in cinn_launch op now."; - paddle_tensor->mutable_data( - framework::make_ddim(cinn_tensor->shape().data()), place); -} - void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, const LoDTensor& paddle_tensor, const CinnTensor& cinn_tensor) { - PADDLE_ENFORCE_EQ( - paddle_tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor in variable(%s) is not initialized.", paddle_name)); - // check dimension auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, @@ -147,27 +126,39 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, } void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name, + const platform::Place& place, LoDTensor* paddle_tensor) { PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true, platform::errors::InvalidArgument( "Paddle variable(%s) not used by cinn", paddle_name)); const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name); - CheckTensorEquivalent(paddle_name, *paddle_tensor, GetCinnTensor(cinn_name)); - return SetArgument(cinn_name, paddle_tensor); + CinnTensor cinn_tensor = GetCinnTensor(cinn_name); + if (!paddle_tensor->IsInitialized()) { + paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + } + CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor); + return SetArgument(cinn_name, place, /* free_mem_callback = */ false, + paddle_tensor); } void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name, + const platform::Place& place, LoDTensor* paddle_tensor) { PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0, platform::errors::InvalidArgument( "Variable(%s) not found in cinn socpe.", cinn_name)); - CheckTensorEquivalent(cinn_name, *paddle_tensor, GetCinnTensor(cinn_name)); - return SetArgument(cinn_name, paddle_tensor); + CinnTensor cinn_tensor = GetCinnTensor(cinn_name); + if (!paddle_tensor->IsInitialized()) { + paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + } + 
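  // Only the shape is fixed here; no memory is allocated yet. The actual allocation
  // is deferred to the external_malloc callback installed in ShareTensorWithCinnBuffer
  // below, which appears to be invoked through the buffer-handle instructions enabled
  // by with_buffer_handle_instruction_inserted, and internal variables are released
  // again via the external_free callback once the instruction is done with them.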
CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor); + return SetArgument(cinn_name, place, /* free_mem_callback = */ true, + paddle_tensor); } std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( - LoDTensor* tensor) { + const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) { // convert paddle dimensions array to cinn format std::vector cinn_dims(tensor->dims().size()); for (auto i = 0; i < tensor->dims().size(); ++i) { @@ -177,19 +168,42 @@ std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( auto cinn_buffer = std::make_unique(); // assign size and memory cinn_buffer->resize(cinn_dims.data(), cinn_dims.size()); - cinn_buffer->memory = reinterpret_cast(tensor->data()); + + cinn_buffer->external_malloc = new std::function( + [place, tensor](void* ctx, cinn_buffer_t* buffer) { + buffer->memory = + reinterpret_cast(tensor->mutable_data(place)); + return 0; + }); + + if (free_mem_callback) { + cinn_buffer->external_free = new std::function( + [tensor](void* ctx, cinn_buffer_t* buffer) { + tensor->clear(); + return 0; + }); + return cinn_buffer; + } + + cinn_buffer->external_free = new std::function( + [](void* ctx, cinn_buffer_t* buffer) { + // Do nothing + return 0; + }); return cinn_buffer; } void CinnLaunchContext::SetArgument(const std::string& cinn_name, + const platform::Place& place, + bool free_mem_callback, LoDTensor* paddle_tensor) { - auto buffer = ShareTensorWithCinnBuffer(paddle_tensor); + auto buffer = + ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor); name2argument_.emplace(cinn_name, buffer.get()); hold_buffers_.emplace_back(std::move(buffer)); VLOG(4) << "SetArgument-" << name2argument_.size() << ": " - << "name(" << cinn_name << "), " - << "type(" << framework::DataTypeToString(paddle_tensor->type()) - << "), dims(" << paddle_tensor->dims() << ")."; + << "name(" << cinn_name << "), dims(" << paddle_tensor->dims() + << ")."; } const std::map& diff --git a/paddle/fluid/operators/cinn_launch_op.h b/paddle/fluid/operators/cinn_launch_op.h index 53e6ff0d613873..2b1bf89197dffb 100644 --- a/paddle/fluid/operators/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn_launch_op.h @@ -49,16 +49,13 @@ class CinnLaunchContext { // Return whether a Paddle variable used on compiled kernels bool IsVariableUsed(const std::string& var_name); - // Allocate buffer to a Paddle tensor with assginment information from CINN - void MutableTensorData(const std::string& var_name, - const platform::Place& place, LoDTensor* paddle_tensor, - bool is_internal_var = false); - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name, LoDTensor* tensor); + void AssignExternalVariable(const std::string& var_name, + const platform::Place& place, LoDTensor* tensor); // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name, LoDTensor* tensor); + void AssignInternalVariable(const std::string& var_name, + const platform::Place& place, LoDTensor* tensor); // Extract internal variable names from CinnScope // by excluding used input and output variables @@ -83,10 +80,12 @@ class CinnLaunchContext { // Share the buffer of a Paddle tensor to CINN by delivering memory address // to a cinn_buffer_t object - std::unique_ptr ShareTensorWithCinnBuffer(LoDTensor* tensor); + std::unique_ptr ShareTensorWithCinnBuffer( + const platform::Place& place, bool free_mem_callback, LoDTensor* tensor); // Set an argument with (cinn name)->(paddle tensor) pair - void 
SetArgument(const std::string& cinn_name, LoDTensor* paddle_tensor); + void SetArgument(const std::string& cinn_name, const platform::Place& place, + bool free_mem_callback, LoDTensor* paddle_tensor); private: // a variable name map from paddle to cinn @@ -198,7 +197,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { } launch_context->AssignExternalVariable( - var_name, scope.GetVar(var_name)->GetMutable()); + var_name, place, scope.GetVar(var_name)->GetMutable()); } // 3.2 Prepare output variables: all output variables should @@ -215,11 +214,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { "Output variable(%s) not used by cinn", var_name)); auto* tensor = scope.GetVar(var_name)->GetMutable(); - if (!tensor->IsInitialized()) { - launch_context->MutableTensorData(var_name, place, tensor); - } - launch_context->AssignExternalVariable( - var_name, scope.GetVar(var_name)->GetMutable()); + launch_context->AssignExternalVariable(var_name, place, tensor); } // 3.3 Prepare internal or temporary variables: Create a temporary @@ -232,8 +227,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { framework::Scope* temp_scope = scope.NewTmpScope().release(); for (const auto& var_name : internal_variable_names) { auto* tensor = temp_scope->Var(var_name)->GetMutable(); - launch_context->MutableTensorData(var_name, place, tensor, true); - launch_context->AssignInternalVariable(var_name, tensor); + launch_context->AssignInternalVariable(var_name, place, tensor); } // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. diff --git a/paddle/fluid/operators/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn_launch_op_test.cc index 5a07a49a5969aa..5e0b87d06afeaf 100644 --- a/paddle/fluid/operators/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn_launch_op_test.cc @@ -222,30 +222,9 @@ TEST(CinnLaunchContextTest, TestGetInternalVariableNames) { auto launch_context = std::make_unique(GetDefaultCompiledObj()); auto internal_variable_names = launch_context->GetInternalVariableNames(); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); -} - -TEST(CinnLaunchContextTest, TestMutableTensorData) { - platform::CPUPlace place; - framework::Scope scope; - auto* tensor1 = scope.Var("var1")->GetMutable(); - auto* tensor2 = scope.Var("var2")->GetMutable(); - - auto launch_context = - std::make_unique(GetDefaultCompiledObj()); - // mutable_data on external variable - ASSERT_NO_THROW(launch_context->MutableTensorData("var1", place, tensor1)); - ASSERT_TRUE(tensor1->IsInitialized()); - ASSERT_EQ(tensor1->dims(), framework::make_ddim({3, 4})); - ASSERT_THROW(launch_context->MutableTensorData("not_exist", place, tensor1), - paddle::platform::EnforceNotMet); - - // mutable_data on internal variable - ASSERT_NO_THROW( - launch_context->MutableTensorData("cinn_var2", place, tensor2, true)); - ASSERT_TRUE(tensor2->IsInitialized()); - ASSERT_EQ(tensor2->dims(), framework::make_ddim({6, 7, 8})); + ASSERT_EQ(internal_variable_names.size(), 3); + EXPECT_NE(internal_variable_names.find("cinn_var2"), + internal_variable_names.end()); } TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { @@ -255,12 +234,9 @@ TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { framework::Scope scope; auto* tensor1 = scope.Var("var1")->GetMutable(); - // CheckTensorEquivalent: tensor is not initialized - ASSERT_THROW(launch_context->AssignExternalVariable("var1", tensor1), - paddle::platform::EnforceNotMet); // CheckTensorEquivalent: 
tensor dimension not equivalent tensor1->mutable_data(framework::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1", tensor1), + ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1), paddle::platform::EnforceNotMet); } @@ -272,11 +248,12 @@ TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { auto* tensor4 = scope.Var("var4")->GetMutable(); // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4", tensor4), + ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4), paddle::platform::EnforceNotMet); // not found - ASSERT_THROW(launch_context->AssignExternalVariable("cinn_var4", tensor4), - paddle::platform::EnforceNotMet); + ASSERT_THROW( + launch_context->AssignExternalVariable("cinn_var4", place, tensor4), + paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestSetArgument) { @@ -286,22 +263,25 @@ TEST(CinnLaunchContextTest, TestSetArgument) { platform::CPUPlace place; framework::Scope scope; auto* tensor1 = scope.Var("var1")->GetMutable(); - tensor1->mutable_data(framework::make_ddim({3, 4}), place); - auto* data1 = tensor1->data(); + float* data1 = + tensor1->mutable_data(framework::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; // assign external variable - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1", tensor1)); + ASSERT_NO_THROW( + launch_context->AssignExternalVariable("var1", place, tensor1)); auto* tensor2 = scope.Var("var2")->GetMutable(); tensor2->mutable_data(framework::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2", tensor2)); + ASSERT_NO_THROW( + launch_context->AssignInternalVariable("cinn_var2", place, tensor2)); // FinalizeArguments not missed check ASSERT_THROW(launch_context->FinalizeArguments(), paddle::platform::EnforceNotMet); auto* tensor3 = scope.Var("var3")->GetMutable(); tensor3->mutable_data(framework::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3", tensor3)); + ASSERT_NO_THROW( + launch_context->AssignExternalVariable("var3", place, tensor3)); auto name2argument = launch_context->FinalizeArguments(); ASSERT_EQ(name2argument.size(), 3); @@ -310,6 +290,8 @@ TEST(CinnLaunchContextTest, TestSetArgument) { auto* cinn_buffer = static_cast(name2argument.at("cinn_var1")); + ASSERT_EQ(cinn_buffer->memory, nullptr); + cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); ASSERT_EQ(cinn_buffer->num_elements(), 12); auto* shadow_data = reinterpret_cast(cinn_buffer->memory); From 8ac0344a4ba3f291fa170145504cbfd9ead03d2c Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 1 Dec 2021 10:36:47 +0800 Subject: [PATCH 014/124] Add paddle.rad2deg and paddle.deg2rad (#37598) --- python/paddle/__init__.py | 4 + .../fluid/tests/unittests/test_deg2rad.py | 75 ++++++++++++ .../fluid/tests/unittests/test_rad2deg.py | 92 ++++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/math.py | 114 +++++++++++++++++- 5 files changed, 285 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_deg2rad.py create mode 100644 python/paddle/fluid/tests/unittests/test_rad2deg.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index af42cbfc88eb1a..73b186efed37b3 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -223,6 +223,8 @@ from .tensor.math import digamma # noqa: F401 from .tensor.math import neg # noqa: F401 
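# The two imports added just below expose the new conversion helpers at the top-level
# paddle namespace. A minimal usage sketch (values rounded, based on the docstrings
# added later in this commit):
#
#     import paddle
#     paddle.rad2deg(paddle.to_tensor([3.1416, 1.5708]))      # -> approx. [180.0, 90.0]
#     paddle.deg2rad(paddle.to_tensor([180.0, -90.0]))        # -> approx. [3.1416, -1.5708]
#
# Internally both are thin wrappers around the scale op, multiplying by 180/pi and
# pi/180 respectively, and integer inputs are cast to float32 first.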
from .tensor.math import lgamma # noqa: F401 +from .tensor.math import rad2deg # noqa: F401 +from .tensor.math import deg2rad # noqa: F401 from .tensor.math import diff # noqa: F401 from .tensor.random import multinomial # noqa: F401 @@ -458,6 +460,8 @@ 'ceil', 'atan', 'atan2', + 'rad2deg', + 'deg2rad', 'expand', 'broadcast_to', 'ones_like', diff --git a/python/paddle/fluid/tests/unittests/test_deg2rad.py b/python/paddle/fluid/tests/unittests/test_deg2rad.py new file mode 100644 index 00000000000000..31219d5ab97af1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_deg2rad.py @@ -0,0 +1,75 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestDeg2radAPI(unittest.TestCase): + def setUp(self): + self.x_dtype = 'float64' + self.x_np = np.array( + [180.0, -180.0, 360.0, -360.0, 90.0, -90.0]).astype(np.float64) + self.x_shape = [6] + self.out_np = np.deg2rad(self.x_np) + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x = fluid.data(name='input', dtype=self.x_dtype, shape=self.x_shape) + out = paddle.deg2rad(x) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input': self.x_np}, + fetch_list=[out]) + self.assertTrue((np.array(out[0]) == self.out_np).all()) + + def test_dygraph(self): + paddle.disable_static() + x1 = paddle.to_tensor([180.0, -180.0, 360.0, -360.0, 90.0, -90.0]) + result1 = paddle.deg2rad(x1) + self.assertEqual(np.allclose(self.out_np, result1.numpy()), True) + + paddle.enable_static() + + +class TestDeg2radAPI2(TestDeg2radAPI): + # Test input data type is int + def setUp(self): + self.x_np = 180 + self.x_shape = [1] + self.out_np = np.pi + self.x_dtype = 'int64' + + def test_dygraph(self): + paddle.disable_static() + + x2 = paddle.to_tensor(180) + result2 = paddle.deg2rad(x2) + self.assertEqual(np.allclose(np.pi, result2.numpy()), True) + + paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_rad2deg.py b/python/paddle/fluid/tests/unittests/test_rad2deg.py new file mode 100644 index 00000000000000..9f117cbab9a4d9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rad2deg.py @@ -0,0 +1,92 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestRad2degAPI(unittest.TestCase): + def setUp(self): + self.x_dtype = 'float64' + self.x_np = np.array( + [3.142, -3.142, 6.283, -6.283, 1.570, -1.570]).astype(np.float64) + self.x_shape = [6] + self.out_np = np.rad2deg(self.x_np) + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x = fluid.data(name='input', dtype=self.x_dtype, shape=self.x_shape) + out = paddle.rad2deg(x) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input': self.x_np}, + fetch_list=[out]) + self.assertTrue((np.array(out[0]) == self.out_np).all()) + + def test_dygraph(self): + paddle.disable_static() + x1 = paddle.to_tensor([3.142, -3.142, 6.283, -6.283, 1.570, -1.570]) + result1 = paddle.rad2deg(x1) + self.assertEqual(np.allclose(self.out_np, result1.numpy()), True) + + paddle.enable_static() + + +class TestRad2degAPI2(TestRad2degAPI): + def setUp(self): + self.x_np = np.pi / 2 + self.x_shape = [1] + self.out_np = 90 + self.x_dtype = 'float32' + + def test_dygraph(self): + paddle.disable_static() + + x2 = paddle.to_tensor(np.pi / 2) + result2 = paddle.rad2deg(x2) + self.assertEqual(np.allclose(90, result2.numpy()), True) + + paddle.enable_static() + + +class TestRad2degAPI3(TestRad2degAPI): + # Test input data type is int + def setUp(self): + self.x_np = 1 + self.x_shape = [1] + self.out_np = 180 / np.pi + self.x_dtype = 'int64' + + def test_dygraph(self): + paddle.disable_static() + + x2 = paddle.to_tensor(1) + result2 = paddle.rad2deg(x2) + self.assertEqual(np.allclose(180 / np.pi, result2.numpy()), True) + + paddle.enable_static() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 23e956b6590d4c..d8acb946a6d976 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -189,6 +189,8 @@ from .math import neg # noqa: F401 from .math import lgamma # noqa: F401 from .math import diagonal # noqa: F401 +from .math import rad2deg # noqa: F401 +from .math import deg2rad # noqa: F401 from .math import diff # noqa: F401 from .random import multinomial # noqa: F401 diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 0a5930d91adbba..66c3eaece76d43 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2612,6 +2612,117 @@ def atan2(x, y, name=None): type='atan2', inputs=inputs, outputs={'Out': out}) return out +def rad2deg(x, name=None): + """ + Convert each of the elements of input x from angles in radians to degrees. + + Equation: + .. math:: + + rad2deg(x)=180/ \pi * x + + Args: + x (Tensor): An N-D Tensor, the data type is float32, float64, int32, int64. 
+ name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the shape and data type is the same with input (The output data type is float32 when the input data type is int). + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor([3.142, -3.142, 6.283, -6.283, 1.570, -1.570]) + result1 = paddle.rad2deg(x1) + print(result1) + # Tensor(shape=[6], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [180.02334595, -180.02334595, 359.98937988, -359.98937988, + # 9.95437622 , -89.95437622]) + + x2 = paddle.to_tensor(np.pi/2) + result2 = paddle.rad2deg(x2) + print(result2) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [90.]) + + x3 = paddle.to_tensor(1) + result3 = paddle.rad2deg(x3) + print(result3) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [57.29578018]) + """ + rad2deg_scale = 180 / np.pi + if in_dygraph_mode(): + if convert_dtype(x.dtype) in ['int32', 'int64']: + x = cast(x, dtype="float32") + return _C_ops.scale(x, 'scale', rad2deg_scale) + else: + check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float32', 'float64'], 'rad2deg') + helper = LayerHelper('rad2deg', **locals()) + out_cast = x + if convert_dtype(x.dtype) in ['int32', 'int64']: + out_cast = helper.create_variable_for_type_inference(dtype=paddle.float32) + helper.append_op( + type='cast', inputs={'X':x}, outputs={'Out': out_cast}, attrs={'in_dtype': x.dtype,'out_dtype': paddle.float32}) + out = helper.create_variable_for_type_inference(dtype=out_cast.dtype) + helper.append_op( + type='scale', inputs={'X':out_cast}, outputs={'Out': out}, attrs={'scale': rad2deg_scale}) + return out + +def deg2rad(x, name=None): + """ + Convert each of the elements of input x from degrees to angles in radians. + + Equation: + .. math:: + + deg2rad(x)=\pi * x / 180 + + Args: + x (Tensor): An N-D Tensor, the data type is float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the shape and data type is the same with input (The output data type is float32 when the input data type is int). + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor([180.0, -180.0, 360.0, -360.0, 90.0, -90.0]) + result1 = paddle.deg2rad(x1) + print(result1) + # Tensor(shape=[6], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [3.14159274, -3.14159274, 6.28318548, -6.28318548, 1.57079637, + # -1.57079637]) + + x2 = paddle.to_tensor(180) + result2 = paddle.deg2rad(x2) + print(result2) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [3.14159274]) + """ + deg2rad_scale = np.pi / 180.0 + if in_dygraph_mode(): + if convert_dtype(x.dtype) in ['int32', 'int64']: + x = cast(x, dtype="float32") + return _C_ops.scale(x, 'scale', deg2rad_scale) + else: + check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float32', 'float64'], 'deg2rad') + helper = LayerHelper('deg2rad', **locals()) + out_cast = x + if convert_dtype(x.dtype) in ['int32', 'int64']: + out_cast = helper.create_variable_for_type_inference(dtype=paddle.float32) + helper.append_op( + type='cast', inputs={'X':x}, outputs={'Out': out_cast}, attrs={'in_dtype': x.dtype,'out_dtype': paddle.float32}) + out = helper.create_variable_for_type_inference(dtype=out_cast.dtype) + helper.append_op( + type='scale', inputs={'X':out_cast}, outputs={'Out': out}, attrs={'scale': deg2rad_scale}) + return out def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): r""" @@ -2646,6 +2757,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): .. code-block:: python import paddle + x = paddle.to_tensor([1, 4, 5, 2]) out = paddle.diff(x) print(out) @@ -2667,8 +2779,6 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): print(out) # out: # [[1, 1], [1, 1]] - - """ if axis < 0: From 9f61bc36f059759daf8598fc8c04e7532bbc02a9 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Wed, 1 Dec 2021 10:51:26 +0800 Subject: [PATCH 015/124] fix flatten in quant (#37722) --- .../paddle/fluid/contrib/slim/quantization/quantization_pass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 9b2954b13f2222..645feda21f0f35 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -136,7 +136,7 @@ "flatten": [["X"], ["Out"]], "flatten2": [["X"], ["Out"]], "unsqueeze2": [["X"], ["Out"]], - "flatten_contiguous_range": [['X'], ["Out", "XShape"]], + "flatten_contiguous_range": [['X'], ["Out"]], } _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] From 3ef89a47a480ca60bbf6d40abdd3c458e2e564fe Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 1 Dec 2021 11:11:35 +0800 Subject: [PATCH 016/124] Delete pylint for ci (#37720) Delete pylint for ci --- paddle/scripts/paddle_build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index fba7c8a71dcaa8..5c46f011a74d15 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -312,7 +312,6 @@ function check_style() { fi - pip install cpplint pylint pytest astroid isort # set up go environment for running gometalinter mkdir -p $GOPATH/src/github.com/PaddlePaddle/ ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle From 28b43111a437a89f9a698b1d903dfe27dcdd71ba Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Wed, 1 Dec 2021 11:20:12 +0800 
Subject: [PATCH 017/124] add angle_op (#37689) * add angle_op --- paddle/fluid/operators/angle_op.cc | 125 +++++++++++++++ paddle/fluid/operators/angle_op.cu | 31 ++++ paddle/fluid/operators/angle_op.h | 147 ++++++++++++++++++ paddle/fluid/platform/complex.h | 10 ++ python/paddle/__init__.py | 4 +- .../fluid/tests/unittests/test_angle_op.py | 109 +++++++++++++ python/paddle/tensor/__init__.py | 4 +- python/paddle/tensor/math.py | 55 +++++++ 8 files changed, 483 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/angle_op.cc create mode 100644 paddle/fluid/operators/angle_op.cu create mode 100644 paddle/fluid/operators/angle_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_angle_op.py diff --git a/paddle/fluid/operators/angle_op.cc b/paddle/fluid/operators/angle_op.cc new file mode 100644 index 00000000000000..3cb01486814965 --- /dev/null +++ b/paddle/fluid/operators/angle_op.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/angle_op.h" + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class AngleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "angle"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "angle"); + + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class AngleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of angle op."); + AddOutput("Out", "(Tensor), The output tensor of angle op."); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_cudnn", + "(bool, default false) Only used in cudnn kernel, need " + "install cudnn") + .SetDefault(false); + AddComment(R"DOC( +Angle Operator. + +This operator is used to perform elementwise angle for input $X$. 
+$$out = angle(x)$$ + +)DOC"); + } +}; + +class AngleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "angle_grad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Out@Grad", "angle_grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "angle_grad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class AngleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("angle_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(angle, ops::AngleOp, ops::AngleOpMaker, + ops::AngleGradMaker, + ops::AngleGradMaker); + +REGISTER_OP_CPU_KERNEL( + angle, ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); + +REGISTER_OPERATOR(angle_grad, ops::AngleGradOp); + +REGISTER_OP_CPU_KERNEL( + angle_grad, ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/angle_op.cu b/paddle/fluid/operators/angle_op.cu new file mode 100644 index 00000000000000..3264f426a77d1e --- /dev/null +++ b/paddle/fluid/operators/angle_op.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/angle_op.h" +#include "paddle/fluid/platform/complex.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + angle, ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); + +REGISTER_OP_CUDA_KERNEL( + angle_grad, ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h new file mode 100644 index 00000000000000..093a04f03df956 --- /dev/null +++ b/paddle/fluid/operators/angle_op.h @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include +#include "paddle/fluid/operators/math/complex_functors.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +namespace math { +template +struct AngleFunctor; + +// angel function for complex +template +struct AngleFunctor>> { + AngleFunctor(const T* input, Real* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = arg(input_[idx]); + } + + const T* input_; + Real* output_; + int64_t numel_; +}; + +// angel function for real +template +struct AngleFunctor>> { + AngleFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = input_[idx] < static_cast(0) ? M_PI : 0; + } + + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct AngleGradFunctor; + +// angle grad for complex +template +struct AngleGradFunctor>> { + AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + if (x_[idx] == T(0)) { + dx_[idx] = T(0); + } else { + const math::Real r_square = + x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; + dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, + dout_[idx] * x_[idx].real / r_square); + } + } + + const math::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; + +// angle grad for real +template +struct AngleGradFunctor>> { + AngleGradFunctor(const math::Real* dout, const T* x, T* dx, int64_t numel) + : dout_(dout), x_(x), dx_(dx), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } + + const math::Real* dout_; + const T* x_; + T* dx_; + int64_t numel_; +}; +} // namespace math + +using Tensor = framework::Tensor; +template +class AngleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + + auto numel = x->numel(); + auto* x_data = x->data(); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(math::Real))); + + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::AngleFunctor functor(x_data, out_data, numel); + for_range(functor); + } +}; + +template +class AngleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const framework::Tensor* d_out = + ctx.Input(framework::GradVarName("Out")); + const framework::Tensor* x = ctx.Input("X"); + framework::Tensor* d_x = + ctx.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + auto* dout_data = d_out->data>(); + auto* x_data = x->data(); + auto* dx_data = d_x->mutable_data( + ctx.GetPlace(), 
static_cast(numel * sizeof(T))); + + auto& dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::AngleGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h index 35de34086c57d0..e50b74133847cc 100644 --- a/paddle/fluid/platform/complex.h +++ b/paddle/fluid/platform/complex.h @@ -401,6 +401,16 @@ HOSTDEVICE inline T abs(const complex& a) { #endif } +template +HOSTDEVICE inline T arg(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::arg(thrust::complex(a)); +#else + return std::arg(std::complex(a)); +#endif +} + template HOSTDEVICE inline complex pow(const complex& a, const complex& b) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 73b186efed37b3..c37c331bae4a6e 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -226,6 +226,7 @@ from .tensor.math import rad2deg # noqa: F401 from .tensor.math import deg2rad # noqa: F401 from .tensor.math import diff # noqa: F401 +from .tensor.math import angle # noqa: F401 from .tensor.random import multinomial # noqa: F401 from .tensor.random import standard_normal # noqa: F401 @@ -537,5 +538,6 @@ 'einsum', 'set_flags', 'get_flags', - 'diff' + 'diff', + 'angle', ] diff --git a/python/paddle/fluid/tests/unittests/test_angle_op.py b/python/paddle/fluid/tests/unittests/test_angle_op.py new file mode 100644 index 00000000000000..05397c2434d8c8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_angle_op.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + +import paddle +from paddle.fluid import dygraph +from paddle import static +paddle.enable_static() + + +def angle_grad(x, dout): + if np.iscomplexobj(x): + + def angle_grad_element(xi, douti): + if xi == 0: + return 0 + rsquare = np.abs(xi)**2 + return -douti * xi.imag / rsquare + 1j * douti * xi.real / rsquare + + return np.vectorize(angle_grad_element)(x, dout) + else: + return np.zeros_like(x).astype(x.dtype) + + +class TestAngleOpFloat(OpTest): + def setUp(self): + self.op_type = "angle" + self.dtype = "float64" + self.x = np.linspace(-5, 5, 101).astype(self.dtype) + out_ref = np.angle(self.x) + self.inputs = {'X': self.x} + self.outputs = {'Out': out_ref} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[ + angle_grad(self.x, np.ones_like(self.x) / self.x.size) + ]) + + +class TestAngleOpComplex(OpTest): + def setUp(self): + self.op_type = "angle" + self.dtype = "complex128" + real = np.expand_dims(np.linspace(-2, 2, 11), -1).astype("float64") + imag = np.linspace(-2, 2, 11).astype("float64") + self.x = real + 1j * imag + out_ref = np.angle(self.x) + self.inputs = {'X': self.x} + self.outputs = {'Out': out_ref} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[ + angle_grad(self.x, np.ones_like(self.x) / self.x.size) + ]) + + +class TestAngleAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.randn(2, 3) + 1j * np.random.randn(2, 3) + self.out = np.angle(self.x) + + def test_dygraph(self): + with dygraph.guard(): + x = paddle.to_tensor(self.x) + out_np = paddle.angle(x).numpy() + self.assertTrue(np.allclose(self.out, out_np)) + + def test_static(self): + mp, sp = static.Program(), static.Program() + with static.program_guard(mp, sp): + x = static.data("x", shape=[2, 3], dtype="complex128") + out = paddle.angle(x) + + exe = static.Executor() + exe.run(sp) + [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out]) + self.assertTrue(np.allclose(self.out, out_np)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index d8acb946a6d976..7cc2c7623a9ff6 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -192,6 +192,7 @@ from .math import rad2deg # noqa: F401 from .math import deg2rad # noqa: F401 from .math import diff # noqa: F401 +from .math import angle # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -404,7 +405,8 @@ 'multi_dot', 'solve', 'triangular_solve', - 'diff' + 'diff', + 'angle', ] #this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 66c3eaece76d43..36d61fa08546bf 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -24,6 +24,7 @@ from paddle.common_ops_import import dygraph_utils from paddle.tensor import cast +from paddle.tensor.attribute import _complex_to_real_dtype import paddle from ..fluid import layers from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable, convert_np_dtype_to_dtype_ @@ -2884,3 +2885,57 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): out = layers.elementwise_sub(input_back, input_front, axis=axis) return out + + +def 
angle(x, name=None): + r""" + Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while + for negative real numbers, the angle is :math:`\pi`. + + Equation: + .. math:: + + angle(x)=arctan2(x.imag, x.real) + + Args: + x (Tensor): An N-D Tensor, the data type is complex64, complex128, or float32, float64 . + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): y (Tensor): An N-D Tensor of real data type with the same precision as that of x's data type. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-2, -1, 0, 1]).unsqueeze(-1).astype('float32') + y = paddle.to_tensor([-2, -1, 0, 1]).astype('float32') + z = x + 1j * y + print(z.numpy()) + # [[-2.-2.j -2.-1.j -2.+0.j -2.+1.j] + # [-1.-2.j -1.-1.j -1.+0.j -1.+1.j] + # [ 0.-2.j 0.-1.j 0.+0.j 0.+1.j] + # [ 1.-2.j 1.-1.j 1.+0.j 1.+1.j]] + + theta = paddle.angle(z) + print(theta.numpy()) + # [[-2.3561945 -2.6779451 3.1415927 2.6779451] + # [-2.0344439 -2.3561945 3.1415927 2.3561945] + # [-1.5707964 -1.5707964 0. 1.5707964] + # [-1.1071488 -0.7853982 0. 0.7853982]] + """ + + if in_dygraph_mode(): + return _C_ops.angle(x) + + check_variable_and_dtype(x, 'x', + ['float32', 'float64', 'complex64', 'complex128'], 'angle') + op_type = "angle" + helper = LayerHelper(op_type, **locals()) + inputs = {"X": x} + out = helper.create_variable_for_type_inference( + dtype=_complex_to_real_dtype(x.dtype)) + outputs = {"Out": out} + helper.append_op(type=op_type, inputs=inputs, outputs=outputs) + return out From 370864ddca4d1f54120dd1feefbfcfffe2919d82 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Wed, 1 Dec 2021 11:30:40 +0800 Subject: [PATCH 018/124] optimizer __call__ to make dygraph faster (#37713) * optimizer __call__ to make dygraph faster * fix return type --- python/paddle/fluid/dygraph/layers.py | 68 +++++++++++++++------------ 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 8ff960a90ea91d..662e233bd40d61 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -881,41 +881,49 @@ def clear_gradients(self): def _build_once(self, *args, **kwargs): pass + def _dygraph_call_func(self, *inputs, **kwargs): + for forward_pre_hook in self._forward_pre_hooks.values(): + hook_result = forward_pre_hook(self, inputs) + if hook_result is not None: + if not isinstance(hook_result, tuple): + hook_result = (hook_result, ) + inputs = hook_result + + if not self._built: + with program_desc_tracing_guard(False): + self._build_once(*inputs, **kwargs) + + # TODO(liuyuhui) Only xpu broadcast parameters here. + # The other device is to call _sync_params_buffers in DataParallel + # to realize the parameter synchronization among multiply cards. + if parallel_helper._is_data_parallel_mode( + ) and paddle.is_compiled_with_xpu(): + parallel_helper._broadcast_parameters( + self._parameters.values()) + + self._built = True + + outputs = self.forward(*inputs, **kwargs) + + for forward_post_hook in self._forward_post_hooks.values(): + hook_result = forward_post_hook(self, inputs, outputs) + if hook_result is not None: + outputs = hook_result + + return outputs + def __call__(self, *inputs, **kwargs): # NOTE(Aurelius84): Why we still need param_guard here? 
# In case of ControlFlow, true_fn and false_fn will contain # parameters that may not trigger logic of `Operator` to create # them. we add this to make sure all parameters is available. - with param_guard(self._parameters), param_guard(self._buffers): - for forward_pre_hook in self._forward_pre_hooks.values(): - hook_result = forward_pre_hook(self, inputs) - if hook_result is not None: - if not isinstance(hook_result, tuple): - hook_result = (hook_result, ) - inputs = hook_result - - if not self._built: - with program_desc_tracing_guard(False): - self._build_once(*inputs, **kwargs) - - # TODO(liuyuhui) Only xpu broadcast parameters here. - # The other device is to call _sync_params_buffers in DataParallel - # to realize the parameter synchronization among multiply cards. - if parallel_helper._is_data_parallel_mode( - ) and paddle.is_compiled_with_xpu(): - parallel_helper._broadcast_parameters( - self._parameters.values()) - - self._built = True - - outputs = self.forward(*inputs, **kwargs) - - for forward_post_hook in self._forward_post_hooks.values(): - hook_result = forward_post_hook(self, inputs, outputs) - if hook_result is not None: - outputs = hook_result - - return outputs + from paddle.fluid.dygraph.dygraph_to_static.program_translator import in_declarative_mode + + if in_declarative_mode() and not framework.in_dygraph_mode(): + with param_guard(self._parameters), param_guard(self._buffers): + return self._dygraph_call_func(*inputs, **kwargs) + else: + return self._dygraph_call_func(*inputs, **kwargs) def forward(self, *inputs, **kwargs): """ From f91e2331f071891f1e8650819f1a42f9460fd9f1 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Wed, 1 Dec 2021 11:31:18 +0800 Subject: [PATCH 019/124] open some import test in long_time_test (#37719) --- tools/windows/run_unittests.sh | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 4d6fa93c637445..a6aac30a374499 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -156,17 +156,12 @@ long_time_test="^test_gru_op$|\ ^test_activation_op$|\ ^test_bicubic_interp_v2_op$|\ ^test_bilinear_interp_v2_op$|\ -^test_conv_nn_grad$|\ ^test_crop_tensor_op$|\ ^test_cross_entropy2_op$|\ ^test_cross_op$|\ -^test_elementwise_div_op$|\ ^test_elementwise_nn_grad$|\ ^test_fused_elemwise_activation_op$|\ -^test_group_norm_op$|\ -^test_gru_unit_op$|\ ^test_imperative_lod_tensor_to_selected_rows$|\ -^test_imperative_optimizer$|\ ^test_imperative_selected_rows_to_lod_tensor$|\ ^test_layer_norm_op$|\ ^test_multiclass_nms_op$|\ @@ -175,8 +170,6 @@ long_time_test="^test_gru_op$|\ ^test_norm_nn_grad$|\ ^test_normal$|\ ^test_pool3d_op$|\ -^test_pool2d_op$|\ -^test_softmax_with_cross_entropy_op$|\ ^test_static_save_load$|\ ^test_trilinear_interp_op$|\ ^test_trilinear_interp_v2_op$|\ @@ -185,8 +178,6 @@ long_time_test="^test_gru_op$|\ ^test_sequence_conv$|\ ^test_sgd_op$|\ ^test_transformer$|\ -^test_lstmp_op$|\ -^test_conv2d_transpose_op$|\ ^test_imperative_auto_mixed_precision$|\ ^test_imperative_optimizer_v2$|\ ^test_strided_slice_op$" From 06c3cce990557a6db40309bb7b25b8c349b20427 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 1 Dec 2021 11:53:28 +0800 Subject: [PATCH 020/124] Handled dispensable tensors in AutoCodeGen for Eager Dygraph (#37723) --- .../auto_code_generator/eager_generator.cc | 264 +++++++++++++----- paddle/fluid/pybind/op_function_generator.cc | 104 +------ 
paddle/fluid/pybind/op_function_generator.h | 121 ++++++++ 3 files changed, 314 insertions(+), 175 deletions(-) create mode 100644 paddle/fluid/pybind/op_function_generator.h diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index c0714775da852c..136eaebe2cc4bf 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/pybind/op_function_generator.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" @@ -358,18 +359,149 @@ static bool CheckOpProto(proto::OpProto* op_proto) { return true; } +/* --------------------------------------- */ +/* --------- Preprocess Ins/Outs --------- */ +/* --------------------------------------- */ +static void PurifyOpProto( + const proto::OpProto& op_proto, + std::unordered_map* fwd_inputs_name_pos_map, + std::unordered_map* fwd_outputs_name_pos_map, + std::map* grad_outs_slotname_map, + std::map* grad_ins_fwd_slotname_map, + std::map* grad_ins_grad_slotname_map, + std::vector* in_vars, + std::vector* out_vars, + std::map>>* + grad_ins, + std::map>>* + grad_outs) { + // Op Name + const std::string op_name = op_proto.type(); + + // Handle dispensable inputs + for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::string input_name = input.name(); + + // Delete dispensable tensor unless specified in op_ins_map + if (input.dispensable()) { + if (!op_ins_map.count(op_name) || + !op_ins_map[op_name].count(input_name)) { + VLOG(6) << "Removing Dispensable Input: " << input_name; + + // in_vars + auto iter = in_vars->begin(); + for (iter = in_vars->begin(); iter != in_vars->end(); iter++) { + if (iter->name() == input_name) { + break; + } + } + in_vars->erase(iter); + + // grad_outs_slotname_map + auto grad_outs_slotname_map_purified = *grad_outs_slotname_map; + for (const auto& iter : *grad_outs_slotname_map) { + const std::string& grad_output_name = iter.first; + const std::string& matched_input_name = iter.second; + if (matched_input_name == input_name) { + grad_outs_slotname_map_purified.erase(grad_output_name); + + PADDLE_ENFORCE( + grad_outs->count(grad_output_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient output name in grad_outs.")); + // grad_outs + grad_outs->erase(grad_output_name); + } + } + *grad_outs_slotname_map = grad_outs_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(input_name)) + grad_ins_fwd_slotname_map->erase(input_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(input_name)) grad_ins->erase(input_name); + } + } + } + + for (const proto::OpProto::Var& output : op_proto.outputs()) { + std::string output_name = output.name(); + + // Delete dispensable tensor unless specified in op_outs_map + if (output.dispensable()) { + if (!op_outs_map.count(op_name) || + !op_outs_map[op_name].count(output_name)) { + VLOG(6) << "Removing Dispensable Output: " << output_name; + + // out_vars + auto iter = out_vars->begin(); + for (iter = out_vars->begin(); iter != out_vars->end(); iter++) { + if (iter->name() == output_name) { + break; + } + } + out_vars->erase(iter); + + // grad_ins_grad_slotname_map + auto grad_ins_grad_slotname_map_purified = 
*grad_ins_grad_slotname_map; + for (const auto& iter : *grad_ins_grad_slotname_map) { + const std::string& grad_input_name = iter.first; + const std::string& matched_output_name = iter.second; + if (matched_output_name == output_name) { + grad_ins_grad_slotname_map_purified.erase(grad_input_name); + + PADDLE_ENFORCE( + grad_ins->count(grad_input_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient input name in grad_ins.")); + // grad_ins + grad_ins->erase(grad_input_name); + } + } + *grad_ins_grad_slotname_map = grad_ins_grad_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(output_name)) + grad_ins_fwd_slotname_map->erase(output_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(output_name)) grad_ins->erase(output_name); + } + } + } + + /* ------ Maping forward slot name to fwd position ------ */ + size_t in_pos = 0; + for (const auto& var : *in_vars) { + VLOG(6) << "Mapping input tensor: " << var.name() + << " To position: " << in_pos; + (*fwd_inputs_name_pos_map)[var.name()] = in_pos; + in_pos++; + } + + size_t out_pos = 0; + for (const auto& var : *out_vars) { + VLOG(6) << "Mapping output tensor: " << var.name() + << " To position: " << out_pos; + (*fwd_outputs_name_pos_map)[var.name()] = out_pos; + out_pos++; + } +} + /* -------------------------------- */ /* --------- Collect Info --------- */ /* -------------------------------- */ static bool CollectInformationFromOpInfo( const paddle::framework::OpInfo& op_info, - std::vector* grad_node_default_attr_maps, std::vector* grad_op_types, - std::unordered_map* fwd_inputs_name_pos_map, - std::unordered_map* fwd_outputs_name_pos_map, std::map* grad_outs_slotname_map, std::map* grad_ins_fwd_slotname_map, std::map* grad_ins_grad_slotname_map, + std::vector* in_vars, + std::vector* out_vars, std::map>>* grad_ins, @@ -380,6 +512,13 @@ static bool CollectInformationFromOpInfo( const std::string& op_type = op_proto.type(); std::vector dims = {1, 1, 1, 1}; + for (const proto::OpProto::Var& input : op_proto.inputs()) { + in_vars->push_back(input); + } + for (const proto::OpProto::Var& output : op_proto.outputs()) { + out_vars->push_back(output); + } + /* ------ Prepare "ins" ------ */ std::map>> @@ -494,7 +633,6 @@ static bool CollectInformationFromOpInfo( for (auto iter = grad_node->begin(); iter < grad_node->end(); iter++) { // Each OpBase paddle::imperative::OpBase& op_base = *iter; - grad_node_default_attr_maps->push_back(op_base.DefaultAttrsMap()); grad_op_types->push_back(op_base.Type()); } @@ -538,22 +676,6 @@ static bool CollectInformationFromOpInfo( grad_outs_slotname_map); VLOG(6) << "Finished Slotname Matching for Grad_Outs"; - /* ------ Maping forward slot name to fwd position ------ */ - size_t in_pos = 0; - for (const auto& iter : ins) { - VLOG(6) << "Mapping input tensor: " << iter.first - << " To position: " << in_pos; - (*fwd_inputs_name_pos_map)[iter.first] = in_pos; - in_pos++; - } - size_t out_pos = 0; - for (const auto& iter : outs) { - VLOG(6) << "Mapping output tensor: " << iter.first - << " To position: " << out_pos; - (*fwd_outputs_name_pos_map)[iter.first] = out_pos; - out_pos++; - } - return true; } @@ -561,16 +683,13 @@ static bool CollectInformationFromOpInfo( /* --------- CodeGen: Forward GradNode Creation ------ */ /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( - const std::vector& - grad_node_default_attr_maps, const std::unordered_map& 
fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, const std::map& grad_ins_fwd_slotname_map, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating GradNode Creation codes"; - const std::string& op_type = op_proto.type(); - // [Generation] Construct GradOpNode // Run ComputeRequiredGrad @@ -578,7 +697,7 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -602,7 +721,7 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; @@ -636,8 +755,8 @@ static std::string GenerateGradNodeCreationContent( // [GradOpNode] Generation std::string grad_node_creation_str = ""; - size_t bwd_in_slot_num = op_proto.outputs().size(); - size_t bwd_out_slot_num = op_proto.inputs().size(); + size_t bwd_in_slot_num = out_vars.size(); + size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; @@ -669,7 +788,7 @@ static std::string GenerateGradNodeCreationContent( // [GradOpNode] SetGradOutMeta // [GradOpNode] Add Edges std::string compute_require_grad_args = "trace_backward"; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; compute_require_grad_args += ", &" + input_autograd_name; @@ -689,7 +808,7 @@ static std::string GenerateGradNodeCreationContent( // [AutogradMeta] SetOutRank // [AutogradMeta] SetHistory std::string pass_stop_gradient_args = "false"; - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; pass_stop_gradient_args += ", &" + output_autograd_name; @@ -743,8 +862,6 @@ static std::string AppendUseOp(const std::string& op_type) { /* --------- CodeGen: Forward ----- */ /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( - const std::vector& - grad_node_default_attr_maps, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, const std::map& grad_ins_fwd_slotname_map, @@ -758,7 +875,8 @@ static std::pair GenerateForwardFunctionContents( std::string, std::vector>>& grad_outs, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { /* // Forward Function Example: std::tuple, Tensor, vector> @@ -779,6 +897,7 @@ static std::pair 
GenerateForwardFunctionContents( ,ConstructDuplicableOutput(Out1Num)} }; // According to op_proto->attrs() + egr::legacy::RunOp("op_type", ins, outs, attr_map, Controller.Instance().GetExpectedPlace(), {}); @@ -795,8 +914,6 @@ static std::pair GenerateForwardFunctionContents( */ VLOG(6) << "Generating Dygraph Forward Function"; - const std::string& op_type = op_proto.type(); - std::string generated_function_body = ""; std::string dygraph_function_args_str = ""; @@ -806,8 +923,8 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Ins Map std::string ins_contents_str = ""; - std::vector input_args_str_list(op_proto.inputs().size()); - for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::vector input_args_str_list(in_vars.size()); + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); size_t input_position = fwd_inputs_name_pos_map.at(input_name); if (input.duplicable()) { @@ -848,7 +965,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; if (output.duplicable()) { @@ -898,17 +1015,17 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetExpectedPlace(),\n" " &default_attrs, true, {});\n"; std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_proto.type()); + paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); generated_function_body += trace_op_str; generated_function_body += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; // [Generation] Convert output VarBase to Vector/Tensor - size_t output_size = op_proto.outputs().size(); + size_t output_size = out_vars.size(); std::vector return_contents(output_size); std::vector return_types(output_size); - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string out_tensor_str; size_t return_position = fwd_outputs_name_pos_map.at(output_name); @@ -937,8 +1054,8 @@ static std::pair GenerateForwardFunctionContents( // [Generation] ComputeRequireGrad -> GradNodeCreation std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - grad_node_default_attr_maps, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, op_proto); + fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, + grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; VLOG(6) << "Generated GradNode Creation codes"; @@ -1004,8 +1121,6 @@ static std::pair GenerateForwardFunctionContents( /* --------- CodeGen: GradNode::operator() ------ */ /* ---------------------------------------------- */ static std::string GenerateGradNodeCCContents( - const std::vector& - grad_node_default_attr_maps, const std::vector& grad_op_types, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, @@ -1020,7 +1135,8 @@ static std::string GenerateGradNodeCCContents( std::string, std::vector>>& grad_outs, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating Grad Node CC"; /* [Outline] @@ -1066,7 +1182,6 @@ static std::string 
GenerateGradNodeCCContents( } */ - const std::string& op_type = op_proto.type(); std::string generated_grad_function_body = ""; // [Generation] Get Tracer @@ -1122,7 +1237,7 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Outs Map std::unordered_set duplicable_input_name_set; - for (const auto& in : op_proto.inputs()) { + for (const auto& in : in_vars) { if (in.duplicable()) duplicable_input_name_set.insert(in.name()); } @@ -1173,7 +1288,7 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Attrs Map std::string trace_opbase_str = ""; - for (size_t i = 0; i < grad_node_default_attr_maps.size(); i++) { + for (size_t i = 0; i < grad_op_types.size(); i++) { const std::string& op_base_type = grad_op_types[i]; const char* TRACE_OP_TEMPLATE = @@ -1230,10 +1345,9 @@ static std::string GenerateGradNodeCCContents( /* --------- CodeGen: GradNode Header ------ */ /* ----------------------------------------- */ static std::string GenerateGradNodeHeaderContents( - const std::vector& - grad_node_default_attr_maps, const std::map& grad_ins_fwd_slotname_map, - const proto::OpProto& op_proto) { + const std::string& op_type, const std::vector& in_vars, + const std::vector& out_vars) { VLOG(6) << "Generating Grad Node Header"; const char* GRAD_NODE_TEMPLATE = @@ -1261,8 +1375,6 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" "};"; - const std::string& op_type = op_proto.type(); - // [Generation] Handle Attributes std::string set_attr_map_str = " void SetAttrMap(paddle::framework::AttributeMap&& attr_map) {\n " @@ -1279,12 +1391,12 @@ static std::string GenerateGradNodeHeaderContents( // [Generation] Handle TensorWrappers std::unordered_set duplicable_tensors; - for (const proto::OpProto::Var& input : op_proto.inputs()) { + for (const proto::OpProto::Var& input : in_vars) { if (input.duplicable()) { duplicable_tensors.insert(input.name()); } } - for (const proto::OpProto::Var& output : op_proto.outputs()) { + for (const proto::OpProto::Var& output : out_vars) { if (output.duplicable()) { duplicable_tensors.insert(output.name()); } @@ -1454,13 +1566,12 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* ----------------------------- */ /* ---- Collect Information ---- */ /* ----------------------------- */ - std::vector grad_node_default_attr_maps; std::vector grad_op_types; - std::unordered_map fwd_inputs_name_pos_map; - std::unordered_map fwd_outputs_name_pos_map; std::map grad_outs_slotname_map; std::map grad_ins_fwd_slotname_map; std::map grad_ins_grad_slotname_map; + std::vector in_vars; + std::vector out_vars; std::map>> grad_ins; @@ -1470,13 +1581,20 @@ static void DygraphCodeGeneration(const std::string& output_dir) { VLOG(6) << "-------- CollectInformationFromOpInfo -------"; bool is_available = CollectInformationFromOpInfo( - op_info, &grad_node_default_attr_maps, &grad_op_types, - &fwd_inputs_name_pos_map, &fwd_outputs_name_pos_map, - &grad_outs_slotname_map, &grad_ins_fwd_slotname_map, - &grad_ins_grad_slotname_map, &grad_ins, &grad_outs); + op_info, &grad_op_types, &grad_outs_slotname_map, + &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, &in_vars, + &out_vars, &grad_ins, &grad_outs); if (!is_available) continue; + VLOG(6) << "-------- PurifyOpProto -------"; + std::unordered_map fwd_inputs_name_pos_map; + std::unordered_map fwd_outputs_name_pos_map; + PurifyOpProto(*op_proto, &fwd_inputs_name_pos_map, + &fwd_outputs_name_pos_map, &grad_outs_slotname_map, + &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, + 
&in_vars, &out_vars, &grad_ins, &grad_outs); + /* --------------------------- */ /* --------- CodeGen --------- */ /* --------------------------- */ @@ -1484,10 +1602,10 @@ static void DygraphCodeGeneration(const std::string& output_dir) { VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = GenerateForwardFunctionContents( - grad_node_default_attr_maps, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, - grad_outs, *op_proto); + fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, + grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, + grad_outs_slotname_map, grad_ins, grad_outs, op_type, in_vars, + out_vars); std::string fwd_function_str = body_and_declaration.first; GenerateForwardDygraphFile(op_type, output_dir, fwd_function_str); @@ -1498,16 +1616,16 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* ---- xxx_node.h ---- */ VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; std::string grad_node_h_str = GenerateGradNodeHeaderContents( - grad_node_default_attr_maps, grad_ins_fwd_slotname_map, *op_proto); + grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); GenerateNodeHFile(op_type, output_dir, grad_node_h_str); /* ---- xxx_node.cc ---- */ VLOG(6) << "-------- GenerateGradNodeCCContents -------"; std::string grad_node_cc_str = GenerateGradNodeCCContents( - grad_node_default_attr_maps, grad_op_types, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, - *op_proto); + grad_op_types, fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, + grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, + grad_outs_slotname_map, grad_ins, grad_outs, op_type, in_vars, + out_vars); GenerateNodeCCFile(op_type, output_dir, grad_node_cc_str); VLOG(6) << op_type << ": Finished Generation"; diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 850f208359e050..749782f2413e5d 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/pybind/op_function_generator.h" + #include #include #include @@ -30,108 +32,6 @@ #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif -// NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are -// determined by the OP`s proto automatically, i.e., all the inputs registered -// in OpMaker. -// However, some OPs have dispensable inputs, which means the input can -// be none for some conditions. It is discovered that most dispensable inputs -// is not used in imperative mode, so we drop those inputs when generating OP -// functions. While, for very few OPs, the dispensable inputs are used, we -// need to manually specify them in this map. 
-std::map> op_ins_map = { - {"layer_norm", {"X", "Scale", "Bias"}}, - {"bincount", {"X", "Weights"}}, - {"fused_attention", - {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW", - "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, - {"instance_norm", {"X", "Scale", "Bias"}}, - {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, - {"label_smooth", {"X", "PriorDist"}}, - {"assign", {"X"}}, - {"reshape2", {"X", "Shape"}}, - {"expand", {"X", "ExpandTimes"}}, - {"slice", {"Input", "StartsTensor", "EndsTensor"}}, - {"fake_quantize_dequantize_moving_average_abs_max", - {"X", "InScale", "InAccum", "InState"}}, - {"nll_loss", {"X", "Label", "Weight"}}, - {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, - {"gather", {"X", "Index", "Axis"}}, - {"roi_pool", {"X", "ROIs", "RoisNum"}}, - {"roi_align", {"X", "ROIs", "RoisNum"}}, - {"psroi_pool", {"X", "ROIs", "RoisNum"}}, - {"collect_fpn_proposals", - {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, - {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, - {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, - {"hierarchical_sigmoid", - {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, - {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, - {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, - {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, - {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, - {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, - {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, - {"run_program", {"X", "Params"}}, - {"fused_feedforward", - {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", - "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, - {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, - {"matrix_rank", {"X", "TolTensor"}}, - {"adam", - {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", - "Beta2Pow", "MasterParam"}}, - {"adamw", - {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", - "Beta2Pow", "MasterParam"}}, -}; - -// NOTE(zhiqiu): Like op_ins_map. -// Commonly, the outputs in auto-generated OP function are determined by the -// OP`s proto automatically, i.e., all the outputs registered in OpMaker. -// However, some OPs have dispensable outputs, which means the output can -// be none for some conditions. It is discovered that most dispensable outputs -// is not used in imperative mode, so we drop those outputs when generating OP -// functions. While, for very few OPs, the dispensable outputs are used, we -// need to manually specify them in this map. 
-std::map> op_outs_map = { - {"fake_quantize_dequantize_moving_average_abs_max", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"batch_norm", - {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", - "ReserveSpace"}}, - {"fused_attention", - {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", - "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut", - "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean", - "Ln2Variance", "BiasDropoutResidualOut", "Y"}}, - {"sync_batch_norm", - {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", - "ReserveSpace"}}, - {"unique", {"Out", "Index", "Indices", "Counts"}}, - {"unique_consecutive", {"Out", "Index", "Counts"}}, - {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, - {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, - {"matrix_nms", {"Out", "Index", "RoisNum"}}, - {"distribute_fpn_proposals", - {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, - {"moving_average_abs_max_scale", - {"Out", "OutScale", "OutAccum", "OutState"}}, - {"multiclass_nms3", {"Out", "NmsRoisNum"}}, - {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, - {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, - {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, - {"lamb", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, - {"run_program", {"DOut"}}, - {"adam", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, - {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", - "MasterParamOut"}}, -}; - // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are // generated in C++ automatically. // However, some OPs need to pass the outputs from Python instead of generating diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h new file mode 100644 index 00000000000000..ad7fa780976d7d --- /dev/null +++ b/paddle/fluid/pybind/op_function_generator.h @@ -0,0 +1,121 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +// NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are +// determined by the OP`s proto automatically, i.e., all the inputs registered +// in OpMaker. +// However, some OPs have dispensable inputs, which means the input can +// be none for some conditions. It is discovered that most dispensable inputs +// is not used in imperative mode, so we drop those inputs when generating OP +// functions. While, for very few OPs, the dispensable inputs are used, we +// need to manually specify them in this map. 
+std::map> op_ins_map = { + {"layer_norm", {"X", "Scale", "Bias"}}, + {"bincount", {"X", "Weights"}}, + {"fused_attention", + {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "SrcMask", "OutLinearW", + "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, + {"instance_norm", {"X", "Scale", "Bias"}}, + {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, + {"label_smooth", {"X", "PriorDist"}}, + {"assign", {"X"}}, + {"reshape2", {"X", "Shape"}}, + {"expand", {"X", "ExpandTimes"}}, + {"slice", {"Input", "StartsTensor", "EndsTensor"}}, + {"fake_quantize_dequantize_moving_average_abs_max", + {"X", "InScale", "InAccum", "InState"}}, + {"nll_loss", {"X", "Label", "Weight"}}, + {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, + {"gather", {"X", "Index", "Axis"}}, + {"roi_pool", {"X", "ROIs", "RoisNum"}}, + {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"psroi_pool", {"X", "ROIs", "RoisNum"}}, + {"collect_fpn_proposals", + {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, + {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, + {"hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, + {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, + {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, + {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, + {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, + {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, + {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, + {"run_program", {"X", "Params"}}, + {"fused_feedforward", + {"Dropout1Seed", "Dropout2Seed", "Linear1Bias", "Linear2Bias", "Ln1Scale", + "Ln1Bias", "Ln2Scale", "Ln2Bias"}}, + {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, + {"matrix_rank", {"X", "TolTensor"}}, + {"adam", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, + {"adamw", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, +}; + +// NOTE(zhiqiu): Like op_ins_map. +// Commonly, the outputs in auto-generated OP function are determined by the +// OP`s proto automatically, i.e., all the outputs registered in OpMaker. +// However, some OPs have dispensable outputs, which means the output can +// be none for some conditions. It is discovered that most dispensable outputs +// is not used in imperative mode, so we drop those outputs when generating OP +// functions. While, for very few OPs, the dispensable outputs are used, we +// need to manually specify them in this map. 
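As the notes above describe, the auto-generated op functions normally expose every input registered in an op's proto except the dispensable ones, and an entry in op_ins_map overrides that default with an explicit argument list. A minimal sketch of how a generator could consult such a map follows; GeneratedInputArgs and ProtoInput are illustrative names, not the actual generator code. The op_outs_map defined next applies the same convention to outputs.

// Illustrative only: choosing the input argument list for one op from its
// proto inputs plus the hand-written override map.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct ProtoInput {
  std::string name;
  bool dispensable;
};

std::vector<std::string> GeneratedInputArgs(
    const std::string& op_type, const std::vector<ProtoInput>& proto_inputs,
    const std::map<std::string, std::set<std::string>>& ins_map) {
  std::vector<std::string> args;
  auto it = ins_map.find(op_type);
  for (const auto& in : proto_inputs) {
    // Keep an input if the override map lists it explicitly, or, when the op
    // has no override entry, if the input is not dispensable.
    bool keep = (it != ins_map.end()) ? (it->second.count(in.name) > 0)
                                      : !in.dispensable;
    if (keep) args.push_back(in.name);
  }
  return args;
}

int main() {
  std::map<std::string, std::set<std::string>> ins_map = {
      {"label_smooth", {"X", "PriorDist"}}};
  // PriorDist is dispensable, but label_smooth has an override entry, so the
  // generated function keeps it.
  for (const auto& arg : GeneratedInputArgs(
           "label_smooth", {{"X", false}, {"PriorDist", true}}, ins_map)) {
    std::cout << arg << "\n";
  }
  return 0;
}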
+std::map> op_outs_map = { + {"fake_quantize_dequantize_moving_average_abs_max", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, + {"fused_attention", + {"LnMean", "LnVariance", "LnOut", "QKVOut", "QKVBiasOut", "TransposeOut2", + "QKOut", "QKTVOut", "SoftmaxOut", "AttnDropoutMaskOut", "AttnDropoutOut", + "SrcMaskOut", "FMHAOut", "OutLinearOut", "DropoutMaskOut", "Ln2Mean", + "Ln2Variance", "BiasDropoutResidualOut", "Y"}}, + {"sync_batch_norm", + {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", + "ReserveSpace"}}, + {"unique", {"Out", "Index", "Indices", "Counts"}}, + {"unique_consecutive", {"Out", "Index", "Counts"}}, + {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"matrix_nms", {"Out", "Index", "RoisNum"}}, + {"distribute_fpn_proposals", + {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, + {"multiclass_nms3", {"Out", "NmsRoisNum"}}, + {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, + {"lamb", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"run_program", {"DOut"}}, + {"adam", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, +}; From cc47c83caecccd2b660991bfaa09552017cbc0bf Mon Sep 17 00:00:00 2001 From: Jason Date: Wed, 1 Dec 2021 13:21:23 +0800 Subject: [PATCH 021/124] fix fc_fuse pass (#37694) * fix fc_fuse * modify cmake notest,test=windows_ci * retrigger all the ci --- paddle/fluid/framework/ir/fc_fuse_pass.cc | 26 ++++++++++++++++++ .../fluid/framework/ir/fc_fuse_pass_tester.cc | 8 +++--- .../unittests/ir/inference/CMakeLists.txt | 5 +++- .../ir/inference/test_fc_fuse_pass.py | 27 ++++++++++++++++--- 4 files changed, 57 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index bb78cdab677526..e246a10961c0c6 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -130,6 +130,32 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + + // Only support 2D-Tensor as weight for FC + std::vector w_shape = w->Var()->GetShape(); + size_t w_rank = w_shape.size(); + if (w_rank != 2) return; + + // axis of elementwise_add should be -1 or x_num_col_dims + auto x_num_col_dims = + BOOST_GET_CONST(int, mul->Op()->GetAttr("x_num_col_dims")); + auto axis = BOOST_GET_CONST(int, elementwise_add->Op()->GetAttr("axis")); + if (axis != -1 && axis != x_num_col_dims) return; + + // Shape of bias should be [1, out_size] or [out_size] + std::vector b_shape = bias->Var()->GetShape(); + if (b_shape.size() == 1) { + if (b_shape[0] != w_shape[1]) { + return; + } + } else if (b_shape.size() == 2) { + if (b_shape[0] != 1 || b_shape[1] != w_shape[1]) { + return; + } + } else { + return; + } + Node* relu = nullptr; Node* relu_out = nullptr; if (with_relu) { 
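With the guards added above, the pass only rewrites mul + elementwise_add (+ relu) into fc when the weight is a 2-D tensor, the add broadcasts along the weight's output dimension, and the bias is shaped [out_size] or [1, out_size]. A standalone sketch of that predicate, under an illustrative name (IsFusableFC), matches the concrete shapes the updated tester below now feeds in:

// Sketch of the shape/axis constraints checked before fusing; IsFusableFC is
// an illustrative helper, not part of the pass itself.
#include <cassert>
#include <cstdint>
#include <vector>

bool IsFusableFC(const std::vector<int64_t>& w_shape,
                 const std::vector<int64_t>& b_shape, int axis,
                 int x_num_col_dims) {
  if (w_shape.size() != 2) return false;                     // 2-D weight only
  if (axis != -1 && axis != x_num_col_dims) return false;    // broadcast axis
  if (b_shape.size() == 1) return b_shape[0] == w_shape[1];  // [out_size]
  if (b_shape.size() == 2)
    return b_shape[0] == 1 && b_shape[1] == w_shape[1];      // [1, out_size]
  return false;
}

int main() {
  assert(IsFusableFC({5, 4}, {4}, 1, 1));     // weights_0 / bias_1 below
  assert(IsFusableFC({8, 9}, {1, 9}, 1, 1));  // weights_1 / bias_2 below
  assert(!IsFusableFC({5, 4}, {5}, -1, 1));   // bias not broadcastable
  return 0;
}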
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 5046911036818c..39b544e7160796 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -55,14 +55,14 @@ TEST(FCFusePass, basic) { auto* bias_0 = layers.data("conv2d_bias_0", {}, true); auto* conv2d_out = layers.conv2d(a, filters_0, bias_0, false); auto* relu_out_0 = layers.relu(conv2d_out); - auto* weights_0 = layers.data("weights_0", {}, true); + auto* weights_0 = layers.data("weights_0", {5, 4}, true); auto* mul_out_0 = layers.mul(relu_out_0, weights_0); - auto* bias_1 = layers.data("bias_1", {}, true); + auto* bias_1 = layers.data("bias_1", {4}, true); auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1); auto* relu_out_1 = layers.relu(add_out_0); - auto* weights_1 = layers.data("weights_1", {}, true); + auto* weights_1 = layers.data("weights_1", {8, 9}, true); auto* mul_out_1 = layers.mul(relu_out_1, weights_1); - auto* bias_2 = layers.data("bias_2", {}, true); + auto* bias_2 = layers.data("bias_2", {1, 9}, true); auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1); VLOG(4) << add_out_1; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 0b127d2a11f768..4126e604cc1f63 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -71,8 +71,11 @@ set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30) + +if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 240) +endif() if (WITH_MKLDNN) set_tests_properties(test_mkldnn_prelu_op PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py index 1db3a007131aaf..dccc29e75f0367 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py @@ -46,6 +46,17 @@ def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_gpu=True) yield config, ["fc"], (1e-5, 1e-5) + # trt static_shape + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=8, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + yield config, ['fc'], (1e-5, 1e-5) + def add_ignore_pass_case(self): # Here we put some skip rules to avoid known bugs def teller1(program_config, predictor_config): @@ -53,14 +64,22 @@ def teller1(program_config, predictor_config): x_shape = list(program_config.inputs["mul_x"].shape) y_shape = list(program_config.weights["mul_y"].shape) bias_shape = program_config.weights["bias"].shape - if (bias_shape != [y_shape[-1], ] and - bias_shape != [1, y_shape[-1]]): + bias_shape = list(program_config.weights["bias"].shape) + + if 
predictor_config.tensorrt_engine_enabled(): + # TensorRT cann't handle all the situation of elementwise_add + # disable it until this problem fixed + predictor_config.exp_disable_tensorrt_ops(["elementwise_add"]) + + if bias_shape != [y_shape[-1]] and bias_shape != [1, y_shape[-1]]: return True return False def teller2(program_config, predictor_config): # TODO fuse has bug while axis != -1 - if program_config.ops[1].attrs["axis"] != -1: + axis = program_config.ops[1].attrs["axis"] + if axis != -1 and axis != program_config.ops[0].attrs[ + "x_num_col_dims"]: return True return False @@ -164,7 +183,7 @@ def sample_program_config(self, draw): def test(self): self.run_and_statis( - quant=False, max_examples=300, passes=["fc_fuse_pass"]) + quant=False, max_examples=500, passes=["fc_fuse_pass"]) if __name__ == "__main__": From e0fc893764d08446ea0a64e0f2467d4927e52f30 Mon Sep 17 00:00:00 2001 From: TTerror Date: Wed, 1 Dec 2021 13:22:08 +0800 Subject: [PATCH 022/124] add prior_box for kunlun (#37697) * add prior_box for kunlun * update * update CMakeLists --- .../fluid/operators/detection/CMakeLists.txt | 21 +- .../operators/detection/prior_box_op_xpu.cc | 108 ++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 1 + .../unittests/xpu/test_prior_box_op_xpu.py | 201 ++++++++++++++++++ 4 files changed, 321 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/operators/detection/prior_box_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 08c44a2d39ecf0..a85bca36464990 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -18,11 +18,20 @@ endfunction() if (WITH_ASCEND_CL) detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc) else() detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) - detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +endif() + +if(WITH_XPU) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_xpu.cc) +elseif(WITH_ASCEND_CL) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu prior_box_op_npu.cc) +else() + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) endif() detection_library(bipartite_match_op SRCS bipartite_match_op.cc) @@ -63,14 +72,6 @@ else() detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() -if(WITH_XPU) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) -elseif(WITH_ASCEND_CL) - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) -else() - detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) -endif() - detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) 
#Export local libraries to parent # set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/prior_box_op_xpu.cc b/paddle/fluid/operators/detection/prior_box_op_xpu.cc new file mode 100644 index 00000000000000..bab394689546e4 --- /dev/null +++ b/paddle/fluid/operators/detection/prior_box_op_xpu.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/detection/prior_box_op.h" + +namespace paddle { +namespace operators { + +template +class PriorBoxOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); + + K step_w = static_cast(ctx.Attr("step_w")); + K step_h = static_cast(ctx.Attr("step_h")); + K offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + K step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + const auto& dev_ctx = + ctx.template device_context(); + auto boxes_data = boxes->data(); + auto vars_data = vars->data(); + xpu::VectorParam aspect_ratios_param{ + aspect_ratios.data(), static_cast(aspect_ratios.size()), nullptr}; + xpu::VectorParam min_sizes_param{ + min_sizes.data(), static_cast(min_sizes.size()), nullptr}; + xpu::VectorParam max_sizes_param{ + max_sizes.data(), static_cast(max_sizes.size()), nullptr}; + + int ret = xpu::gen_prior_box( + dev_ctx.x_context(), boxes_data, aspect_ratios_param, min_sizes_param, + max_sizes_param, feature_height, feature_width, img_height, img_width, + offset, step_height, step_width, clip, min_max_aspect_ratios_order); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU gen_prior_box kernel return wrong value[%d %s]", + ret, XPUAPIErrorMsg[ret])); + + int box_num = feature_height * feature_width * num_priors; + int vlen = variances.size(); + for (int i = 0; i < box_num; ++i) { + ret = 
xpu_memcpy(vars_data + i * vlen, variances.data(), vlen * sizeof(K), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External( + "XPU xpu_memcpy return wrong " + "value[%d %s] in prior_box.", + ret, XPUAPIErrorMsg[ret])); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(prior_box, ops::PriorBoxOpXPUKernel); + +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 78fc53cfc8535e..636b27e051122a 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -289,6 +289,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py new file mode 100644 index 00000000000000..44137f4718743c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py @@ -0,0 +1,201 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
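For orientation while reading the XPU kernel above and the Python reference implementation that follows: the host side only derives the per-cell prior count and the step sizes, then hands the actual box generation to xpu::gen_prior_box. A compact, illustrative computation of those derived quantities, using the same parameters the test below sets up:

// Illustrative only: the quantities prior_box derives before generating boxes.
#include <cstdio>
#include <vector>

int main() {
  // Expanded aspect ratios (flip=true turns {2, 3} into {1, 2, 1/2, 3, 1/3}).
  std::vector<float> aspect_ratios = {1.f, 2.f, 0.5f, 3.f, 1.f / 3.f};
  std::vector<float> min_sizes = {2.f, 4.f};
  std::vector<float> max_sizes = {5.f, 10.f};

  // One prior per (aspect_ratio, min_size) pair, plus one per max_size.
  int num_priors = static_cast<int>(aspect_ratios.size() * min_sizes.size() +
                                    max_sizes.size());

  // step_w / step_h fall back to image extent over feature-map extent when
  // the attributes are left at zero.
  int img_w = 40, img_h = 40, feat_w = 32, feat_h = 32;
  float step_w = static_cast<float>(img_w) / feat_w;
  float step_h = static_cast<float>(img_h) / feat_h;

  std::printf("num_priors=%d step=(%.3f, %.3f) total_boxes=%d\n", num_priors,
              step_w, step_h, feat_h * feat_w * num_priors);
  return 0;
}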
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import math +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest + +paddle.enable_static() + + +class TestPriorBoxOp(XPUOpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'min_sizes': self.min_sizes, + 'aspect_ratios': self.aspect_ratios, + 'variances': self.variances, + 'flip': self.flip, + 'clip': self.clip, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset + } + if len(self.max_sizes) > 0: + self.attrs['max_sizes'] = self.max_sizes + + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + pass + + def setUp(self): + self.op_type = "prior_box" + self.use_xpu = True + self.set_data() + + def set_max_sizes(self): + max_sizes = [5, 10] + self.max_sizes = np.array(max_sizes).astype('float32').tolist() + + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = False + + def init_test_params(self): + self.layer_w = 32 + self.layer_h = 32 + + self.image_w = 40 + self.image_h = 40 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.min_sizes = [2, 4] + self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() + self.set_max_sizes() + self.aspect_ratios = [2.0, 3.0] + self.flip = True + self.set_min_max_aspect_ratios_order() + self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] + self.aspect_ratios = np.array( + self.aspect_ratios, dtype=np.float).flatten() + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.clip = True + self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) + if len(self.max_sizes) > 0: + self.num_priors += len(self.max_sizes) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, + self.image_h)).astype('float32') + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, + self.layer_h)).astype('float32') + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype('float32') + out_var = np.zeros(out_dim).astype('float32') + + idx = 0 + for h in range(self.layer_h): + for w in range(self.layer_w): + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + idx = 0 + for s in range(len(self.min_sizes)): + min_size = self.min_sizes[s] + if not self.min_max_aspect_ratios_order: + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - 
c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + else: + c_w = c_h = min_size / 2. + out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h] + idx += 1 + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if abs(ar - 1.) < 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # clip the prior's coordidate such that it is within[0, 1] + if self.clip: + out_boxes = np.clip(out_boxes, 0.0, 1.0) + # set the variance. + out_var = np.tile(self.variances, (self.layer_h, self.layer_w, + self.num_priors, 1)) + self.out_boxes = out_boxes.astype('float32') + self.out_var = out_var.astype('float32') + + +class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): + def set_max_sizes(self): + self.max_sizes = [] + + +class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = True + + +if __name__ == '__main__': + unittest.main() From feda7c1d678c87562190db0b790d0eae02d3b98e Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Wed, 1 Dec 2021 14:21:18 +0800 Subject: [PATCH 023/124] HostEventRecorder (#37629) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. * update HostEventTracer * update HostEventTracer * fix c++17 * update * update * update * update * fix bug Co-authored-by: liutiexing --- .../framework/new_executor/interpretercore.cc | 2 +- paddle/fluid/platform/profiler.cc | 381 +++++++++++++++++- paddle/fluid/platform/profiler.h | 28 +- paddle/fluid/platform/profiler_helper.h | 2 + 4 files changed, 397 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 94b2118ba9d73a..f954b297510071 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -514,7 +514,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); auto* op = instr_node.OpBase(); - platform::RecordEvent instruction_event(op->Type()); + platform::RecordEvent instruction_event(op->Type().c_str()); interpreter::WaitEvent(instr_node, place_); try { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 40d9bb99f44f55..f6d9c8f64fd35a 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include // NOLINT #include +#include #include +#include #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" @@ -30,6 +32,290 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, namespace paddle { namespace platform { +struct DurationEvent { + public: + DurationEvent(const char *name, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {} + + DurationEvent(std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role, const std::string &attr_str) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + buf = static_cast(arena_allocator(attr_str.length() + 1)); + strncpy(buf, attr_str.c_str(), attr_str.length() + 1); + attr = buf; + } + + DurationEvent(const std::function &arena_allocator, + const std::string &name_str, uint64_t start_ns, uint64_t end_ns, + EventRole role) + : start_ns(start_ns), end_ns(end_ns), role(role) { + auto buf = static_cast(arena_allocator(name_str.length() + 1)); + strncpy(buf, name_str.c_str(), name_str.length() + 1); + name = buf; + } + + const char *name = nullptr; // not owned, designed for performance + uint64_t start_ns = 0; + uint64_t end_ns = 0; + EventRole role = EventRole::kOrdinary; + const char *attr = nullptr; // not owned, designed for performance +}; + +template +struct ContainsStdString + : std::conditional_t< + std::is_same>>::value, + std::true_type, ContainsStdString> {}; + +template +struct ContainsStdString + : std::is_same>> {}; + +template +class EventContainer { + public: + EventContainer() { + event_blocks_ = cur_event_block_ = new EventBlock; + str_blocks_ = cur_str_block_ = new StringBlock; + } + ~EventContainer() { + Reduce(); + delete event_blocks_; + for (auto cur = str_blocks_; cur != nullptr;) { + auto next = cur->next; + delete cur; + cur = next; + } + } + DISABLE_COPY_AND_ASSIGN(EventContainer); + + public: + // Record an event + template + void Record(Args &&... args) { + DoRecord(ContainsStdString(), std::forward(args)...); + } + + // Get all events and clear the container + std::vector Reduce(); + + // Return a buffer to store the string attribute of Event. + // HostEventRecorder locates in the static data section. + // So it's safe to use arena to avoid fragmented allocations. 
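The comments above describe the storage strategy: because the recorder lives for the whole process, event names and attributes are copied into a chained bump arena rather than individually heap-allocated, which avoids fragmentation and keeps recording cheap. A reduced, self-contained sketch of that idea follows (illustrative only; the real container also stores typed events and defers their construction). GetStrBufFromArena, defined just below, hands out string storage in this fashion.

// Illustrative chained bump arena, mirroring the StringBlock scheme: each
// allocation is a pointer bump; a fresh block is linked in when the current
// one cannot hold the request; everything is freed at once in the destructor.
#include <cstddef>
#include <cstring>

class BumpArena {
 public:
  static constexpr size_t kBlockSize = 1 << 22;  // 4 MB per block

  BumpArena() : head_(new Block) {}
  ~BumpArena() {
    for (Block* b = head_; b != nullptr;) {
      Block* next = b->next;
      delete b;
      b = next;
    }
  }
  BumpArena(const BumpArena&) = delete;
  BumpArena& operator=(const BumpArena&) = delete;

  char* Allocate(size_t size) {
    if (head_->offset + size > sizeof(head_->storage)) {  // chain a new block
      Block* fresh = new Block;
      fresh->next = head_;
      head_ = fresh;
    }
    char* p = head_->storage + head_->offset;
    head_->offset += size;
    return p;
  }

  const char* CopyString(const char* s) {
    size_t len = std::strlen(s) + 1;
    char* buf = Allocate(len);
    std::memcpy(buf, s, len);
    return buf;
  }

 private:
  struct Block {
    size_t offset = 0;
    Block* next = nullptr;
    char storage[kBlockSize - sizeof(size_t) - sizeof(Block*)];
  };
  Block* head_;
};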
+ char *GetStrBufFromArena(size_t size) { return GetStringStorage(size); } + + private: + struct EventBlock { + union InitDeferedEvent { + InitDeferedEvent() {} + ~InitDeferedEvent() {} + + EventType event; + }; + + static constexpr size_t kBlockSize = 1 << 24; // 16 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + static constexpr size_t kNumEvents = kAvailSize / sizeof(InitDeferedEvent); + static constexpr size_t kPadSize = + kAvailSize - kNumEvents * sizeof(InitDeferedEvent); + static constexpr size_t kMinimumEventsPerBlock = 1024; + static_assert( + kNumEvents >= kMinimumEventsPerBlock, + "EventType is too large for kBlockSize, make kBlockSize larger"); + + size_t offset = 0; + EventBlock *next = nullptr; + InitDeferedEvent events[kNumEvents]; + char padding[kPadSize]; + }; + static_assert(sizeof(EventBlock) == EventBlock::kBlockSize, + "sizeof EventBlock must equal to kBlockSize"); + + struct StringBlock { + static constexpr size_t kBlockSize = 1 << 22; // 4 MB + static constexpr size_t kAvailSize = + kBlockSize - sizeof(size_t) - sizeof(nullptr); + + size_t offset = 0; + StringBlock *next = nullptr; + char storage[kAvailSize]; + }; + static_assert(sizeof(StringBlock) == StringBlock::kBlockSize, + "sizeof StringBlock must equal to kBlockSize"); + + // Record an event with string arguments + template + void DoRecord(std::true_type, Args &&... args) { + auto *storage = GetEventStorage(); + std::function allocator = [this](size_t size) { + return GetStrBufFromArena(size); + }; + new (storage) EventType(allocator, std::forward(args)...); + } + + // Record an event without any string argument + template + void DoRecord(std::false_type, Args &&... args) { + auto *storage = GetEventStorage(); + new (storage) EventType(std::forward(args)...); + } + + EventType *GetEventStorage(); + + char *GetStringStorage(size_t sz); + + EventBlock *event_blocks_ = nullptr; + EventBlock *cur_event_block_ = nullptr; + StringBlock *str_blocks_ = nullptr; + StringBlock *cur_str_block_ = nullptr; +}; + +template +std::vector EventContainer::Reduce() { + std::vector all_events; + size_t event_cnt = 0; + for (auto cur = event_blocks_; cur != nullptr; cur = cur->next) { + event_cnt += cur->offset; + } + all_events.reserve(event_cnt); + for (auto cur = event_blocks_; cur != nullptr;) { + for (size_t i = 0; i < cur->offset; ++i) { + all_events.emplace_back(cur->events[i].event); + } + auto next = cur->next; + delete cur; + cur = next; + } + event_blocks_ = cur_event_block_ = new EventBlock; + return std::move(all_events); +} + +template +EventType *EventContainer::GetEventStorage() { + if (UNLIKELY(cur_event_block_->offset >= + EventBlock::kNumEvents)) { // another block + cur_event_block_->next = new EventBlock; + cur_event_block_ = cur_event_block_->next; + } + auto &obj = cur_event_block_->events[cur_event_block_->offset].event; + ++cur_event_block_->offset; + return &obj; +} + +template +char *EventContainer::GetStringStorage(size_t sz) { + if (UNLIKELY(cur_str_block_->offset + sz > + StringBlock::kAvailSize)) { // another block + cur_str_block_->next = new StringBlock; + cur_str_block_ = cur_str_block_->next; + } + char *storage = cur_str_block_->storage + cur_str_block_->offset; + cur_str_block_->offset += sz; + return storage; +} + +struct ThreadEventSection { + std::string thread_name; + uint64_t thread_id; + std::vector events; +}; + +class ThreadEventRecorder { + public: + ThreadEventRecorder(); + DISABLE_COPY_AND_ASSIGN(ThreadEventRecorder); + + public: + // 
Forward call to EventContainer::Record + template + void RecordEvent(Args &&... args) { + base_evt_cntr_.Record(std::forward(args)...); + } + + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; + thr_sec.thread_name = thread_name_; + thr_sec.thread_id = thread_id_; + thr_sec.events = std::move(base_evt_cntr_.Reduce()); + return std::move(thr_sec); + } + + private: + uint64_t thread_id_; + std::string thread_name_; + EventContainer base_evt_cntr_; +}; + +struct HostEventSection { + std::string process_name; + uint64_t process_id; + std::vector thr_sections; +}; + +class HostEventRecorder { + public: + // singleton + static HostEventRecorder &GetInstance() { + static HostEventRecorder instance; + return instance; + } + + // If your string argument has a longer lifetime than the Event, + // use 'const char*'. e.g.: string literal, op name, etc. + // Do your best to avoid using 'std::string' as the argument type. + // It will cause deep-copy to harm performance. + template + void RecordEvent(Args &&... args) { + GetThreadLocalRecorder().RecordEvent(std::forward(args)...); + } + + // Poor performance, call it at the ending + HostEventSection GatherEvents(); + + void RegisterThreadRecorder(uint64_t tid, ThreadEventRecorder *recorder) { + const std::lock_guard guard(thread_recorders_lock_); + thread_recorders_[tid] = recorder; + } + + private: + HostEventRecorder() = default; + DISABLE_COPY_AND_ASSIGN(HostEventRecorder); + + ThreadEventRecorder &GetThreadLocalRecorder() { + static thread_local ThreadEventRecorder tls_recorder; + return tls_recorder; + } + + std::mutex thread_recorders_lock_; + std::unordered_map thread_recorders_; +}; + +static uint64_t GetThreadId() { + return std::hash{}(std::this_thread::get_id()); +} + +ThreadEventRecorder::ThreadEventRecorder() { + thread_id_ = GetThreadId(); + HostEventRecorder::GetInstance().RegisterThreadRecorder(thread_id_, this); +} + +HostEventSection HostEventRecorder::GatherEvents() { + HostEventSection host_sec; + host_sec.thr_sections.reserve(thread_recorders_.size()); + for (auto &kv : thread_recorders_) { + host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents())); + } + return std::move(host_sec); +} + MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id, @@ -57,8 +343,44 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } +RecordEvent::RecordEvent(const char *name, const EventRole role) { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook) { + dynload::nvtxRangePushA(name); + is_pushed_ = true; + } +#endif +#endif + if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + RecordEvent(name, role, "none"); + return; + } + shallow_copy_name_ = name; + role_ = role; + start_ns_ = PosixInNsec(); +} + +RecordEvent::RecordEvent(const std::string &name, const EventRole role) { +#ifndef _WIN32 +#ifdef PADDLE_WITH_CUDA + if (g_enable_nvprof_hook) { + dynload::nvtxRangePushA(name.c_str()); + is_pushed_ = true; + } +#endif +#endif + if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + RecordEvent(name, role, "none"); + return; + } + name_ = new std::string(name); + role_ = role; + start_ns_ = PosixInNsec(); +} + RecordEvent::RecordEvent(const std::string &name, const EventRole role, - const std::string attr) { + const std::string &attr) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -67,17 +389,26 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, } #endif #endif + if 
(g_enable_host_event_recorder_hook) { + name_ = new std::string(name); + start_ns_ = PosixInNsec(); + attr_ = new std::string(attr); + return; + } + if (g_state == ProfilerState::kDisabled || name.empty()) return; // do some initialization + name_ = new std::string(name); start_ns_ = PosixInNsec(); role_ = role; + attr_ = new std::string(attr); is_enabled_ = true; // lock is not needed, the code below is thread-safe // Maybe need the same push/pop behavior. Event *e = PushEvent(name, role, attr); SetCurAnnotation(e); - name_ = e->name(); + // name_ = e->name(); } RecordEvent::~RecordEvent() { @@ -88,15 +419,36 @@ RecordEvent::~RecordEvent() { } #endif #endif + uint64_t end_ns = PosixInNsec(); + if (LIKELY(g_enable_host_event_recorder_hook)) { + if (LIKELY(shallow_copy_name_ != nullptr)) { + HostEventRecorder::GetInstance().RecordEvent(shallow_copy_name_, + start_ns_, end_ns, role_); + } else if (name_ != nullptr) { + if (attr_ == nullptr) { + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_); + } else { + HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, + role_, *attr_); + } + } + delete name_; + delete attr_; + return; + } + if (g_state == ProfilerState::kDisabled || !is_enabled_) return; // lock is not needed, the code below is thread-safe DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { - tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), - BlockDepth(), g_thread_id); + tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), + g_thread_id); } ClearCurAnnotation(); - PopEvent(name_, role_); + PopEvent(*name_, role_); + delete name_; + delete attr_; } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -148,11 +500,11 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } -RecordRPCEvent::RecordRPCEvent(const std::string &name) { +/*RecordRPCEvent::RecordRPCEvent(const std::string &name) { if (FLAGS_enable_rpc_profiler) { event_.reset(new platform::RecordEvent(name)); } -} +}*/ RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { @@ -362,5 +714,20 @@ void NvprofEnableRecordEvent() { void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } +void EnableHostEventRecorder() { g_enable_host_event_recorder_hook = true; } + +std::string PrintHostEvents() { + std::ostringstream oss; + auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { + oss << thr_evt_sec.thread_id << std::endl; + for (const auto &evt : thr_evt_sec.events) { + oss << "{ " << evt.name << " | " << evt.start_ns << " | " << evt.end_ns + << " }" << std::endl; + } + } + return oss.str(); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index fbae6165e313a2..de814faec2523e 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -128,31 +128,38 @@ struct MemEvenRecorder { }; struct RecordEvent { - RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary, - const std::string attr = "none"); + explicit RecordEvent(const std::string& name, + const EventRole role = EventRole::kOrdinary); + + explicit RecordEvent(const char* name, + const EventRole role = EventRole::kOrdinary); + + RecordEvent(const std::string& name, const EventRole role, + const std::string& attr); ~RecordEvent(); bool 
is_enabled_{false}; bool is_pushed_{false}; - uint64_t start_ns_; // Event name - std::string name_; + const std::string* name_{nullptr}; + const char* shallow_copy_name_{nullptr}; + uint64_t start_ns_; // Need to distinguish name by op type, block_id, program_id and perhaps // different kernel invocations within an op. - std::string full_name_; + // std::string full_name_; EventRole role_{EventRole::kOrdinary}; + const std::string* attr_{nullptr}; }; -class RecordRPCEvent { +/*class RecordRPCEvent { public: explicit RecordRPCEvent(const std::string& name); ~RecordRPCEvent() {} private: std::unique_ptr event_; -}; +};*/ struct RecordBlock { explicit RecordBlock(int block_id); @@ -242,5 +249,10 @@ int64_t ListenerId(); void NvprofEnableRecordEvent(); void NvprofDisableRecordEvent(); +void EnableHostEventRecorder(); + +// Defined for UT +std::string PrintHostEvents(); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index a8438263cb97b9..3408971efa4115 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -47,6 +47,8 @@ static TracerOption g_tracer_option = TracerOption::kDefault; static ProfilerState g_state = ProfilerState::kDisabled; // To hook RecordEvent's events, use it to nvtx timeline static bool g_enable_nvprof_hook = false; +// To hook RecordEvent, use HostEventRecorder +static bool g_enable_host_event_recorder_hook = false; // The thread local event list only can be accessed by the specific thread // The thread index of each thread static thread_local int32_t g_thread_id; From 79095918644e24e9f6e57ad544af0a35f00b4010 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Wed, 1 Dec 2021 16:34:56 +0800 Subject: [PATCH 024/124] fix build in xpu (#37699) --- .../fluid/eager/accumulation/gradient_accumulation.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 7345c3612381bc..9d475d96e56ce0 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -193,13 +193,14 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { // TODO(jiabin): Support NPU here PADDLE_TENSOR_ADD(float); - // NOTE(phlrain): xpu only support float +// NOTE(phlrain): xpu only support float +#ifndef PADDLE_WITH_XPU PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future PADDLE_TENSOR_ADD(paddle::platform::complex); PADDLE_TENSOR_ADD(paddle::platform::complex); - +#endif #undef PADDLE_TENSOR_ADD if (data_type == paddle::framework::proto::VarType::FP16) { @@ -268,13 +269,14 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) { // TODO(jiabin): Support NPU here PADDLE_TENSOR_ADD(float); - // NOTE(phlrain): xpu only support float +// NOTE(phlrain): xpu only support float +#ifndef PADDLE_WITH_XPU PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future PADDLE_TENSOR_ADD(paddle::platform::complex); PADDLE_TENSOR_ADD(paddle::platform::complex); - +#endif #undef PADDLE_TENSOR_ADD if (data_type == paddle::framework::proto::VarType::FP16) { From 0adc2006bad9521de147c905372ccbfd2c1c02ef Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 1 Dec 2021 18:25:34 +0800 
Subject: [PATCH 025/124] [fleet_executor] auto STOP msg and auto notify carrier (#37742) --- .../distributed/fleet_executor/carrier.cc | 49 +++++++++++++++---- .../distributed/fleet_executor/carrier.h | 17 +++---- .../fleet_executor/compute_interceptor.cc | 22 ++++++--- .../fleet_executor/compute_interceptor.h | 1 + .../fleet_executor/fleet_executor.cc | 6 +-- .../fleet_executor/fleet_executor.h | 2 +- .../distributed/fleet_executor/interceptor.cc | 16 +++--- .../distributed/fleet_executor/message_bus.cc | 4 ++ .../test/compute_interceptor_run_op_test.cc | 15 ++---- .../test/compute_interceptor_test.cc | 20 +++----- 10 files changed, 92 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 8a4f10473e3d27..73f22592dc3a75 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/scope.h" @@ -24,14 +25,14 @@ namespace distributed { USE_INTERCEPTOR(Compute); -void Carrier::Init( - const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place) { +void Carrier::Init(std::shared_ptr runtime_graph, + framework::Scope* root_scope, + framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place) { PADDLE_ENFORCE_EQ(is_init_, false, platform::errors::AlreadyExists( "Carrier is already init.")); - interceptor_id_to_node_ = interceptor_id_to_node; + runtime_graph_ = runtime_graph; minibatch_scope_ = minibatch_scope; microbatch_scopes_ = microbatch_scopes; place_ = place; @@ -41,15 +42,34 @@ void Carrier::Init( is_init_ = true; } -Carrier::~Carrier() { +void Carrier::Release() { // NOTE(wangxi): must join before `Derived Interceptor` destruct, // otherwise Derived object will be destructed before thread complete. + + // Sending STOP msg to the source interceptor + MessageBus& msg_bus = MessageBus::Instance(); + PADDLE_ENFORCE_EQ(msg_bus.IsInit(), true, + platform::errors::PreconditionNotMet( + "Message bus has not been initialized.")); + for (int64_t id : source_interceptor_ids_) { + VLOG(3) << "Carrier Release is sending stop to source interceptor " << id + << "."; + InterceptorMessage stop_msg; + // source node STOP is send by carrier, so set src_id=-1 + stop_msg.set_src_id(-1); + stop_msg.set_dst_id(id); + stop_msg.set_message_type(STOP); + msg_bus.Send(stop_msg); + } + // TODO(wangxi): Maybe need a better to use thread. 
for (auto& interceptor : interceptor_idx_to_interceptor_) { interceptor.second->Join(); } } +Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } + bool Carrier::EnqueueInterceptorMessage( const InterceptorMessage& interceptor_message) { // enqueue message to interceptor @@ -139,6 +159,17 @@ void Carrier::SetCreatingFlag(bool flag) { creating_interceptors_ = flag; creating_flag_mutex_.unlock(); if (!flag) { + for (auto& pair : interceptor_idx_to_interceptor_) { + // update the source interceptor id + if (std::find(source_interceptor_ids_.begin(), + source_interceptor_ids_.end(), + pair.first) == source_interceptor_ids_.end()) { + auto task = pair.second->GetTaskNode(); + if (task != nullptr && task->upstream().empty()) { + source_interceptor_ids_.emplace_back(pair.first); + } + } + } // finish create interceptors outside, handle tmp messsages HandleTmpMessages(); } @@ -161,9 +192,9 @@ void Carrier::HandleTmpMessages() { void Carrier::CreateInterceptors() { // create each Interceptor - if (!interceptor_id_to_node_.empty()) { + if (!(runtime_graph_->intercepter_id_to_node().empty())) { // no auto init since there is no config - for (const auto& item : interceptor_id_to_node_) { + for (const auto& item : runtime_graph_->intercepter_id_to_node()) { int64_t interceptor_id = item.first; TaskNode* task_node = item.second; diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index b5976b297f9139..0c54201c94034f 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -39,6 +39,7 @@ namespace distributed { class TaskNode; class InterceptorMessageServiceImpl; +class RuntimeGraph; // A singleton MessageBus class Carrier final { @@ -48,13 +49,13 @@ class Carrier final { return carrier; } - void Init( - const std::unordered_map& interceptor_id_to_node, - framework::Scope* root_scope, framework::Scope* minibatch_scope, - const std::vector& microbatch_scopes, - const platform::Place& place); + void Init(std::shared_ptr runtime_graph, + framework::Scope* root_scope, framework::Scope* minibatch_scope, + const std::vector& microbatch_scopes, + const platform::Place& place); ~Carrier(); + void Release(); // Enqueue a message to corresponding interceptor id bool EnqueueInterceptorMessage(const InterceptorMessage& interceptor_message); @@ -84,9 +85,6 @@ class Carrier final { void HandleTmpMessages(); - // interceptor logic id to the Nodes info - std::unordered_map interceptor_id_to_node_; - // interceptor logic id to actually interceptor std::unordered_map> interceptor_idx_to_interceptor_; @@ -105,7 +103,8 @@ class Carrier final { framework::Scope* root_scope_; framework::Scope* minibatch_scope_; paddle::platform::Place place_; - paddle::platform::DeviceContext* dev_ctx_ = nullptr; + paddle::platform::DeviceContext* dev_ctx_{nullptr}; + std::shared_ptr runtime_graph_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index fd55aa2aa1c465..3d4078c932f702 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -51,6 +51,11 @@ void ComputeInterceptor::PrepareDeps() { "times, but now max_run_times=%ld", node_->max_run_times())); } + + // If there is no downstream or every downstream is in different rank, + // then this interceptor is the last one for current rank. 
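Before the additions that follow, it may help to picture the flow control these interceptors implement: every DATA_IS_READY from an upstream adds one input credit, every DATE_IS_USELESS from a downstream frees one output slot, and a step can run only while each upstream has a credit and each downstream buffer has room. A reduced model of that bookkeeping, with illustrative names (StageModel, PipeEnd) rather than the fleet_executor classes:

// Reduced model of the credit-based handshake between compute interceptors;
// illustrative only, not the fleet_executor implementation.
#include <cstdio>
#include <map>

struct PipeEnd {
  int capacity = 1;  // buffer depth toward this neighbour
  int filled = 0;    // credits received (inputs) or slots occupied (outputs)
};

struct StageModel {
  std::map<int, PipeEnd> ins, outs;  // keyed by neighbour interceptor id

  void OnDataReady(int up_id) { ++ins[up_id].filled; }
  void OnDataUseless(int down_id) { --outs[down_id].filled; }

  bool CanStep() const {
    for (const auto& kv : ins)
      if (kv.second.filled <= 0) return false;
    for (const auto& kv : outs)
      if (kv.second.filled >= kv.second.capacity) return false;
    return true;
  }

  // One step consumes a credit from every upstream and occupies a slot in
  // every downstream buffer (the real interceptor then notifies both sides).
  void Step() {
    for (auto& kv : ins) --kv.second.filled;
    for (auto& kv : outs) ++kv.second.filled;
  }
};

int main() {
  StageModel stage;
  stage.ins[0].capacity = 2;   // one upstream with buffer depth 2
  stage.outs[2].capacity = 2;  // one downstream with buffer depth 2
  stage.OnDataReady(0);        // upstream produced one microbatch
  int steps = 0;
  while (stage.CanStep()) {
    stage.Step();
    ++steps;
  }
  std::printf("ran %d step(s); outputs in flight: %d\n", steps,
              stage.outs[2].filled);
  return 0;
}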
+ // This can be get during init, can be cached for later use. + is_last_ = downstream.empty(); } void ComputeInterceptor::IncreaseReady(int64_t up_id) { @@ -129,7 +134,8 @@ void ComputeInterceptor::SendDataReadyToDownStream() { InterceptorMessage ready_msg; ready_msg.set_message_type(DATA_IS_READY); - VLOG(3) << "ComputeInterceptor Send data_is_ready msg to " << down_id; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Send data_is_ready msg to " << down_id; Send(down_id, ready_msg); } } @@ -148,7 +154,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { InterceptorMessage reply_msg; reply_msg.set_message_type(DATE_IS_USELESS); - VLOG(3) << "ComputeInterceptor Reply data_is_useless msg to " << up_id; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Reply data_is_useless msg to " << up_id; Send(up_id, reply_msg); } } @@ -159,7 +166,7 @@ void ComputeInterceptor::Run() { // step_ %= node_->max_run_times(); for (auto op : node_->ops()) { - auto* scope = microbatch_scopes_[step_ % node_->max_slot_nums()]; + auto* scope = microbatch_scopes_[step_ % node_->max_run_times()]; op->Run(*scope, place_); } ++step_; @@ -168,6 +175,10 @@ void ComputeInterceptor::Run() { SendDataReadyToDownStream(); // reply to upstream and decrease ready data ReplyCompletedToUpStream(); + // Try to stop Carrier + if (step_ % node_->max_run_times() == 0 && is_last_) { + StopCarrier(); + } } // If there is no limit, source interceptor can be executed @@ -221,11 +232,6 @@ void ComputeInterceptor::TryStop() { Send(down_id, stop); } stop_ = true; - - if (out_buffs_.size() == 0) { - // TODO(fleet executor dev) need a better place to notify - StopCarrier(); - } } void ComputeInterceptor::Compute(const InterceptorMessage& msg) { diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index 97e6da2f00eaea..8ed443ca971fb1 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -44,6 +44,7 @@ class ComputeInterceptor : public Interceptor { private: bool is_source_{false}; + bool is_last_{false}; int64_t step_{0}; // upstream_id-->(max_ready_size, ready_size) diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index ec60ec5fd5901a..e84e37a58eb5cb 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -38,7 +38,7 @@ FleetExecutor::~FleetExecutor() { void FleetExecutor::Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place) { - runtime_graph_ = std::make_unique(program_desc, exe_desc_); + runtime_graph_ = std::make_shared(program_desc, exe_desc_); root_scope_ = scope; place_ = place; PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument( @@ -58,8 +58,8 @@ void FleetExecutor::Init(const framework::ProgramDesc& program_desc, void FleetExecutor::InitCarrier() { Carrier& carrier_instance = Carrier::Instance(); if (!carrier_instance.IsInit()) { - carrier_instance.Init(runtime_graph_->intercepter_id_to_node(), root_scope_, - minibatch_scope_, microbatch_scopes_, place_); + carrier_instance.Init(runtime_graph_, root_scope_, minibatch_scope_, + microbatch_scopes_, place_); } } diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index 
7be18772e9ec9f..cee739506b7e62 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -47,7 +47,7 @@ class FleetExecutor final { void InitCarrier(); void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); FleetExecutorDesc exe_desc_; - std::unique_ptr runtime_graph_; + std::shared_ptr runtime_graph_; framework::Scope* root_scope_; framework::Scope* minibatch_scope_; platform::Place place_; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index 63c2bb3fc6eecb..26927f34c6879b 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -46,7 +46,6 @@ void Interceptor::Handle(const InterceptorMessage& msg) { VLOG(3) << "Interceptor is using default message handler. This handler is " "only used for test purpose. Check whether you init interceptor " "in the proper way."; - if (msg.message_type() == DATA_IS_READY) { if (node_->role() != 2) { VLOG(3) << "Fake handler is sending DATA_IS_READY message to: " @@ -54,14 +53,19 @@ void Interceptor::Handle(const InterceptorMessage& msg) { InterceptorMessage data_is_ready_msg; data_is_ready_msg.set_message_type(DATA_IS_READY); Send(interceptor_id_ + 1, data_is_ready_msg); + } else { + // NOTE: max run time is reach for last interceptor + StopCarrier(); } - VLOG(3) << "Fake handler is sending stop message to it self."; - InterceptorMessage stop_msg; - stop_msg.set_message_type(STOP); - Send(interceptor_id_, stop_msg); } else if (msg.message_type() == STOP) { stop_ = true; - StopCarrier(); + if (node_->role() != 2) { + VLOG(3) << "Fake handler is sending STOP message to: " + << interceptor_id_ + 1 << "."; + InterceptorMessage stop_msg; + stop_msg.set_message_type(STOP); + Send(interceptor_id_ + 1, stop_msg); + } } } } diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index de2171e68e19e2..688a6f3a388218 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -57,6 +57,10 @@ bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { VLOG(3) << "Message bus releases resource."; + // NOTE: fleet_executor inits carrier before message bus, + // therefore the message bus's destructor will be called first + Carrier& carrier = Carrier::Instance(); + carrier.Release(); #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) server_.Stop(1000); diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index 2d9776738f8318..c5348db83e0298 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -61,15 +61,15 @@ TEST(ComputeInterceptor, Compute) { std::vector scopes = {scope, scope}; platform::Place place = platform::CPUPlace(); + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); msg_bus.Init({{0, 0}, {1, 0}}, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - Carrier& carrier = Carrier::Instance(); - // FIXME: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = - new TaskNode(0, ops, 0, 0, 2, 2); // role, ops, 
rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 0, 0); + new TaskNode(0, ops, 0, 0, 2, 0); // role, ops, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 2, 0); // a->b node_a->AddDownstreamTask(1); @@ -90,13 +90,6 @@ TEST(ComputeInterceptor, Compute) { msg.set_src_id(-1); msg.set_dst_id(0); carrier.EnqueueInterceptorMessage(msg); - - // stop - InterceptorMessage stop; - stop.set_message_type(STOP); - stop.set_src_id(-1); - stop.set_dst_id(0); - carrier.EnqueueInterceptorMessage(stop); } } // namespace distributed diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 3cfd3073c8cb9c..8f44b2035aea02 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -35,31 +35,25 @@ class StartInterceptor : public Interceptor { void NOP(const InterceptorMessage& msg) { if (msg.message_type() == STOP) { stop_ = true; + InterceptorMessage stop; + stop.set_message_type(STOP); + Send(1, stop); // stop 1, compute return; } std::cout << GetInterceptorId() << " recv msg from " << msg.src_id() << std::endl; - ++count_; - if (count_ == 3) { - InterceptorMessage stop; - stop.set_message_type(STOP); - Send(msg.dst_id(), stop); // stop 0, this - Send(msg.src_id(), stop); // stop 1, compute - } } - int count_{0}; }; TEST(ComputeInterceptor, Compute) { + Carrier& carrier = Carrier::Instance(); MessageBus& msg_bus = MessageBus::Instance(); msg_bus.Init({{0, 0}, {1, 0}, {2, 0}}, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - Carrier& carrier = Carrier::Instance(); - // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* node_a = new TaskNode(0, 0, 0, 0, 0); // role, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 0, 0); - TaskNode* node_c = new TaskNode(0, 0, 2, 0, 0); + TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); // a->b->c node_a->AddDownstreamTask(1); From 44def66a18b6490db310e202334f1a5fd5dd4f2e Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Wed, 1 Dec 2021 19:13:39 +0800 Subject: [PATCH 026/124] Remove cpp layer (#37730) * optimizer __call__ to make dygraph faster * fix return type * remove cpp Layer --- paddle/fluid/imperative/layer.h | 10 -------- paddle/fluid/pybind/imperative.cc | 24 ------------------- python/paddle/fluid/dygraph/layers.py | 6 ++--- .../tests/unittests/test_imperative_basic.py | 2 -- python/paddle/framework/io.py | 6 ++--- 5 files changed, 6 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ec5fb63f0d9339..892c864027d110 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -281,16 +281,6 @@ class VarBase { static ThreadSafeNameSet name_set_; }; -class Layer { - public: - virtual ~Layer() {} - - virtual std::vector> Forward( - const std::vector>& inputs) { - return {}; - } -}; - std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameVarBaseMap& ins, const NameVarBaseMap& outs, const framework::AttributeMap& attrs, diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 5ff0e58d858017..cad0c1e70cc03c 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -59,18 +59,6 @@ PyTypeObject *g_varbase_pytype = nullptr; 
namespace py = ::pybind11; -class Layer : public imperative::Layer { - public: - using imperative::Layer::Layer; // Inherit constructors - - std::vector> Forward( - const std::vector> &inputs) - override { - PYBIND11_OVERLOAD(std::vector>, Layer, - Forward, inputs); // NOLINT - } -}; - template static T PyObjectCast(PyObject *obj) { try { @@ -2051,18 +2039,6 @@ void BindImperative(py::module *m_ptr) { .def_property_readonly("type", &imperative::VarBase::Type) .def_property_readonly("dtype", &imperative::VarBase::DataType); - // NOTE(zhiqiu): set the metaclass of Layer. - // See details: https://github.com/pybind/pybind11/pull/679 - // https://github.com/pybind/pybind11/blob/028812ae7eee307dca5f8f69d467af7b92cc41c8/tests/test_methods_and_attributes.cpp#L284 - py::class_ layer( - m, "Layer", py::metaclass((PyObject *)&PyType_Type)); // NOLINT - layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector> &inputs) { - return self.Forward(inputs); - }); - py::class_(m, "ProgramDescTracer", "") .def("create_program_desc", &imperative::jit::ProgramDescTracer::CreateProgramDesc) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 662e233bd40d61..11812398ba4550 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -78,7 +78,7 @@ def remove(self): del hooks[self._hook_id] -class Layer(core.Layer): +class Layer(object): """ Dynamic graph Layer based on OOD, includes the parameters of the layer, the structure of the forward graph and so on. @@ -976,7 +976,7 @@ def forward(self, input): for prefix, layer in model.named_sublayers(): print(prefix, layer) """ - assert (isinstance(sublayer, core.Layer) or sublayer == None) + assert (isinstance(sublayer, Layer) or sublayer == None) self._sub_layers[name] = sublayer return sublayer @@ -1143,7 +1143,7 @@ def _remove_if_exist(*dicts): params[name] = None else: layers = self.__dict__.get('_sub_layers', None) - if isinstance(value, core.Layer): + if isinstance(value, Layer): if layers is None: raise ValueError( "super(YourLayer, self).__init__() should be called first" diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 3a1ff82290e0f3..d6835069b9d2a0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -400,8 +400,6 @@ def test_set_persistable(self): def test_layer(self): with fluid.dygraph.guard(): - cl = core.Layer() - cl.forward([]) l = fluid.Layer("l") self.assertRaises(NotImplementedError, l.forward, []) diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 8b72f05f363cba..8367205a7e7c2a 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -253,7 +253,7 @@ def create_layer_dispatch_table(layer): dispatch_table_layer[layer.__class__] = reduce_Layer return layer - _parse_every_object(obj, lambda v: isinstance(v, core.Layer), + _parse_every_object(obj, lambda v: isinstance(v, fluid.Layer), create_layer_dispatch_table) def add_dispatch_table(): @@ -316,7 +316,7 @@ def _is_state_dict(obj): if isinstance(obj, dict): def condition(obj): - return isinstance(obj, (core.Layer, Program, core.VarBase, + return isinstance(obj, (fluid.Layer, Program, core.VarBase, core.LoDTensor, core.SelectedRows)) # If the value of a dict is a core.VarBase/LoDTensor or a dict @@ -422,7 +422,7 @@ def _parse_every_object(obj, 
condition_func, convert_func):
 
 def _parse_load_result(obj, return_numpy):
     def is_layer(obj):
-        return isinstance(obj, core.Layer)
+        return isinstance(obj, fluid.Layer)
 
     def parse_layer(obj):
         temp_dict = _parse_load_result(obj.__dict__, False)

From 934e5d0938b8f48171ad70767483b293b8c139e5 Mon Sep 17 00:00:00 2001
From: Leo Chen 
Date: Wed, 1 Dec 2021 19:39:06 +0800
Subject: [PATCH 027/124] add vlog to auto_growth_best_fit_allocator (#37601)

---
 .../fluid/memory/allocation/auto_growth_best_fit_allocator.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index 9f34f5198a1796..dd2a65d889d8d9 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -100,11 +100,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) {
     VLOG(2) << "Not found and reallocate " << realloc_size << "("
             << static_cast(p) << "), and remaining " << remaining_size;
   }
+  VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_;
   return new BlockAllocation(block_it);
 }
 
 void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
-  VLOG(10) << "Free " << allocation->size() << " bytes";
+  VLOG(10) << "Free " << allocation->size()
+           << " bytes, ptr = " << allocation->ptr();
   std::lock_guard guard(spinlock_);
   auto block_it = static_cast(allocation)->block_it_;
   auto &blocks = block_it->chunk_->blocks_;

From caff666813a62c399980b84984e3b517a7d468d3 Mon Sep 17 00:00:00 2001
From: wangye707 <39162106+wangye707@users.noreply.github.com>
Date: Wed, 1 Dec 2021 20:30:05 +0800
Subject: [PATCH 028/124] add xpu_base_url parameter (#37712)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add a parameter to specify the XPU base URL

---
 cmake/external/xpu.cmake | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 7f828fd66e2aad..d89ecd27c0954d 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -34,8 +34,13 @@ ELSE ()
   SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
 ENDIF()
 
-SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211129")
+if(NOT DEFINED XPU_BASE_URL)
+  SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211129")
+else()
+  SET(XPU_BASE_URL "${XPU_BASE_URL}")
+endif()
+
 SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)

From 1a1aeff65ea2bb139f669ea74e6b3226e0bbfa59 Mon Sep 17 00:00:00 2001
From: zmxdream 
Date: Wed, 1 Dec 2021 20:34:24 +0800
Subject: [PATCH 029/124] [heterps]fix launch_utils.py. test=develop (#37752)

* fix launch_utils.py. test=develop

* fix launch_utils.py.
test=develop --- python/paddle/distributed/fleet/launch_utils.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index f7f50e76af61b9..d87bdb47932ef1 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -1180,18 +1180,14 @@ def get_role_endpoints(self, args): _, self.current_node_ip = get_host_name_ip() else: self.current_node_ip = pod_ip - if not self.distribute_mode == DistributeMode.PS_HETER: - assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ - % (self.current_node_ip, self.node_ips) - if self.current_node_ip in self.node_ips: - self.node_rank = self.node_ips.index(self.current_node_ip) - logger.debug( - "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". - format(self.node_ips, self.current_node_ip, self.node_rank)) + assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ + % (self.current_node_ip, self.node_ips) + self.node_rank = self.node_ips.index(self.current_node_ip) + logger.debug( + "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". + format(self.node_ips, self.current_node_ip, self.node_rank)) def start_ps(self): - if not self.current_node_ip in self.node_ips: - return cluster = Cluster(hdfs=None) server_rank = 0 worker_rank = 0 From b0d580a217f293a31b5f09dbce6eca9e126c512f Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 1 Dec 2021 21:25:22 +0800 Subject: [PATCH 030/124] Fix inplace addto pass by setting dtype correctly (#37717) * fix inplace addto pass * update * fix ut * improve ci coverage * fix musl ci compile error --- .../details/share_tensor_buffer_functor.cc | 10 +-- .../details/share_tensor_buffer_functor.h | 8 ++- .../details/share_tensor_buffer_op_handle.cc | 9 +-- .../details/share_tensor_buffer_op_handle.h | 2 +- .../buffer_shared_inplace_op_pass.cc | 3 +- .../inplace_addto_op_pass.cc | 4 +- .../memory_optimize_pass/memory_reuse_pass.cc | 6 +- paddle/fluid/framework/tensor.h | 2 + paddle/fluid/operators/CMakeLists.txt | 1 + paddle/fluid/operators/share_buffer_op.cc | 3 +- paddle/fluid/operators/share_buffer_op.h | 16 +++-- .../fluid/operators/share_buffer_op_test.cc | 71 +++++++++++++++++++ .../unittests/test_apply_pass_to_program.py | 2 +- 13 files changed, 110 insertions(+), 27 deletions(-) create mode 100644 paddle/fluid/operators/share_buffer_op_test.cc diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index ccc64a9cdc3352..1225e2ee025b2e 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -39,14 +39,14 @@ ShareTensorBufferFunctor::ShareTensorBufferFunctor( Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, const std::vector &out_var_names, const bool &is_variant_scope, - bool share_dims) + bool share_dims_and_dtype) : scope_(scope), scope_idx_(scope_idx), op_type_(op_type), in_var_infos_(in_var_infos), out_var_names_(out_var_names), is_variant_scope_(is_variant_scope), - share_dims_(share_dims) { + share_dims_and_dtype_(share_dims_and_dtype) { PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(), 
platform::errors::PreconditionNotMet( "The number of input variables and output variables " @@ -147,12 +147,14 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { // NOTE(zhiqiu): In the case of inplace addto, if the operator of // the in_out_vars is skipped during running, we should set the dims of // output as the same as input. - if (share_dims_) { + if (share_dims_and_dtype_) { out_tensor->Resize(in_tensor.dims()); + out_tensor->ShareDataTypeWith(in_tensor); } VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " - << in_var_info->Name() << " -> " << out_var_names_[i]; + << in_var_info->Name() << " -> " << out_var_names_[i] + << " share_dims_and_dtype = " << share_dims_and_dtype_; } } } diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h index 528b047bccc13a..f0ddb3f0137a29 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -73,12 +73,14 @@ class ShareTensorBufferFunctor { Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, const std::vector &out_var_names, - const bool &is_variant_scope, bool share_dims = false); + const bool &is_variant_scope, bool share_dims_and_dtype = false); void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); - void SetShareDims(bool share_dims) { share_dims_ = share_dims; } + void SetShareDimsAndDtype(bool share_dims_and_dtype) { + share_dims_and_dtype_ = share_dims_and_dtype; + } void operator()(Scope *exec_scope); @@ -108,7 +110,7 @@ class ShareTensorBufferFunctor { // NOTE(zhiqiu): In the case of inplace addto, if the operator of // the in_out_vars is skipped during running, we should set the dims of output // as the same as input. 
- bool share_dims_{false}; + bool share_dims_and_dtype_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index 7e10c669ac478b..aa942415fb4040 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -64,10 +64,10 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle( ShareTensorBufferOpHandle::ShareTensorBufferOpHandle( ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names, bool share_dims) + const std::vector &out_var_names, bool share_dims_and_dtype) : OpHandleBase(node), functor_(scope, scope_idx, op_type, in_var_infos, out_var_names, - is_variant_scope_, share_dims) {} + is_variant_scope_, share_dims_and_dtype) {} std::unordered_map ShareTensorBufferOpHandle::ReusedVars() const { @@ -79,8 +79,9 @@ void ShareTensorBufferOpHandle::AddReuseVarPair( functor_.AddReuseVarPair(in_var_info, out_var_name); } -void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) { - functor_.SetShareDims(share_dims); +void ShareTensorBufferOpHandle::SetShareDimsAndDtype( + bool share_dims_and_dtype) { + functor_.SetShareDimsAndDtype(share_dims_and_dtype); } void ShareTensorBufferOpHandle::InitCUDA() { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h index dd2364fec4af52..d3852a85d019b9 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -56,7 +56,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase { void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); - void SetShareDims(bool share_dims); + void SetShareDimsAndDtype(bool share_dims_and_dtype); const ShareTensorBufferFunctor &Functor() const { return functor_; } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index bf7cd55fab2689..1ca6e989f275c6 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -283,7 +283,8 @@ void BufferSharedInplaceOpPass::ApplyImpl(ProgramDesc *main_program, op->SetInput("X", inputs); op->SetOutput("Out", outputs); op->SetOutput("XOut", inputs); // add necessary dependency - op->SetAttr("share_dims", std::vector(inputs.size(), false)); + op->SetAttr("share_dims_and_dtype", + std::vector(inputs.size(), false)); } block->Flush(); } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc index d09de5be84c358..0ed2ec51b89cb7 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -277,7 +277,7 @@ static void BuildInplaceAddToGraph(Node *in_var_0, Node *in_var_1, grad_add_op_desc->SetInput("X", {in_var_1->Name()}); grad_add_op_desc->SetOutput("Out", {out_var->Name()}); grad_add_op_desc->SetOutput("XOut", {in_var_1->Name()}); - grad_add_op_desc->SetAttr("share_dims", std::vector(1, true)); + 
grad_add_op_desc->SetAttr("share_dims_and_dtype", std::vector(1, true)); // Add share_buffer op between in_var_0 and in_var_1 OpDesc share_buffer_op; @@ -285,7 +285,7 @@ static void BuildInplaceAddToGraph(Node *in_var_0, Node *in_var_1, share_buffer_op.SetInput("X", {in_var_0->Name()}); share_buffer_op.SetOutput("Out", {in_var_1->Name()}); share_buffer_op.SetOutput("XOut", {in_var_0->Name()}); - share_buffer_op.SetAttr("share_dims", std::vector(1, false)); + share_buffer_op.SetAttr("share_dims_and_dtype", std::vector(1, false)); auto *new_share_buffer_op = graph->CreateOpNode(&share_buffer_op); new_share_buffer_op->inputs.push_back(in_var_0); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index f6465d385841dd..9d1e2301704b3d 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -329,7 +329,7 @@ bool MemoryReusePass::IsVarPairReusable( void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, details::VarHandle *out_var, - bool share_dims) const { + bool share_dims_and_dtype) const { PADDLE_ENFORCE_GT( (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, platform::errors::NotFound("Var(%s) does not in mem opt var infos.", @@ -349,8 +349,8 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, share_buffer_op->AddInput(in_var); } - if (share_dims) { - share_buffer_op->SetShareDims(true); + if (share_dims_and_dtype) { + share_buffer_op->SetShareDimsAndDtype(true); } share_buffer_op->AddReuseVarPair( diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 7f8d7bffa986e3..2efaa3f37f9e9a 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -260,6 +260,8 @@ class Tensor { // should not be copied. } + void ShareDataTypeWith(const Tensor& tensor) { type_ = tensor.type_; } + bool IsSharedBufferWith(const Tensor& src) const { return holder_ && holder_ == src.Holder(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a8f35d61f3c4cc..0c3572ab655381 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -203,6 +203,7 @@ elseif(WITH_ROCM) else() cc_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc DEPS tensor device_context eigen3) endif() +cc_test(share_buffer_op_cpp_test SRCS share_buffer_op_test.cc DEPS lod_tensor device_context share_buffer_op) cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) diff --git a/paddle/fluid/operators/share_buffer_op.cc b/paddle/fluid/operators/share_buffer_op.cc index a161b9272b7b20..f6a6c9695b2adf 100644 --- a/paddle/fluid/operators/share_buffer_op.cc +++ b/paddle/fluid/operators/share_buffer_op.cc @@ -49,7 +49,8 @@ class ShareBufferOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), The output tensors which are the same as X. It is " "used to build the graph dependency") .AsDuplicable(); - AddAttr>("share_dims", "Whether to share dims") + AddAttr>("share_dims_and_dtype", + "Whether to share dims and data type") .SetDefault(std::vector()); AddComment( R"DOC(Operator used to perform inplace memory reuse. 
It should be not exposed to Python APIs.)DOC"); diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h index 5138ad9d54b79a..1d0abf14f577e7 100644 --- a/paddle/fluid/operators/share_buffer_op.h +++ b/paddle/fluid/operators/share_buffer_op.h @@ -29,12 +29,13 @@ class ShareBufferOpKernel : public framework::OpKernel { size_t n = inputs.size(); PADDLE_ENFORCE_EQ(n, outputs.size(), platform::errors::PermissionDenied( "Variable number not match.")); - const auto &share_dims = ctx.Attr>("share_dims"); - if (!share_dims.empty()) { - PADDLE_ENFORCE_EQ( - n, share_dims.size(), - platform::errors::PermissionDenied( - "Attribute share_dims number not match input variable number.")); + const auto &share_dims_and_dtype = + ctx.Attr>("share_dims_and_dtype"); + if (!share_dims_and_dtype.empty()) { + PADDLE_ENFORCE_EQ(n, share_dims_and_dtype.size(), + platform::errors::PermissionDenied( + "Attribute share_dims_and_dtype number not match " + "input variable number.")); } const std::vector *input_args = nullptr, @@ -50,8 +51,9 @@ class ShareBufferOpKernel : public framework::OpKernel { outputs[i]->ShareBufferWith(*inputs[i]); VLOG(10) << "Share tensor buffer " << (*input_args)[i] << " -> " << (*output_args)[i]; - if (!share_dims.empty() && share_dims[i]) { + if (!share_dims_and_dtype.empty() && share_dims_and_dtype[i]) { outputs[i]->Resize(inputs[i]->dims()); + outputs[i]->ShareDataTypeWith(*inputs[i]); } } } diff --git a/paddle/fluid/operators/share_buffer_op_test.cc b/paddle/fluid/operators/share_buffer_op_test.cc new file mode 100644 index 00000000000000..60220981cab1d2 --- /dev/null +++ b/paddle/fluid/operators/share_buffer_op_test.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/place.h" + +USE_OP(share_buffer); + +namespace paddle { +namespace framework { + +TEST(test_share_buffer_op, test_share_buffer_op) { + std::vector inputs = {"X1", "X2"}; + std::vector outputs = {"Y1", "Y2"}; + std::vector dims = {{2, 3, 4}, {5, 6}}; + std::vector share_dims_and_dtype = {false, true}; + + size_t n = inputs.size(); + EXPECT_EQ(n, outputs.size()); + EXPECT_EQ(n, dims.size()); + EXPECT_EQ(n, share_dims_and_dtype.size()); + + OpDesc desc; + desc.SetType("share_buffer"); + desc.SetInput("X", inputs); + desc.SetOutput("Out", outputs); + desc.SetOutput("XOut", inputs); + desc.SetAttr("share_dims_and_dtype", share_dims_and_dtype); + + auto op = OpRegistry::CreateOp(desc); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::Place place = platform::CUDAPlace(0); +#else + platform::Place place = platform::CPUPlace(); +#endif + + Scope scope; + for (size_t i = 0; i < n; ++i) { + auto *in_tensor = scope.Var(inputs[i])->GetMutable(); + in_tensor->Resize(dims[i]); + in_tensor->mutable_data(place); + scope.Var(outputs[i])->GetMutable(); + } + op->Run(scope, place); + platform::DeviceContextPool::Instance().Get(place)->Wait(); + + for (size_t i = 0; i < n; ++i) { + const auto &in_tensor = scope.Var(inputs[i])->Get(); + const auto &out_tensor = scope.Var(outputs[i])->Get(); + EXPECT_TRUE(out_tensor.IsSharedBufferWith(in_tensor)); + } +} + +} // namespace framework +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py index 422cb58ff9ab6b..4552d600bafd74 100644 --- a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py +++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py @@ -123,7 +123,7 @@ def check_after_applied(self, main, startup): if op.type != "share_buffer": continue - share_dims = op.attr("share_dims") + share_dims = op.attr("share_dims_and_dtype") if share_dims: for i in range(len(share_dims)): self.assertEqual(share_dims[0], share_dims[i]) From 6abe7dcb0d40069b46eff6a27b847c3daee7080e Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 1 Dec 2021 21:57:38 +0800 Subject: [PATCH 031/124] support bool index for int tensor (#37761) --- python/paddle/fluid/variable_index.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 19067b8ae12528..5aa7f9c972f9b6 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -674,14 +674,6 @@ def _setitem_impl_(var, item, value): # the item is a tensor of bool def set_value_for_bool_tensor(var, item, value): - - # TODO(zyfncg): Now scatter_nd_add only support float32 and float64 tensor, - # so in the current version we also only support float32 and float64 tensor, - # this problem will be fixed in the future. 
- if var.dtype != core.VarDesc.VarType.FP32 and var.dtype != core.VarDesc.VarType.FP64: - raise TypeError("Only support float and double tensor for bool index, " - "but received {}.".format(var.dtype)) - if len(item.shape) > len(var.shape): raise IndexError("The dims of bool index doesn't match indexed array, " "the dims of bool index except to be equal or less " From 7094251ba12387a2f7bb572b896c52c9c61420bf Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Wed, 1 Dec 2021 16:10:58 +0100 Subject: [PATCH 032/124] dequantize matmul and matmul_v2 Y weights in quant2_int8 (#37618) * dequantize matmul and matmul_v2 Y weights in qat2_int8 * review fix * split conv and mul tests, add matmul test * fixup * fix ci build * remove unused variables * formatting fix * remove extra newline at end of file --- .../quantization/quant2_int8_mkldnn_pass.py | 8 +- .../tests/test_quant2_int8_mkldnn_pass.py | 160 ++++++++++-------- 2 files changed, 96 insertions(+), 72 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index bc97e5cf6c9586..4c9c4058318a97 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -69,7 +69,7 @@ def __init__(self, self._mul_ops = ['mul'] self._fc_ops = ['fc'] self._relu_ops = ['relu', 'relu6'] - self._matmul_ops = ['matmul'] + self._matmul_ops = ['matmul', 'matmul_v2'] self._gru_ops = ['fusion_gru', 'multi_gru'] self._lstm_ops = ['fusion_lstm'] self._weight_thresholds = {} @@ -328,14 +328,18 @@ def _swap_inputs(self, op, old_input, new_input): def _dequantize_weights(self, graph): def _is_int8_weights(op_node, weight_name): weight_var_name = op_node.input(weight_name)[0] + if self._scope.find_var(weight_var_name) is None: + return False weight = self._load_param(self._scope, weight_var_name) return np.all(np.mod(weight, 1) == 0) + mul_and_matmul_ops = self._mul_ops + self._matmul_ops for op in graph.all_op_nodes(): if op.name() in self._conv_ops and _is_int8_weights(op, "Filter"): self._dequantize_op_weights(graph, op, "Filter", "Output") - elif op.name() in self._mul_ops and _is_int8_weights(op, "Y"): + elif op.name() in mul_and_matmul_ops and _is_int8_weights(op, "Y"): self._dequantize_op_weights(graph, op, "Y", "Out") + return graph def _dequantize_op_weights(self, graph, op_node, weight_name, output_name): diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index 9ba0164afbe607..994f89ab3e9f30 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -23,7 +23,93 @@ paddle.enable_static() -class TestQuant2Int8MkldnnPass(unittest.TestCase): +class TestQuant2Int8MkldnnPassMul(unittest.TestCase): + def op_name(self): + return "mul" + + def setUp(self): + self.scope = fluid.Scope() + self.place = fluid.CPUPlace() + self.dtype = np.float32 + self.use_mkldnn = True + + self.quantized_ops = self.op_name() + self.mul_input_size = [1, 3] + self.mul_weights_size = [3, 5] + self.mul_output_size = [1, 5] + self.mul_input = np.random.random(self.mul_input_size).astype( + self.dtype) + self.mul_weights = np.ones(self.mul_weights_size, self.dtype) + self.mul_weights_bad = np.ones([1, 1], self.dtype) + self.mul_output = 
np.ndarray(self.mul_output_size).astype(self.dtype) + self.mul_output_scale = np.linspace(1, 5, num=5).astype(self.dtype) + + self.variables_mul = { + "mul_input": self.mul_input, + "mul_weights": self.mul_weights, + "mul_output": self.mul_output, + "mul_weights_bad": self.mul_weights_bad + } + + def prepare_program_mul(self, program): + block = program.global_block() + for name in self.variables_mul: + block.create_var( + name=name, + dtype="float32", + shape=self.variables_mul[name].shape) + + mul_op1 = block.append_op( + type=self.op_name(), + inputs={ + "X": block.var('mul_input'), + "Y": block.var('mul_weights') + }, + outputs={"Out": block.var('mul_output')}, + attrs={'use_mkldnn': self.use_mkldnn}) + + def test_dequantize_op_weights(self): + program = fluid.Program() + with fluid.program_guard(program): + self.prepare_program_mul(program) + graph = IrGraph(core.Graph(program.desc), for_test=True) + + op_node = "" + for op in graph.all_op_nodes(): + if op.op().type() == self.op_name(): + op_node = op + break + assert op_node != "", "op of type %s not found" % self.op_name() + + qpass = Quant2Int8MkldnnPass( + self.quantized_ops, + _scope=self.scope, + _place=self.place, + _core=core, + _debug=False) + qpass._weight_thresholds["mul_output"] = self.mul_output_scale + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights"], self.place) + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + + assert np.allclose( + self.scope.find_var("mul_weights").get_tensor(), + [[1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.], + [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.], + [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.]]) + + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights_bad"], self.place) + with self.assertRaises(ValueError): + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + + +class TestQuant2Int8MkldnnPassMatmulV2(TestQuant2Int8MkldnnPassMul): + def op_name(self): + return "matmul_v2" + + +class TestQuant2Int8MkldnnPassConv2D(unittest.TestCase): def setUp(self): self.scope = fluid.Scope() self.place = fluid.CPUPlace() @@ -46,7 +132,7 @@ def setUp(self): self.conv_output = np.ndarray(self.conv_output_size).astype(self.dtype) self.conv_output2 = np.ndarray(self.conv_output2_size).astype( self.dtype) - self.quantized_ops = 'conv2d,mul' + self.quantized_ops = 'conv2d' self.variables = { "input": self.input, "filter": self.filter, @@ -54,24 +140,8 @@ def setUp(self): "conv_output": self.conv_output, "conv_output2": self.conv_output2, } - self.mul_input_size = [1, 3] - self.mul_weights_size = [3, 5] - self.mul_output_size = [1, 5] - self.mul_input = np.random.random(self.mul_input_size).astype( - self.dtype) - self.mul_weights = np.ones(self.mul_weights_size, self.dtype) - self.mul_weights_bad = np.ones([1, 1], self.dtype) - self.mul_output = np.ndarray(self.mul_output_size).astype(self.dtype) - self.mul_output_scale = np.linspace(1, 5, num=5).astype(self.dtype) - self.variables_mul = { - "mul_input": self.mul_input, - "mul_weights": self.mul_weights, - "mul_output": self.mul_output, - "mul_weights_bad": self.mul_weights_bad - } - - def prepare_program(self, program): + def prepare_program_conv2d(self, program): block = program.global_block() for name in self.variables: block.create_var( @@ -111,23 +181,6 @@ def prepare_program(self, program): 'fuse_brelu': True }) - def prepare_program_mul(self, program): - block = program.global_block() - for name in 
self.variables_mul: - block.create_var( - name=name, - dtype="float32", - shape=self.variables_mul[name].shape) - - mul_op1 = block.append_op( - type="mul", - inputs={ - "X": block.var('mul_input'), - "Y": block.var('mul_weights') - }, - outputs={"Out": block.var('mul_output')}, - attrs={'use_mkldnn': self.use_mkldnn}) - def remove_fuse_activation_attribute(self, graph): for op in graph.all_op_nodes(): op.op().remove_attr("fuse_activation") @@ -150,7 +203,7 @@ def check_graph_after_pass(self, graph): def test_quant_update_activation(self): program = fluid.Program() with fluid.program_guard(program): - self.prepare_program(program) + self.prepare_program_conv2d(program) graph = IrGraph(core.Graph(program.desc), for_test=True) graph = self.remove_fuse_activation_attribute(graph) self.check_graph_before_pass(graph) @@ -163,39 +216,6 @@ def test_quant_update_activation(self): graph = quant2_int8_mkldnn_pass._update_activations(graph) self.check_graph_after_pass(graph) - def test_dequantize_op_weights(self): - program = fluid.Program() - with fluid.program_guard(program): - self.prepare_program_mul(program) - graph = IrGraph(core.Graph(program.desc), for_test=True) - - for op in graph.all_op_nodes(): - if op.op().type() == "mul": - op_node = op - break - - qpass = Quant2Int8MkldnnPass( - self.quantized_ops, - _scope=self.scope, - _place=self.place, - _core=core, - _debug=False) - qpass._weight_thresholds["mul_output"] = self.mul_output_scale - param = self.scope.var("mul_weights").get_tensor() - param.set(self.variables_mul["mul_weights"], self.place) - qpass._dequantize_op_weights(graph, op_node, "Y", "Out") - - assert np.allclose( - self.scope.find_var("mul_weights").get_tensor(), - [[1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.], - [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. / 127.], - [1. / 127., 2. / 127., 3. / 127., 4. / 127., 5. 
/ 127.]]) - - param = self.scope.var("mul_weights").get_tensor() - param.set(self.variables_mul["mul_weights_bad"], self.place) - with self.assertRaises(ValueError): - qpass._dequantize_op_weights(graph, op_node, "Y", "Out") - if __name__ == '__main__': unittest.main() From 9ecb74613e13fe1341a5a33d6389d862c8f2b98b Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 2 Dec 2021 09:45:53 +0800 Subject: [PATCH 033/124] Enabled Eager Dygraph AutoCodeGen for 500+ existing ops (#37753) * Handled dispensable tensors in AutoCodeGen for Eager Dygraph * Enabled Eager Dygraph AutoCodeGen for 500+ existing ops --- .../eager/auto_code_generator/CMakeLists.txt | 4 +- .../auto_code_generator/eager_generator.cc | 40 +- .../eager/auto_code_generator/op_list.txt | 501 ++++++++++++++++++ .../tests/task_tests/eager_utils_test.cc | 2 +- paddle/fluid/eager/utils.cc | 4 +- paddle/fluid/eager/utils.h | 2 +- 6 files changed, 538 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 03cec80b682b11..187c3db4452229 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -47,12 +47,12 @@ if(WIN32) endif() add_custom_target(eager_codegen - COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/op_list.txt" DEPENDS ${EAGER_CODEGEN_DEPS} VERBATIM) else() add_custom_target(eager_codegen - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" + COMMAND "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/op_list.txt" DEPENDS eager_generator VERBATIM) endif() diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 136eaebe2cc4bf..283153585866ab 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include #include #include @@ -26,6 +27,9 @@ #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +DEFINE_bool(generate_all, false, + "Generate all operators currently registered in Paddle"); + static std::unordered_set operators_to_skip = { "fused_elemwise_add_activation", // No Default Attr "fused_elemwise_activation", // No Default Attr @@ -40,12 +44,10 @@ static std::unordered_set operators_to_skip = { "pull_box_sparse", "fused_attention", "diag_v2", -}; - -static std::unordered_set operators_to_codegen = { - "sigmoid", "matmul_v2", "reduce_sum", "elementwise_add", - "share_buffer", "var_conv_2d", "split"}; + "transfer_dtype", + "c_split"}; +static std::unordered_set operators_to_codegen = {}; static std::unordered_set skipped_operators = {}; namespace paddle { @@ -353,7 +355,10 @@ static bool CheckOpProto(proto::OpProto* op_proto) { // Only handle matmul_v2 for now VLOG(1) << "------ Analyzing Op ------: " << op_type; - if (!operators_to_codegen.count(op_type)) return false; + if (!FLAGS_generate_all) { + if (!operators_to_codegen.count(op_type)) return false; + } + if (operators_to_skip.count(op_type)) return false; return true; @@ -976,7 +981,7 @@ static std::pair GenerateForwardFunctionContents( paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, outnum); dygraph_function_args_str += arg_str; const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::ConstructDuplicableOutput(%s) },"; + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput(%s) },"; outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name, outnum); } else { @@ -1253,7 +1258,7 @@ static std::string GenerateGradNodeCCContents( if (duplicable_input_name_set.count(fwd_input_name)) { const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::ConstructDuplicableOutput( " + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " "this->OutputMeta()[%d].Size() ) },"; outs_contents_str += paddle::string::Sprintf( GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); @@ -1639,13 +1644,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) { } // namespace framework } // namespace paddle +static void CollectOperatorsToCodeGen(const std::string& op_list_path) { + std::string line; + std::ifstream op_list_file(op_list_path); + if (op_list_file.is_open()) { + while (getline(op_list_file, line)) { + operators_to_codegen.insert(line); + } + op_list_file.close(); + } else { + PADDLE_THROW( + paddle::platform::errors::Fatal("Unable to open op_list.txt file")); + } +} + int main(int argc, char* argv[]) { - if (argc != 2) { + if (argc != 3) { std::cerr << "argc must be 2" << std::endl; return -1; } std::string eager_root = argv[1]; + std::string op_list_path = argv[2]; + + CollectOperatorsToCodeGen(op_list_path); paddle::framework::DygraphCodeGeneration(eager_root); return 0; diff --git a/paddle/fluid/eager/auto_code_generator/op_list.txt b/paddle/fluid/eager/auto_code_generator/op_list.txt index 00a9abde156fba..6bfba753633f33 100644 --- a/paddle/fluid/eager/auto_code_generator/op_list.txt +++ b/paddle/fluid/eager/auto_code_generator/op_list.txt @@ -2,3 +2,504 @@ sigmoid matmul_v2 reduce_sum elementwise_add +rsqrt +multihead_matmul +addmm +gru +round +rank_attention +fused_embedding_fc_lstm +where_index +bicubic_interp +arg_min +tile +bilinear_tensor_product +ctc_align +pow2_decay_with_linear_warmup +split +fc +clear_float_status +load +elementwise_max +adadelta +chunk_eval +check_finite_and_unscale +sparse_momentum +tan +adam +fsp +where 
+logical_xor +multiclass_nms3 +one_hot_v2 +sequence_softmax +affine_channel +triangular_solve +sequence_topk_avg_pooling +space_to_depth +reverse +fused_embedding_eltwise_layernorm +expand_v2 +lgamma +solve +deformable_psroi_pooling +instance_norm +decode_jpeg +gather_nd +reduce_prod +matrix_rank +asin +lstmp +iou_similarity +huber_loss +one_hot +sequence_slice +lookup_table +softplus +depthwise_conv2d +fused_fc_elementwise_layernorm +sigmoid_cross_entropy_with_logits +exp +scatter +equal_all +searchsorted +fusion_squared_mat_sub +unique +log +conv_shift +smooth_l1_loss +linear_interp_v2 +momentum +temporal_shift +nce +mv +proximal_gd +memcpy_h2d +add_position_encoding +cosh +hash +grad_add +sign +prelu +linspace +fill_diagonal +logsigmoid +load_combine +fetch_v2 +randperm +sequence_scatter +partial_sum +relu6 +conv3d +lstm_unit +not_equal +transpose2 +uniform_random_batch_size_like +unfold +lrn +softmax_with_cross_entropy +isfinite_v2 +bernoulli +max_pool3d_with_index +gaussian_random +flatten2 +matmul +cvm +adamax +masked_select +range +bitwise_not +trace +multinomial +modified_huber_loss +roll +squared_l2_distance +conv3d_transpose +share_data +fake_quantize_abs_max +unique_with_counts +fill +concat +fill_zeros_like +hierarchical_sigmoid +isinf_v2 +squeeze +multiclass_nms2 +bpr_loss +fft_c2c +bicubic_interp_v2 +reshape +coalesce_tensor +roi_align +reshape2 +reduce_any +unstack +scatter_nd_add +sequence_reshape +bilateral_slice +fill_any_like +empty +pad_constant_like +pool2d +size +imag +eigh +stack +dgc_momentum +lamb +generate_proposals_v2 +bitwise_or +gru_unit +fake_channel_wise_quantize_dequantize_abs_max +sampling_id +unsqueeze2 +average_accumulates +sequence_enumerate +fusion_seqconv_eltadd_relu +bce_loss +generate_proposal_labels +im2sequence +isinf +adagrad +linear_chain_crf +retinanet_target_assign +fusion_group +teacher_student_sigmoid_loss +random_crop +lookup_table_v2 +detection_map +l1_norm +sqrt +fused_elemwise_activation +slogdeterminant +share_buffer +bitwise_and +diag_embed +unbind +dropout +moving_average_abs_max_scale +beam_search +log_loss +greater_than +kron +sigmoid_focal_loss +rmsprop +conv2d +uniform_random_inplace +maxout +linear_interp +auc +logical_or +batch_norm +acos +unpool +cumprod +sample_logits +pull_box_extended_sparse +crop_tensor +fill_constant +deformable_conv +generate_mask_labels +locality_aware_nms +expand_as +matrix_power +greater_equal +generate_proposals +bilinear_interp +inplace_abn +softshrink +mul +data_norm +get_tensor_from_selected_rows +spp +floor +gelu +retinanet_detection_output +minus +push_dense +silu +sequence_erase +real +nearest_interp_v2 +dgc_clip_by_norm +squeeze2 +strided_slice +conj +precision_recall +save +fusion_seqexpand_concat_fc +fake_quantize_range_abs_max +depthwise_conv2d_transpose +positive_negative_pair +square +var_conv_2d +log1p +fused_softmax_mask_upper_triangle +clip_by_norm +atan2 +box_decoder_and_assign +fft_r2c +roi_pool +overlap_add +fill_constant_batch_size_like +fill_any +dequantize_log +max_pool2d_with_index +pad3d +norm +viterbi_decode +mish +box_coder +flatten +elementwise_mod +margin_cross_entropy +pull_sparse +logical_and +pow +stanh +label_smooth +merged_momentum +ascend_trigger +fused_feedforward +rpn_target_assign +roi_perspective_transform +expand +prroi_pool +pool3d +memcpy +distribute_fpn_proposals +frame +bincount +shape +group_norm +resnet_unit +sequence_expand_as +cos_sim +eigvals +save_combine +class_center_sample +read_file +isfinite +arg_max +equal +fake_dequantize_max_abs +qr 
+anchor_generator +layer_norm +merge_selected_rows +less_equal +rnn +fusion_lstm +lars_momentum +hard_sigmoid +isnan +elementwise_floordiv +correlation +histogram +gather_tree +segment_pool +sync_batch_norm +fusion_repeated_fc_relu +nop +fused_attention +expand_as_v2 +filter_by_instag +diag_v2 +pull_box_sparse +nll_loss +dot +scale +ncclBcast +shuffle_batch +ncclReduce +diag +multiplex +leaky_relu +allclose +adamw +elementwise_pow +prior_box +p_norm +unique_consecutive +lod_reset +pad +sequence_conv +log10 +set_value +bitwise_xor +center_loss +randint +attention_lstm +uniform_random +slice +meshgrid +hard_swish +sin +mean_iou +pad2d +inverse +spectral_norm +shuffle_channel +psroi_pool +seed +ceil +eig +reduce_min +cos +ncclAllReduce +cudnn_lstm +digamma +assign_value +increment +tdm_sampler +fused_softmax_mask +sequence_reverse +eigvalsh +diagonal +trunc +log2 +marker +tanh +yolov3_loss +graph_send_recv +accuracy +atan +less_than +unsqueeze +crf_decoding +log_softmax +ftrl +matrix_nms +top_k_v2 +cast +tanh_shrink +hard_shrink +multiclass_nms +fusion_transpose_flatten_concat +sequence_unpad +fused_elemwise_add_activation +pull_sparse_v2 +frobenius_norm +crop +cross_entropy2 +skip_layernorm +tdm_child +fused_embedding_seq_pool +erf +conv2d_inception_fusion +trilinear_interp +logsumexp +fusion_seqpool_concat +alloc_float_status +sequence_concat +fusion_seqpool_cvm_concat +similarity_focus +argsort +sequence_expand +sgd +fused_bn_add_activation +bilinear_interp_v2 +clip +deformable_conv_v1 +hinge_loss +determinant +conv2d_transpose +memcpy_d2h +softsign +fake_quantize_dequantize_abs_max +broadcast_tensors +grid_sampler +fft_c2r +pyramid_hash +fake_quantize_dequantize_moving_average_abs_max +multi_dot +sequence_pool +transpose +top_k +dist +affine_grid +gaussian_random_batch_size_like +fake_channel_wise_dequantize_max_abs +reciprocal +sequence_mask +fill_diagonal_tensor +abs +partial_concat +elu +index_select +row_conv +cross +elementwise_mul +decayed_adagrad +bipartite_match +run_program +fake_quantize_moving_average_abs_max +mine_hard_examples +target_assign +lstm +truncated_gaussian_random +match_matrix_tensor +elementwise_div +kldiv_loss +cumsum +sum +proximal_adagrad +update_loss_scaling +shard_index +selu +mean +gumbel_softmax +sequence_pad +tree_conv +assign +flatten_contiguous_range +tril_triu +brelu +celu +reduce_mean +sinh +rank_loss +reduce_max +fusion_gru +fill_zeros_like2 +expm1 +squared_l2_norm +elementwise_sub +margin_rank_loss +faster_tokenizer +relu +is_empty +reduce_all +edit_distance +bmm +yolo_box +soft_relu +density_prior_box +eye +swish +cross_entropy +dpsgd +cholesky +batch_fc +nearest_interp +gather +trilinear_interp_v2 +box_clip +isnan_v2 +softmax +conv2d_fusion +fused_batch_norm_act +get_float_status +index_sample +elementwise_min +logical_not +collect_fpn_proposals +pixel_shuffle +thresholded_relu +polygon_box_transform +lookup_table_dequant +warpctc +fake_channel_wise_quantize_abs_max +dequantize_abs_max +svd +flip diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index c7c27dcc1d1508..ea9aae83ff1891 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -60,7 +60,7 @@ TEST(EagerUtils, AutoGradMeta) { std::vector autograd_metas = EagerUtils::multi_autograd_meta(&ets); std::vector unsafe_autograd_metas = - EagerUtils::unsafe_autograd_meta(&ets); + EagerUtils::unsafe_autograd_meta(ets); 
CHECK_NOTNULL(unsafe_autograd_metas[0]); CHECK_NOTNULL(unsafe_autograd_metas[1]); diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 28eefd62c5aa0a..be06bf9eb344ba 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -48,9 +48,9 @@ AutogradMeta* EagerUtils::unsafe_autograd_meta(const egr::EagerTensor& target) { } std::vector EagerUtils::unsafe_autograd_meta( - std::vector* targets) { + const std::vector& targets) { std::vector metas; - for (const egr::EagerTensor& t : *targets) { + for (const egr::EagerTensor& t : targets) { metas.push_back(unsafe_autograd_meta(t)); } return metas; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index f7e226a2aba36c..03f922e5bf9ba9 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -114,7 +114,7 @@ class EagerUtils { // This method will return an AutogradMeta pointer unsafely. static AutogradMeta* unsafe_autograd_meta(const egr::EagerTensor& target); static std::vector unsafe_autograd_meta( - std::vector* targets); + const std::vector& targets); template static bool ComputeRequireGrad(T trace_backward, Args&&... args) { From c0d5b7eceb04fbb043de846df184c52009f9f61e Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 2 Dec 2021 10:26:23 +0800 Subject: [PATCH 034/124] simplify_with_basic_ops_pass UT (#37704) * first commit * more uts * file name duplicated * timeout * Update CMakeLists.txt change TIMEOUT from 120 to 240 * Update CMakeLists.txt more time * Update CMakeLists.txt timeout * Update CMakeLists.txt 60s --- .../ir/simplify_with_basic_ops_pass.cc | 5 + .../unittests/ir/inference/CMakeLists.txt | 1 + ...t_simplify_with_basic_ops_pass_autoscan.py | 154 ++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc index b2b1a7515f0a50..2d60129165a60e 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -231,3 +232,7 @@ void SimplifyWithBasicOpsPass::ReplaceOutputVar(Node* op, Node* old_var, REGISTER_PASS(simplify_with_basic_ops_pass, paddle::framework::ir::SimplifyWithBasicOpsPass); +REGISTER_PASS_CAPABILITY(simplify_with_basic_ops_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "scale", 0)); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 4126e604cc1f63..f59f686e78a32d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -71,6 +71,7 @@ set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30) +set_tests_properties(test_simplify_with_basic_ops_pass_autoscan PROPERTIES TIMEOUT 60) if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py new file mode 100644 index 00000000000000..03e9feb418a82b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_simplify_with_basic_ops_pass_autoscan.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestSimplifyWithBasicOpsPassUpscale(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + #scale = draw(st.floats(min_value=0.01, max_value=1.0)) + #bias = draw(st.floats(min_value=0.01, max_value=2.0)) + #bias_after_scale = draw(st.booleans()) + fix_seed = draw(st.booleans()) + dropout_implementation = "upscale_in_train" + dropout_prob = draw(st.floats(min_value=0.0, max_value=1.0)) + seed = draw(st.integers(min_value=0, max_value=512)) + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=4)) + is_test = True + + dropout_op = OpConfig( + "dropout", + inputs={"X": ["input_data"]}, + outputs={"Out": ["dropout_output"]}, + fix_seed=fix_seed, + dropout_implementation=dropout_implementation, + dropout_prob=dropout_prob, + seed=seed, + is_test=is_test) + relu_op = OpConfig( + "relu", + inputs={"X": ["dropout_output"]}, + outputs={"Out": ["relu_out"]}) + ops = [dropout_op, relu_op] + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={"input_data": TensorConfig(shape=x_shape), }, + outputs=["relu_out"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_gpu=True) + yield config, ['relu'], (1e-5, 1e-5) + config = self.create_inference_config(use_gpu=False) + yield config, ['relu'], (1e-5, 1e-5) + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=4, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + yield config, ['relu'], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, + max_examples=30, + passes=["simplify_with_basic_ops_pass"], + min_success_num=30) + + +class TestSimplifyWithBasicOpsPassDowngrade(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + fix_seed = draw(st.booleans()) + dropout_implementation = "downgrade_in_infer" + dropout_prob = draw(st.floats(min_value=0.0, max_value=1.0)) + seed = draw(st.integers(min_value=0, max_value=512)) + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=4)) + is_test = True + + dropout_op = OpConfig( + "dropout", + inputs={"X": ["input_data"]}, + outputs={"Out": ["dropout_output"]}, + fix_seed=fix_seed, + dropout_implementation=dropout_implementation, + dropout_prob=dropout_prob, + seed=seed, + is_test=is_test) + relu_op = OpConfig( + "relu", + inputs={"X": ["dropout_output"]}, + outputs={"Out": ["relu_out"]}) + ops = [dropout_op, relu_op] + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={"input_data": TensorConfig(shape=x_shape), }, + outputs=["relu_out"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_gpu=True) + yield config, ['scale', 'relu'], (1e-5, 1e-5) + config = self.create_inference_config(use_gpu=False) + 
yield config, ['scale', 'relu'], (1e-5, 1e-5) + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=4, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + yield config, ['scale', 'relu'], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, + max_examples=30, + passes=["simplify_with_basic_ops_pass"], + min_success_num=30) + + +if __name__ == "__main__": + unittest.main() From ddf38a3f806cfae466a0643269548b84162bba54 Mon Sep 17 00:00:00 2001 From: WangXi Date: Thu, 2 Dec 2021 10:43:21 +0800 Subject: [PATCH 035/124] [fleet_executor] Add amplifier interceptor and 1F1B scheduler test (#37755) --- .../distributed/fleet_executor/CMakeLists.txt | 3 +- .../fleet_executor/amplifier_interceptor.cc | 82 ++++++++++++++++ .../fleet_executor/amplifier_interceptor.h | 43 +++++++++ .../distributed/fleet_executor/carrier.cc | 1 + .../fleet_executor/compute_interceptor.cc | 15 +-- .../fleet_executor/compute_interceptor.h | 13 ++- .../distributed/fleet_executor/task_node.h | 17 ++++ .../fleet_executor/test/CMakeLists.txt | 6 ++ .../interceptor_pipeline_long_path_test.cc | 94 +++++++++++++++++++ .../interceptor_pipeline_short_path_test.cc | 90 ++++++++++++++++++ 10 files changed, 352 insertions(+), 12 deletions(-) create mode 100644 paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc create mode 100644 paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h create mode 100644 paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc create mode 100644 paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 641110802f1fd3..b615088b3b1118 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -11,7 +11,7 @@ else() endif() cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc - interceptor.cc compute_interceptor.cc interceptor_message_service.cc message_bus.cc + interceptor.cc compute_interceptor.cc amplifier_interceptor.cc interceptor_message_service.cc message_bus.cc DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper ${BRPC_DEPS}) @@ -19,6 +19,7 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(compute_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.h PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(message_bus.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(fleet_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc new file mode 100644 index 00000000000000..7d71f8e7b2242e --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h" + +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace distributed { + +AmplifierInterceptor::AmplifierInterceptor(int64_t interceptor_id, + TaskNode* node) + : ComputeInterceptor(interceptor_id, node) { + run_per_steps_ = node->run_per_steps(); + run_at_offset_ = node->run_at_offset(); + reply_up_per_steps_ = node->reply_up_per_steps(); + send_down_per_steps_ = node->send_down_per_steps(); + + PADDLE_ENFORCE_GE( + run_per_steps_, 1, + platform::errors::InvalidArgument( + "run_per_steps must >= 1, but now is %ld", run_per_steps_)); + PADDLE_ENFORCE_GE( + run_at_offset_, 0, + platform::errors::InvalidArgument( + "run_at_offset must >= 0, but now is %ld", run_at_offset_)); + PADDLE_ENFORCE_LT(run_at_offset_, run_per_steps_, + platform::errors::InvalidArgument( + "run_at_offset must < run_per_steps, must now " + "run_at_offset=%ld run_per_steps=%ld", + run_at_offset_, run_per_steps_)); + PADDLE_ENFORCE_GE( + reply_up_per_steps_, 1, + platform::errors::InvalidArgument( + "reply_up_per_steps must >= 1, but now is %ld", reply_up_per_steps_)); + PADDLE_ENFORCE_GE(send_down_per_steps_, 1, + platform::errors::InvalidArgument( + "send_down_per_steps must >= 1, but now is %ld", + send_down_per_steps_)); +} + +void AmplifierInterceptor::RunOps() { + // run_per_steps_, run_at_offset_ + // 4, 0 --> run at step 0, 4, 8, 12 + // 4, 3 --> run at step 3, 7, 11, 15 + if ((step_ % run_per_steps_) == run_at_offset_) { + ComputeInterceptor::RunOps(); + } +} + +void AmplifierInterceptor::SendDataReadyToDownStream() { + // run multi times, send ready one times to downstream, that is + // input multi times, output one times + if (step_ % send_down_per_steps_ == 0) { + ComputeInterceptor::SendDataReadyToDownStream(); + } +} + +void AmplifierInterceptor::ReplyCompletedToUpStream() { + // run multi times, reply one times to upstream, that is + // input one times, output multi times + if (step_ % reply_up_per_steps_ == 0) { + ComputeInterceptor::ReplyCompletedToUpStream(); + } +} + +REGISTER_INTERCEPTOR(Amplifier, AmplifierInterceptor); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h new file mode 100644 index 00000000000000..776aa8d3e88db1 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" + +namespace paddle { +namespace distributed { + +class AmplifierInterceptor : public ComputeInterceptor { + public: + AmplifierInterceptor(int64_t interceptor_id, TaskNode* node); + + private: + void RunOps() override; + void SendDataReadyToDownStream() override; + void ReplyCompletedToUpStream() override; + + int64_t run_per_steps_{1}; + int64_t run_at_offset_{0}; + + // one input produces multi times output + int64_t reply_up_per_steps_{1}; + // one output need multi times input + int64_t send_down_per_steps_{1}; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 73f22592dc3a75..55878a1000ec45 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -24,6 +24,7 @@ namespace paddle { namespace distributed { USE_INTERCEPTOR(Compute); +USE_INTERCEPTOR(Amplifier); void Carrier::Init(std::shared_ptr runtime_graph, framework::Scope* root_scope, diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 3d4078c932f702..09275dc10a136d 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -160,15 +160,18 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { } } +void ComputeInterceptor::RunOps() { + VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops."; + for (auto op : node_->ops()) { + op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_); + } +} + void ComputeInterceptor::Run() { while (IsInputReady() && CanWriteOutput() && !ShouldReset()) { VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running"; - // step_ %= node_->max_run_times(); - for (auto op : node_->ops()) { - auto* scope = microbatch_scopes_[step_ % node_->max_run_times()]; - op->Run(*scope, place_); - } + RunOps(); ++step_; // send to downstream and increase buff used @@ -176,7 +179,7 @@ void ComputeInterceptor::Run() { // reply to upstream and decrease ready data ReplyCompletedToUpStream(); // Try to stop Carrier - if (step_ % node_->max_run_times() == 0 && is_last_) { + if (is_last_ && (step_ % node_->max_run_times() == 0)) { StopCarrier(); } } diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index 8ed443ca971fb1..ae253f844aab4e 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -25,6 +25,14 @@ class ComputeInterceptor : public Interceptor { public: ComputeInterceptor(int64_t interceptor_id, TaskNode* node); + protected: + virtual void RunOps(); + virtual void SendDataReadyToDownStream(); + virtual void ReplyCompletedToUpStream(); + + int64_t step_{0}; + + private: void PrepareDeps(); void 
IncreaseReady(int64_t up_id); @@ -33,19 +41,14 @@ class ComputeInterceptor : public Interceptor { bool CanWriteOutput(); bool ShouldReset(); - void SendDataReadyToDownStream(); - void ReplyCompletedToUpStream(); - void Run(); void Compute(const InterceptorMessage& msg); void ReceivedStop(int64_t up_id); void TryStop(); - private: bool is_source_{false}; bool is_last_{false}; - int64_t step_{0}; // upstream_id-->(max_ready_size, ready_size) std::map> in_readys_{}; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 8f4f9d80c42a58..762b46d6230ee0 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -44,12 +44,22 @@ class TaskNode final { int32_t role() const { return role_; } int64_t max_run_times() const { return max_run_times_; } int64_t max_slot_nums() const { return max_slot_nums_; } + int64_t run_per_steps() const { return run_per_steps_; } + int64_t run_at_offset() const { return run_at_offset_; } + int64_t reply_up_per_steps() const { return reply_up_per_steps_; } + int64_t send_down_per_steps() const { return send_down_per_steps_; } const std::unordered_set& upstream() const { return upstream_; } const std::unordered_set& downstream() const { return downstream_; } const std::string& type() const { return type_; } const paddle::framework::ProgramDesc& program() const { return program_; } const std::vector& ops() const { return ops_; } + void SetRunPerSteps(int64_t value) { run_per_steps_ = value; } + void SetRunAtOffset(int64_t value) { run_at_offset_ = value; } + void SetReplyUpPerSteps(int64_t value) { reply_up_per_steps_ = value; } + void SetSendDownPerSteps(int64_t value) { send_down_per_steps_ = value; } + void SetType(const std::string& type) { type_ = type; } + bool AddUpstreamTask(int64_t task_id); bool AddDownstreamTask(int64_t task_id); std::string DebugString() const; @@ -76,6 +86,13 @@ class TaskNode final { int64_t max_run_times_; int64_t max_slot_nums_; + int64_t run_per_steps_{1}; + int64_t run_at_offset_{0}; + // one input produces multi times output + int64_t reply_up_per_steps_{1}; + // one output need multi times input + int64_t send_down_per_steps_{1}; + std::string type_; }; diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt index b0f00d70584768..d4587b90c87f3d 100644 --- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt @@ -4,6 +4,12 @@ cc_test(interceptor_ping_pong_test SRCS interceptor_ping_pong_test.cc DEPS fleet set_source_files_properties(compute_interceptor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(compute_interceptor_test SRCS compute_interceptor_test.cc DEPS fleet_executor ${BRPC_DEPS}) +set_source_files_properties(interceptor_pipeline_short_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(interceptor_pipeline_short_path_test SRCS interceptor_pipeline_short_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) + +set_source_files_properties(interceptor_pipeline_long_path_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(interceptor_pipeline_long_path_test SRCS interceptor_pipeline_long_path_test.cc DEPS fleet_executor ${BRPC_DEPS}) + set_source_files_properties(compute_interceptor_run_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) 
cc_test(compute_interceptor_run_op_test SRCS compute_interceptor_run_op_test.cc DEPS fleet_executor ${BRPC_DEPS} op_registry fill_constant_op elementwise_add_op scope device_context) diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc new file mode 100644 index 00000000000000..b3fdb0b7adff01 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +void LinkNodes(const std::vector& nodes) { + size_t size = nodes.size(); + if (size <= 1) return; + + { // i = 0 + TaskNode* now = nodes[0]; + TaskNode* next = nodes[1]; + now->AddDownstreamTask(next->task_id()); + } + { // i = size - 1 + TaskNode* prev = nodes[size - 2]; + TaskNode* now = nodes[size - 1]; + now->AddUpstreamTask(prev->task_id()); + } + + for (size_t i = 1; i < size - 1; ++i) { + TaskNode* prev = nodes[i - 1]; + TaskNode* now = nodes[i]; + TaskNode* next = nodes[i + 1]; + + now->AddUpstreamTask(prev->task_id()); + now->AddDownstreamTask(next->task_id()); + } +} + +TEST(AmplifierInterceptor, Amplifier) { + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); + msg_bus.Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}}, + {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); + + int64_t micro_steps = 3; + + // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* node_a = new TaskNode(0, 0, 0, 1, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 1, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 1, 0); + TaskNode* node_d = new TaskNode(0, 0, 3, 1, 0); + TaskNode* node_e = new TaskNode(0, 0, 4, 1, 0); + TaskNode* node_f = new TaskNode(0, 0, 5, 1, 0); + + // a->b->c->d->e->f + LinkNodes({node_a, node_b, node_c, node_d, node_e, node_f}); + + // LR->b(1:3)->F->B->e(3:1)->U + node_b->SetReplyUpPerSteps(micro_steps); + node_e->SetSendDownPerSteps(micro_steps); + + carrier.SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a)); + carrier.SetInterceptor(1, InterceptorFactory::Create("Amplifier", 1, node_b)); + carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier.SetInterceptor(3, InterceptorFactory::Create("Compute", 3, node_d)); + carrier.SetInterceptor(4, InterceptorFactory::Create("Amplifier", 4, node_e)); + carrier.SetInterceptor(5, InterceptorFactory::Create("Compute", 5, node_f)); + + carrier.SetCreatingFlag(false); + + // start + InterceptorMessage msg; + 
msg.set_message_type(DATA_IS_READY); + msg.set_src_id(-1); + msg.set_dst_id(0); + carrier.EnqueueInterceptorMessage(msg); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc new file mode 100644 index 00000000000000..db42135040ae74 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/distributed/fleet_executor/carrier.h" +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/message_bus.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" + +namespace paddle { +namespace distributed { + +void LinkNodes(const std::vector& nodes) { + size_t size = nodes.size(); + if (size <= 1) return; + + { // i = 0 + TaskNode* now = nodes[0]; + TaskNode* next = nodes[1]; + now->AddDownstreamTask(next->task_id()); + } + { // i = size - 1 + TaskNode* prev = nodes[size - 2]; + TaskNode* now = nodes[size - 1]; + now->AddUpstreamTask(prev->task_id()); + } + + for (size_t i = 1; i < size - 1; ++i) { + TaskNode* prev = nodes[i - 1]; + TaskNode* now = nodes[i]; + TaskNode* next = nodes[i + 1]; + + now->AddUpstreamTask(prev->task_id()); + now->AddDownstreamTask(next->task_id()); + } +} + +TEST(AmplifierInterceptor, Amplifier) { + Carrier& carrier = Carrier::Instance(); + MessageBus& msg_bus = MessageBus::Instance(); + msg_bus.Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}}, {{0, ""}}, ""); + + int64_t micro_steps = 3; + + // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* node_a = + new TaskNode(0, 0, 0, micro_steps, 0); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); + TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); + TaskNode* node_d = new TaskNode(0, 0, 3, micro_steps, 0); + + // a->b->c->d + LinkNodes({node_a, node_b, node_c, node_d}); + + node_a->SetRunPerSteps(micro_steps); + node_d->SetRunPerSteps(micro_steps); + node_d->SetRunAtOffset(micro_steps - 1); + + carrier.SetInterceptor(0, InterceptorFactory::Create("Amplifier", 0, node_a)); + carrier.SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); + carrier.SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); + carrier.SetInterceptor(3, InterceptorFactory::Create("Amplifier", 3, node_d)); + + carrier.SetCreatingFlag(false); + + // start + InterceptorMessage msg; + msg.set_message_type(DATA_IS_READY); + msg.set_src_id(-1); + msg.set_dst_id(0); + carrier.EnqueueInterceptorMessage(msg); +} + +} // namespace distributed +} // namespace paddle From 85e5ab2e0596333ef922a9ed808b0586ee2b08e3 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 2 Dec 2021 
11:01:07 +0800 Subject: [PATCH 036/124] [NPU] add int64 support for scatter op (#37440) * [NPU] add int64 support for scatter op * [NPU] delete debug codes * [NPU] optimize codes --- paddle/fluid/operators/scatter_op_npu.cc | 53 +++++++++++++++---- .../unittests/npu/test_scatter_op_npu.py | 31 +++++++++-- 2 files changed, 71 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index de368e6e802193..8d92ea41665135 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -48,18 +48,49 @@ class ScatterNPUKernel : public framework::OpKernel { index = &tmp_tensor; } - auto stream = - ctx.template device_context() - .stream(); + const auto& dev_ctx = + ctx.template device_context(); + auto op_func_update = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = + NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + auto op_func_add = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = + NpuOpRunner("TensorScatterAdd", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; if (overwrite) { - const auto& runner_update = NpuOpRunner( - "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); - runner_update.Run(stream); + if (x->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter( + {*x, *index, *updates}, {*out}, {}, dev_ctx, op_func_update, + {framework::proto::VarType::INT32, framework::proto::VarType::INT32, + framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner_update = NpuOpRunner( + "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); + runner_update.Run(dev_ctx.stream()); + } } else { - const auto& runner_add = - NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); - runner_add.Run(stream); + if (x->type() == framework::proto::VarType::INT64) { + NpuOpRunner::TypeAdapter( + {*x, *index, *updates}, {*out}, {}, dev_ctx, op_func_add, + {framework::proto::VarType::INT32, framework::proto::VarType::INT32, + framework::proto::VarType::INT32}, + {framework::proto::VarType::INT32}); + } else { + const auto& runner_add = + NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); + runner_add.Run(dev_ctx.stream()); + } } } }; @@ -70,6 +101,10 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( scatter, ops::ScatterNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ScatterNPUKernel, +#endif + ops::ScatterNPUKernel, ops::ScatterNPUKernel); #endif diff --git a/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py index c05b53d9a48621..c353654641932e 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py @@ -27,7 +27,7 @@ SEED = 2021 -class TestCast1(OpTest): +class TestCast1_FP32(OpTest): def setUp(self): self.set_npu() self.op_type = "scatter" @@ -50,7 +50,7 @@ def test_check_output(self): self.check_output_with_place(self.place) -class TestCast2(OpTest): +class TestCast_INT32(OpTest): def setUp(self): self.set_npu() self.op_type = "scatter" @@ -73,7 +73,7 @@ def test_check_output(self): self.check_output_with_place(self.place) -class 
TestCast3(OpTest): +class TestCast2_FP32(OpTest): def setUp(self): self.set_npu() self.op_type = "scatter" @@ -96,7 +96,7 @@ def test_check_output(self): self.check_output_with_place(self.place) -class TestCast4(OpTest): +class TestCast3_FP32(OpTest): def setUp(self): self.set_npu() self.op_type = "scatter" @@ -120,5 +120,28 @@ def test_check_output(self): self.check_output_with_place(self.place) +class TestCast_INT64(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "scatter" + self.place = paddle.NPUPlace(0) + + ref_np = np.ones((3, 2)).astype("int64") + index_np = np.array([1]).astype("int32") + updates_np = np.zeros((1, 2)).astype("int64") + + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + self.attrs = {'overwrite': True} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + if __name__ == '__main__': unittest.main() From e8c6c7dfe5a7746a12e02a9df0bca4b3ddc08ca5 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Thu, 2 Dec 2021 11:02:28 +0800 Subject: [PATCH 037/124] cinn_compiler add RemoveIdentity pass after Decomposer, test=develop (#37738) --- paddle/fluid/framework/paddle2cinn/cinn_compiler.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 360c9270782083..7fc8eff3d31c9e 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -193,6 +193,8 @@ std::unique_ptr CinnCompiler::CompileGraph( CinnGraphSymbolization symbol{compiled_num, graph, target, input_tensors}; auto frontend_program = symbol(); ProgramPass::Apply(&frontend_program, target, {"Decomposer"}); + auto fetch_ids = symbol.GetFetchIds(); + ::cinn::frontend::ApplyPass(&frontend_program, fetch_ids, "RemoveIdentity"); auto cinn_graph = std::make_shared<::cinn::hlir::framework::Graph>( frontend_program, target); VLOG(1) << "-- The " << compiled_num << "-th compilation (" @@ -201,7 +203,6 @@ std::unique_ptr CinnCompiler::CompileGraph( ApplyPass(cinn_graph.get(), "OpFusion"); auto scope = BuildScope(target, cinn_graph); - auto fetch_ids = symbol.GetFetchIds(); VLOG(4) << "All fetch var ids in CINN: " << string::join_strings(fetch_ids, ','); From 29ebf621ea5a9d717993c2bc42d6e37fdfed5fc3 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Thu, 2 Dec 2021 11:05:18 +0800 Subject: [PATCH 038/124] fix typos: is_integer in attribute.py (#37749) * fix typos: is_integer in attribute.py * add more test cases for fft --- python/paddle/fft.py | 22 ++++---- .../fluid/tests/unittests/fft/test_fft.py | 53 +++++++++++++++++++ python/paddle/tensor/attribute.py | 2 +- 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/python/paddle/fft.py b/python/paddle/fft.py index a62e502203b631..305eb1e23181b1 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -15,7 +15,7 @@ from typing import Sequence import numpy as np import paddle -from .tensor.attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype +from .tensor.attribute import is_complex, is_floating_point, is_integer, _real_to_complex_dtype, _complex_to_real_dtype from .fluid.framework import in_dygraph_mode from . 
import _C_ops from .fluid.data_feeder import check_variable_and_dtype @@ -196,7 +196,7 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): """ - if is_interger(x) or is_floating_point(x): + if is_integer(x) or is_floating_point(x): return fft_r2c( x, n, axis, norm, forward=True, onesided=False, name=name) else: @@ -260,7 +260,7 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): # 0.14285714+6.25898038e-01j] """ - if is_interger(x) or is_floating_point(x): + if is_integer(x) or is_floating_point(x): return fft_r2c( x, n, axis, norm, forward=False, onesided=False, name=name) else: @@ -521,7 +521,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] """ - if is_interger(x) or is_floating_point(x): + if is_integer(x) or is_floating_point(x): return fftn_r2c( x, s, axes, norm, forward=True, onesided=False, name=name) else: @@ -585,7 +585,7 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): # [ 0.33333333+0.j -0.16666667-0.28867513j -0.16666667+0.28867513j]] """ - if is_interger(x) or is_floating_point(x): + if is_integer(x) or is_floating_point(x): return fftn_r2c( x, s, axes, norm, forward=False, onesided=False, name=name) else: @@ -1355,7 +1355,7 @@ def ifftshift(x, axes=None, name=None): # internal functions def fft_c2c(x, n, axis, norm, forward, name): - if is_interger(x): + if is_integer(x): x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) elif is_floating_point(x): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) @@ -1388,7 +1388,7 @@ def fft_c2c(x, n, axis, norm, forward, name): def fft_r2c(x, n, axis, norm, forward, onesided, name): - if is_interger(x): + if is_integer(x): x = paddle.cast(x, paddle.get_default_dtype()) _check_normalization(norm) axis = axis if axis is not None else -1 @@ -1425,7 +1425,7 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): def fft_c2r(x, n, axis, norm, forward, name): - if is_interger(x): + if is_integer(x): x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) elif is_floating_point(x): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) @@ -1464,7 +1464,7 @@ def fft_c2r(x, n, axis, norm, forward, name): def fftn_c2c(x, s, axes, norm, forward, name): - if is_interger(x): + if is_integer(x): x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) elif is_floating_point(x): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) @@ -1512,7 +1512,7 @@ def fftn_c2c(x, s, axes, norm, forward, name): def fftn_r2c(x, s, axes, norm, forward, onesided, name): - if is_interger(x): + if is_integer(x): x = paddle.cast(x, paddle.get_default_dtype()) _check_normalization(norm) if s is not None: @@ -1567,7 +1567,7 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): def fftn_c2r(x, s, axes, norm, forward, name): - if is_interger(x): + if is_integer(x): x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) elif is_floating_point(x): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py index 0ef7a1e939e022..7ee5a04ece496b 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py @@ -120,6 +120,33 @@ def test_fft(self): atol=ATOL.get(str(self.x.dtype)))) +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, -1, 'backward'), + 
('test_x_complex', rand_x( + 5, complex=True), None, -1, + 'backward'), ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), 11, -1, + 'backward'), ('test_n_smaller_than_input_length', rand_x( + 5, min_dim_len=5, complex=True), 3, -1, 'backward'), + ('test_axis_not_last', rand_x(5), None, 3, 'backward'), + ('test_norm_forward', rand_x(5), None, 3, 'forward'), + ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) +class TestIfft(unittest.TestCase): + def test_fft(self): + """Test ifft with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + self.assertTrue( + np.allclose( + scipy.fft.ifft(self.x, self.n, self.axis, self.norm), + paddle.fft.ifft( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype)))) + + @place(DEVICES) @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ ('test_n_nagative', rand_x(2), -1, -1, 'backward', ValueError), @@ -230,6 +257,32 @@ def test_fftn(self): atol=ATOL.get(str(self.x.dtype))) +@place(DEVICES) +@parameterize( + (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), + [('test_x_float64', rand_x(5, np.float64), None, None, 'backward'), + ('test_x_complex128', rand_x( + 5, complex=True), None, None, + 'backward'), ('test_n_grater_input_length', rand_x( + 5, max_dim_len=5), (6, 6), (1, 2), 'backward'), ( + 'test_n_smaller_input_length', rand_x( + 5, min_dim_len=5, complex=True), (3, 3), (1, 2), 'backward'), + ('test_axis_not_default', rand_x(5), None, (1, 2), + 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), + ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) +class TestIFftn(unittest.TestCase): + def test_ifftn(self): + """Test ifftn with norm condition + """ + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + scipy.fft.ifftn(self.x, self.n, self.axis, self.norm), + paddle.fft.ifftn( + paddle.to_tensor(self.x), self.n, self.axis, self.norm), + rtol=RTOL.get(str(self.x.dtype)), + atol=ATOL.get(str(self.x.dtype))) + + @place(DEVICES) @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm'), [ ('test_x_complex128', diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 8d8c2a83de1dbd..44b34b3e2d67ea 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -60,7 +60,7 @@ def is_floating_point(x): return is_fp_dtype -def is_interger(x): +def is_integer(x): dtype = x.dtype is_int_dtype = (dtype == core.VarDesc.VarType.UINT8 or dtype == core.VarDesc.VarType.INT8 or From 20e19776b00616f8263ac1fc72957fbe966a960a Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Thu, 2 Dec 2021 11:18:51 +0800 Subject: [PATCH 039/124] Add dygraph sharding stage2 (#37707) --- .../sharding_optimizer_stage2.py | 6 - .../meta_parallel/sharding/sharding_stage2.py | 505 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/dygraph_sharding_stage2.py | 204 +++++++ .../unittests/test_dygraph_sharding_stage2.py | 31 ++ 5 files changed, 743 insertions(+), 6 deletions(-) create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py 
b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index 9595896188ba5e..ba1b5222394e2d 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -68,7 +68,6 @@ def __init__(self, broadcast_fp16=False, offload=False, device="gpu", - accumulation_steps=None, **kw): super().__init__(optim._learning_rate, params, kw) @@ -86,7 +85,6 @@ def __init__(self, self._optim = optim self._local_params = params self._default_device = device - self._accumulation_steps = accumulation_steps assert group is not None, "Distributed communication group is must be gived" self.group = group @@ -136,10 +134,6 @@ def segment_params(self): def local_params(self): return self._local_params - @property - def accumulation_steps(self): - return self._accumulation_steps - @property def param2rank(self): """Map the params to the rank which owns them""" diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py new file mode 100644 index 00000000000000..8ac4a7e99c7d71 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -0,0 +1,505 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#Taken and modified for fairscale from: +# https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/data_parallel/sharded_ddp.py +#Commit: 8acbec718f3c70a6b9785470bb9e05cd84fc3f8e + +import os +import contextlib +import logging +import time +import functools +import numpy as np +from itertools import chain +from functools import reduce +from collections import deque + +import paddle +from paddle import nn +import paddle.distributed as dist + +from ...utils.internal_storage import GradStorage +from .sharding_utils import Taskflow, Type + + +def _trainable(param): + return param.trainable + + +class ShardingStage2(nn.Layer): + """ + A wrapper for Sharding Stage2 Layer in Dygraph. + .. warning: ShardingStage2 encapsulates the layer strategy and integrates it into the nn.Layer. + .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + # TODO (Baibaifan) + # Feature Notes:: + # 1. Unified memory for param and param.grad to InternalStorage. + # 2. Divide param.grad according to rank to centrally apply for and release GPU memory. + # 3. Dynamically adjust training parameters and models。 + # 4. Support offload function. + # 5. Support the establishment of independent communication groups. 
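+    #
+    # A minimal usage sketch (for illustration only; it mirrors the unit test
+    # dygraph_sharding_stage2.py added later in this patch, and `model`,
+    # `optimizer` and `group` are placeholders):
+    #
+    #   optimizer = ShardingOptimizerStage2(
+    #       params=model.parameters(), optim=optimizer, group=group)
+    #   model = ShardingStage2(model, optimizer, group=group)
+    #   ...
+    #   loss.backward()          # grads are reduced to their owner ranks
+    #   optimizer.step()         # the sharded optimizer updates its owned params
+    #   model.clear_gradients()  # zero the grads held by the current rank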
+ + def __init__( + self, + layer, + sharding_optimizer, + group, + sync_buffers=False, + pertrain_sync_models=True, + buffer_max_size=2**23, #8MB + auto_refresh_trainable=True, + device="gpu", + use_grad_storage=True, + accumulate_grads=False): + super().__init__() + + # training options + self._layer = layer + self._sharding_optimizers = [sharding_optimizer] if not isinstance( + sharding_optimizer, list) else sharding_optimizer + self._sync_buffers = sync_buffers + self._auto_refresh_trainable = auto_refresh_trainable + + # Gradient accumulation, Gradient flip + self._accumulate_grads = accumulate_grads + + # Communication related attributes + assert group is not None, "Distributed communication group is must be gived" + self._group = group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1" + self._rank = self._group.rank + self._global_root_rank = 0 # picking rank 0 as the reference + self._global_ranks = self._group.ranks + self._default_device = device + + # Global statistical parameters + self._all_params = list( + chain( + * [optim.local_params for optim in self._sharding_optimizers])) + self._trainable_params = [] + self._grad_reduced = [] + self._trainable_param2rank = {} + self._trainable_param2align = {} + self._trainable_mask = list(map(_trainable, self._all_params)) + self._param_grads = [] + + # Set grad storage size & Display param sizes and model sizes + model_size = sum( + [np.prod(p.shape) for p in self._layer.parameters()]).item() + self._buffer_max_size = self._rank_buffer_size(buffer_max_size, + model_size) + self._use_grad_storage = use_grad_storage + self._grad_storages = {} # {dtype: {rank: GradStorage}} + self._has_grad_storage = [] + self._grad_storage_list = [] + + # Set backward pass hooks + self._bw_hooks = [] + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + # Set tasks flow + self._tasks_flow = deque() + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage2 layer. + - Fresh trainable params or rebuild grad storage + - Sync layer's buffer params + - Clear all flags states + - Forward for origin layers + """ + + # Whether to need to reset trainable parameters + needs_fresh = len(self._bw_hooks) == 0 and self.training + + if self._auto_refresh_trainable: + needs_fresh |= self._detect_train_change() + + # Front hook + self._init_internal_storage(needs_fresh) + + # Sync layer's buffers state + if self._sync_buffers: + self.__sync_buffers() + + # Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def clear_gradients(self): + """ + Set zero to the gradient of the optimizer's current rank trainable parameters. + """ + # Release grad storages + for dtype in self._grad_storages.keys(): + if self._rank in self._grad_storages[dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.zero_() + + # Release params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.clear_gradient() + + def grad_scale(self): + """ + Before the gradient accumulation, scale the gradient. 
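+
+        For example, with a communication group of 2 ranks
+        ``_world_size_scaling`` is 0.5, so scaling turns the summed
+        (reduced) gradients into their average before ``optimizer.step()``
+        runs.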
+ """ + # Scale grad storages + for dtype in self._grad_storages.keys(): + if self._rank in self._grad_storages[dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.scale_( + scale=self._world_size_scaling) + + # Scale params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) + param._reset_grad_inplace_version() + + def _init_internal_storage(self, needs_fresh): + """ + Judge Fresh trainable params or rebuild grad storage. + """ + if needs_fresh: + self._fresh_trainable() + else: + self._build_grad_storages() + + # Clear all flags state + self._clear_counters() + + def to(self, device=None, dtype=None, blocking=True): + """ + Synchronously or asynchronously convert the data type of the layer, the device is not supported now. + """ + assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync" + + def _fresh_trainable(self): + """ Whether to update training parameters. """ + + # Make sure that this is not done while gradients are waiting to be reduced (if no_sync context for instance) + if reduce(lambda x, y: x or y, self._grad_reduced, False): + logging.warning("Grads waiting to be reduced.") + + self._trainable_params = list( + filter(lambda x: x.trainable, self._all_params)) + self._trainable_params.sort(key=lambda x: np.prod(x.shape)) + + self._trainable_param2rank = {} + for optim in self._sharding_optimizers: + # Need to be wrappered for Sharding Stage2 Optimizer + if len(optim.param_storages.keys()) == 0: + optim.update_opt_status() + + # Get the parameters split by the optimizer according to rank + for per_rank_params in optim.dtype_rank_params.values( + ): # all the params from all ranks + for params in per_rank_params: + for param in filter(lambda x: x.trainable, params): + self._trainable_param2rank[ + param.name] = optim.param2rank[param.name] + self._trainable_param2align[ + param.name] = optim._param2align[param.name] + + self._setup_use_grad_storage() + + # wait next func hook support + self._setup_backward_hooks() + + @paddle.no_grad() + def __sync_buffers(self): + """ + Sync all the param buffers from all ranks (exp: batch norm statistics). + """ + + for buffer in self._layer.buffers(include_sublayers=True): + dist.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + # Multi stream operation will be supported later + dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + @paddle.no_grad() + def _clear_counters(self): + """Reset all the grad reduce and call counters.""" + if self.training: + self._grad_reduced = [True for _ in self._trainable_params] + + if self._use_grad_storage: + for grad_storage in self._grad_storage_list: + grad_storage.reset_checked_in() + + if not self._accumulate_grads: + self._grads_flipped = False + + def _get_reduce_fn(self, index, param, dst_rank): + """ + There are two ways to reduce gradient. + - 1. Do not use use_grad_storage or exceeded buffer_max_size will be reduced separately. + - 2. Use grad_storage Reduce the storage to get the full gradient from different ranks. 
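+        In both cases the gradient is reduced to ``dst_rank`` (the rank that
+        owns the parameter in the sharded optimizer), and the cleanup
+        callback releases the local copy on every non-owner rank.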
+ """ + + if not self._use_grad_storage or not self._has_grad_storage[index]: + # Direct reduction + @paddle.no_grad() + def reduce(*_): + # Skip gradient reduction, do not change status information + if self._grad_reduced[index]: + assert param.grad is not None, "Parameter gradient cannot be None" + + # Change reduce information + self._grad_reduced[index] = False + if not self._accumulate_grads: + param.grad.scale_(scale=self._world_size_scaling) + param._reset_grad_inplace_version() + + # Clear the gradient that does not belong to the current rank through the callback function + def cleanup(): + if dst_rank != self._rank: + param.clear_gradient(False) + + # Synchronize the reduce parameter gradient + self._tasks_flow.append( + Taskflow( + task=dist.reduce( + tensor=param.grad, + dst=dst_rank, + group=self._group, + use_calc_stream=True), + callback=cleanup)) + + # Multi stream operation will be supported later + dist.wait( + tensor=param.grad, + group=self._group, + use_calc_stream=True) + + # Clear the task flow and trigger callback to clear the redundant gradient + self._clear_task_flow() + + else: + # Buffer reduction + @paddle.no_grad() + def reduce(*_): + # Skip gradient reduction, do not change status information + if self._grad_reduced[index]: + assert param.grad is not None, "Parameter gradient cannot be None" + + # Change reduce information + self._grad_reduced[index] = False + grad_storage = self._grad_storages[param.dtype][dst_rank] + grad_storage.params_checked_in += 1 + + if grad_storage.all_checked_in: + assert grad_storage.buffer is not None + + # Normalize all ranks grad_storage + if not self._accumulate_grads: + grad_storage.buffer.scale_( + scale=self._world_size_scaling) + + # Clearing up the grad_storage buffer + def cleanup(): + if dst_rank != self._rank: + for p in grad_storage._params: + p.clear_gradient(False) + p._gradient_set_empty(False) + + grad_storage.buffer.value().get_tensor()._clear( + ) + + # Reduce the bucket + grad_storage.sent = True + self._tasks_flow.append( + Taskflow( + task=dist.reduce( + tensor=grad_storage.buffer, + dst=grad_storage.destination, + group=self._group, + use_calc_stream=True), + callback=cleanup)) + + # Multi stream operation will be supported later + dist.wait( + tensor=grad_storage.buffer, + group=self._group, + use_calc_stream=True) + + # Clear the task flow and trigger callback to clear the redundant gradient + self._clear_task_flow() + + return reduce + + def _setup_backward_hooks(self): + """ + Set the backward hook to synchronize the gradients of all rank by reduce group ranks. 
+ """ + + # Remove previous backward hooks + while len(self._bw_hooks) > 0: + self._bw_hooks.pop().remove() + + # Go through the parameters, attach the hook + self._grad_accs = [] + if not self.training: + return + + for index, param in enumerate(self._trainable_params): + dst_rank = self._trainable_param2rank[param.name] + + reduce_function = self._get_reduce_fn(index, param, dst_rank) + + self._bw_hooks.append( + param._register_backward_hook(reduce_function)) + + @paddle.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for t in self._layer.parameters(): + dist.broadcast( + t, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + # Multi stream operation will be supported later + dist.wait(tensor=t, group=self._group, use_calc_stream=True) + + def _setup_use_grad_storage(self): + """ + Integrate the parameters gradient into a continuous memory according to rank, and support the update of training parameters. + """ + + if not self._use_grad_storage: + return + + # According to parameters's numel sort, allocate memory of parameter gradient to continuous memory according to rank + self._grad_storages = {} + self._has_grad_storage = [False for _ in self._trainable_params] + + for index, param in enumerate(self._trainable_params): + dst_rank = self._trainable_param2rank[param.name] + + if param.dtype not in self._grad_storages.keys(): + self._grad_storages[param.dtype] = {} + + if dst_rank not in self._grad_storages[param.dtype].keys(): + self._grad_storages[param.dtype][dst_rank] = GradStorage( + self._buffer_max_size[param.dtype], + dtype=param.dtype, + device=self._default_device, + destination=dst_rank, + parm2align=self._trainable_param2align) + + # Criteria to decide whether this parameter is to be put in GradStorage + if self._grad_storages[param.dtype][dst_rank].can_add_grad_view( + param, self._trainable_param2align[param.name]): + self._grad_storages[param.dtype][dst_rank].add_grad( + param, self._trainable_param2align[param.name]) + self._has_grad_storage[index] = True + else: + self._param_grads.append(param.name) + print( + "Can not add param: {}, param's shape: {}, param align: {}, grad_storages fill: {}, ". + format(param.name, param.shape, self._trainable_param2align[ + param.name], self._grad_storages[param.dtype][dst_rank] + ._fill)) + + self._grad_storage_list = list( + chain(* [ + self._grad_storages[dtype].values() + for dtype in self._grad_storages.keys() + ])) + + def _clear_task_flow(self): + """Try to consume the previous tasks.""" + while len(self._tasks_flow) > 0: + task = self._tasks_flow.popleft() + if task.callback is not None: + task.callback() + + def _detect_train_change(self): + # Current trainable parameters + trainable_mask = list(map(_trainable, self._all_params)) + + # Whether parameters trainability changed + trainability_changed = trainable_mask != self._trainable_mask + + # The whole model is not trainable but we still have grad hooks + trainability_changed |= not self.training and len(self._bw_hooks) > 0 + + if trainability_changed: + logging.warning( + "Trainable params changed, because of eval/train mode or parameter freezing/unfreeze." + ) + self._trainable_mask = trainable_mask + + return trainability_changed + + def _build_grad_storages(self): + """ + Rebuild grad storages. 
+ """ + # Rebuild fp16/fp32 grad storages + for dtype in self._grad_storages.keys(): + for dst_rank, grad_storage in self._grad_storages[dtype].items(): + if dst_rank != self._rank: + grad_storage.manumal_relase() + grad_storage.rebuild() + + def _rank_buffer_size(self, buffer_max_size, model_size): + """ + Generate the minimum buffer size for each rank & Display param sizes and model sizes. + """ + + # Initialize buffer size + rank_buffer_size = {} + for shard_opt in self._sharding_optimizers: + if shard_opt.rank_buffer_size: + for dtype in shard_opt.rank_buffer_size.keys(): + sizes = max(shard_opt.rank_buffer_size[dtype].values()) + rank_buffer_size[dtype] = min(sizes, buffer_max_size) + + if Type.fp16.value in rank_buffer_size.keys(): + # FP16 GradStorage and model size + print( + "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======". + format(rank_buffer_size[Type.fp16.value] / 2**19, model_size / 2 + **19)) + if Type.fp32.value in rank_buffer_size.keys(): + # FP32 GradStorage and model size + print( + "====== FP32 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======". + format(rank_buffer_size[Type.fp32.value] / 2**18, model_size / 2 + **18)) + return rank_buffer_size diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 099dadd6173900..15f857f6087302 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -33,6 +33,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2) +list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) @@ -244,6 +245,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) + list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) @@ -1039,6 +1041,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py new file mode 100644 index 00000000000000..bc62d18c860226 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -0,0 +1,204 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import DygraphShardingOptimizer +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 + +seed = 2021 +epoch = 2 +batch_size = 32 + +strategy = fleet.DistributedStrategy() +strategy.hybrid_configs = { + "dp_degree": 2, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 1 +} +fleet.init(is_collective=True, strategy=strategy) + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(fluid.Layer): + def __init__(self, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(10000, 10000) + self._linear2 = Linear(10000, 10000) + self._linear3 = Linear(10000, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(): + def __reader__(): + for _ in range(100): + img = np.random.rand(10000).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, stage=1): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + use_pure_fp16=False, + all_test=False, + accumulate_grad=False): + if sharding_stage == 1: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_check_parallel_group() + else: + group = paddle.distributed.new_group([0, 1]) + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, stage=sharding_stage) + + if use_pure_fp16: + model, optimizer = paddle.amp.decorate( + models=model, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + if sharding_stage == 2: + optimizer = ShardingOptimizerStage2( + params=model.parameters(), optim=optimizer, group=group) + if all_test: + model = ShardingStage2( + model, optimizer, group=group, accumulate_grads=accumulate_grad) + else: + model = ShardingStage2(model, optimizer, group=group) + else: + optimizer = fleet.distributed_optimizer(optimizer) + model = fleet.distributed_model(model) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + + 
for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + with paddle.amp.auto_cast(enable=use_pure_fp16, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + avg_loss.backward() + + if accumulate_grad and batch_id == 2: + model.grad_scale() + optimizer.step() + model.clear_gradients() + return model.parameters() + + if not accumulate_grad: + optimizer.step() + + if sharding_stage == 2: + model.clear_gradients() + else: + optimizer.clear_grad() + + if all_test and batch_id == 2: + return model.parameters() + + if sharding_stage == 2: + model.to(device="gpu") + + return model.parameters() + + +def test_stage1_stage2(): + mlp = MLP() + state_dict = mlp.state_dict() + mlp1 = MLP() + mlp2 = MLP() + mlp3 = MLP() + mlp4 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + stage1_params = train_mlp(mlp, sharding_stage=1, use_pure_fp16=False) + stage2_params = train_mlp(mlp, sharding_stage=2, use_pure_fp16=False) + for i in range(len(stage1_params)): + np.testing.assert_allclose( + stage1_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6) + + stage2_params = train_mlp( + mlp3, sharding_stage=2, use_pure_fp16=True, all_test=True) + stage2_accumulate_grad = train_mlp( + mlp4, + sharding_stage=2, + use_pure_fp16=True, + all_test=True, + accumulate_grad=True) + for i in range(len(stage2_params)): + for j in range(len(stage2_accumulate_grad)): + if stage2_params[i].name == stage2_accumulate_grad[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage2_accumulate_grad[j].numpy(), + rtol=1e-6) + + return + + +if __name__ == '__main__': + test_stage1_stage2() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py new file mode 100644 index 00000000000000..c5cf8c5d5ed690 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage2(TestMultipleGpus): + + # check sharding logic as well as the accuracy with single mode + def test_dygraph_sharding_optimizer_stage2(self): + self.run_mnist_2gpu('dygraph_sharding_stage2.py') + + +if __name__ == "__main__": + unittest.main() From 67cb5476a7445087bc0161eb9b394abf6772ef2d Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 2 Dec 2021 11:29:10 +0800 Subject: [PATCH 040/124] [BugFix] Fix svd_op attrs typo (#37744) --- python/paddle/tensor/linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index c5bf19e83ded8a..f333b527db38f8 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1573,7 +1573,7 @@ def svd(x, full_matrices=False, name=None): outputs={'U': u, 'VH': vh, 'S': s}, - attr=attrs, ) + attrs=attrs, ) return u, s, vh From f6d485a85fef5fbce272450a574dc545016e47b4 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Thu, 2 Dec 2021 11:50:00 +0800 Subject: [PATCH 041/124] fix trainer_pass.py (#37779) --- .../fluid/incubate/fleet/parameter_server/ir/trainer_pass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 11fa70b70ba852..ff10c8ea097ad0 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -771,7 +771,7 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops): """ output_vars_no_grad = [] - for key in pre_op.output_names: + for key in op.output_names: for varname in op.output(key): if varname == "@EMPTY@": continue From 00dfebe8f7fd9ab66685bc40ae0a7a60b808f664 Mon Sep 17 00:00:00 2001 From: Jason Date: Thu, 2 Dec 2021 12:44:15 +0800 Subject: [PATCH 042/124] modify configure for pass stable project (#37760) * modify configure for pass stable project * modify configure for pass stable project --- .../tests/unittests/ir/inference/CMakeLists.txt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index f59f686e78a32d..58affe1b843fdd 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -47,9 +47,12 @@ if(WITH_MKLDNN) endforeach() endif() -foreach(target ${TEST_INFERENCE_IR_PASSES}) - py_test_modules(${target} MODULES ${target}) -endforeach() +if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) + foreach(target ${TEST_INFERENCE_IR_PASSES}) + py_test_modules(${target} MODULES ${target}) + endforeach() +endif() + if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120) @@ -74,8 +77,8 @@ set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30) set_tests_properties(test_simplify_with_basic_ops_pass_autoscan PROPERTIES TIMEOUT 60) if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) -set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 240) + 
set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 240) endif() if (WITH_MKLDNN) From cfd6a8fccdfa1256d069997148a06507c961497a Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 2 Dec 2021 13:08:52 +0800 Subject: [PATCH 043/124] [new-exec] fix the new variable name in DataTransfer (#37756) --- paddle/fluid/framework/new_executor/data_transfer.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 15e6b2a1ff9391..064dfa0170bdb7 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -137,7 +137,7 @@ std::shared_ptr TransferLayout(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); + auto* ptr = local_scope->Var(*new_var_name); auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); @@ -171,8 +171,8 @@ std::shared_ptr TransferDtype(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); - var_scope->SetVarDesc(var_name, nullptr); + auto* ptr = local_scope->Var(*new_var_name); + auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); @@ -211,7 +211,7 @@ std::shared_ptr TransferDevice(const std::string& var_name, // 1. Generate new_var_name and Initialize it *new_var_name = var_name + "_device_" + std::to_string(var_scope->VarSize() + 1); - auto* ptr = local_scope->Var(new_var_name); + auto* ptr = local_scope->Var(*new_var_name); auto var_type = var_scope->Var(var_name)->Type(); InitializeVariable(ptr, static_cast(var_type)); From bfb857796fd0e1de6605bc50b19864f93969c1c3 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Thu, 2 Dec 2021 14:00:21 +0800 Subject: [PATCH 044/124] optimize dygraph performance with move runtime import to begining (#37759) * optimize dygraph probl * refine code * fix convert dtype error * fix import datafeeder error --- python/paddle/fluid/data_feeder.py | 2 +- python/paddle/fluid/dygraph/base.py | 22 +++++++++++++++++- .../dygraph_to_static/program_translator.py | 23 +------------------ python/paddle/fluid/dygraph/layers.py | 3 +-- 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 52be7493cf229b..60f844b27bef1f 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -93,10 +93,10 @@ def check_type(input, input_name, expected_type, op_name, extra_message=''): if in_dygraph_mode(): return - from .dygraph.dygraph_to_static.program_translator import in_declarative_mode # NOTE: `in_declarative_mode` is used to determined whether this op is called under # @declarative in transformation from dygrah to static layer. We add VarBase in # expected_type to skip checking because varBase may be created and used in unusual way. + from .dygraph.base import in_declarative_mode # Need a better design to be fix this. 
if in_declarative_mode(): if not isinstance(expected_type, tuple): diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 460831f8745b31..f54a1629196a0c 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -33,6 +33,17 @@ 'enabled', 'to_variable' ] +# Flag that indicates whether running code under `@declarative` +_in_declarative_mode_ = False + + +def in_declarative_mode(): + """ + Return a bool value that indicates whether running code under `@declarative` + + """ + return _in_declarative_mode_ + def _switch_to_static_graph_(func): def __impl__(*args, **kwargs): @@ -45,6 +56,16 @@ def __impl__(*args, **kwargs): switch_to_static_graph = wrap_decorator(_switch_to_static_graph_) +@signature_safe_contextmanager +def _switch_declarative_mode_guard_(is_declarative=True): + + global _in_declarative_mode_ + original_val = _in_declarative_mode_ + _in_declarative_mode_ = is_declarative + yield + _in_declarative_mode_ = original_val + + @signature_safe_contextmanager def program_desc_tracing_guard(enable): tracer = framework._dygraph_tracer() @@ -63,7 +84,6 @@ def program_desc_tracing_guard(enable): @signature_safe_contextmanager def param_guard(parameters): - from paddle.fluid.dygraph.dygraph_to_static.program_translator import in_declarative_mode # Note: parameters is a reference of self._parameters or self._buffers if in_declarative_mode() and not framework.in_dygraph_mode() and parameters: origin_parameters = parameters.copy() diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index d5d0e8ab88b869..19479a190c3b9e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -573,28 +573,6 @@ def function_spec(self): return self._function_spec -# Flag that indicates whether running code under `@declarative` -_in_declarative_mode_ = False - - -def in_declarative_mode(): - """ - Return a bool value that indicates whether running code under `@declarative` - - """ - return _in_declarative_mode_ - - -@signature_safe_contextmanager -def _switch_declarative_mode_guard_(is_declarative=True): - - global _in_declarative_mode_ - original_val = _in_declarative_mode_ - _in_declarative_mode_ = is_declarative - yield - _in_declarative_mode_ = original_val - - def _verify_init_in_dynamic_mode(class_instance): """ Verifies the instance is initialized in dynamic mode. @@ -658,6 +636,7 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, startup_program.random_seed = framework.default_startup_program( ).random_seed + from paddle.fluid.dygraph.base import _switch_declarative_mode_guard_ with framework.program_guard(main_program, startup_program): with _switch_declarative_mode_guard_(is_declarative=True): # 1. 
Adds `fluid.data` layers for input if needed diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 11812398ba4550..0373c1e63da815 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -31,7 +31,7 @@ from paddle.fluid import core from .layer_object_helper import LayerObjectHelper from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder -from .base import program_desc_tracing_guard, param_guard +from .base import program_desc_tracing_guard, param_guard, in_declarative_mode from paddle.fluid import framework from ..param_attr import ParamAttr from paddle.fluid.executor import Executor, global_scope @@ -917,7 +917,6 @@ def __call__(self, *inputs, **kwargs): # In case of ControlFlow, true_fn and false_fn will contain # parameters that may not trigger logic of `Operator` to create # them. we add this to make sure all parameters is available. - from paddle.fluid.dygraph.dygraph_to_static.program_translator import in_declarative_mode if in_declarative_mode() and not framework.in_dygraph_mode(): with param_guard(self._parameters), param_guard(self._buffers): From 0074a3c9e3bfd642f7d645707c3d3816096cca82 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Thu, 2 Dec 2021 15:45:43 +0800 Subject: [PATCH 045/124] [Fleet Executor] Refine runtime graph (#37703) --- .../distributed/fleet_executor/CMakeLists.txt | 2 +- .../fleet_executor/fleet_executor.cc | 6 +- .../fleet_executor/fleet_executor.h | 1 - .../fleet_executor/fleet_executor_desc.proto | 2 +- .../distributed/fleet_executor/interceptor.cc | 31 +------ .../fleet_executor/runtime_graph.cc | 16 +++- .../distributed/fleet_executor/task_node.cc | 7 ++ .../distributed/fleet_executor/task_node.h | 2 + paddle/fluid/pybind/bind_fleet_executor.cc | 4 +- python/paddle/fluid/executor.py | 87 ++++++++++--------- 10 files changed, 78 insertions(+), 80 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index b615088b3b1118..4ef5e77aad1110 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -12,7 +12,7 @@ endif() cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc interceptor.cc compute_interceptor.cc amplifier_interceptor.cc interceptor_message_service.cc message_bus.cc - DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper + DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper op_registry ${BRPC_DEPS}) if(WITH_DISTRIBUTE) diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index e84e37a58eb5cb..3479157de5c454 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -31,9 +31,7 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) { "Error occurs while parsing string to proto")); } -FleetExecutor::~FleetExecutor() { - // Destroy Executor -} +FleetExecutor::~FleetExecutor() { root_scope_->DropKids(); } void FleetExecutor::Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, @@ -113,8 +111,6 @@ void FleetExecutor::Run() { carrier_instance.Start(); } -void FleetExecutor::Release() { root_scope_->DropKids(); } - void 
FleetExecutor::CopyParameters(int microbatch_id, const framework::ProgramDesc& program) { auto& global_block = program.Block(0); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index cee739506b7e62..ac857fb6c38a21 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -39,7 +39,6 @@ class FleetExecutor final { void Init(const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place); void Run(); - void Release(); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto index 1b12f1239dcbd7..0da98ab13b9ffe 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto @@ -21,7 +21,7 @@ message RankInfo { } message FleetExecutorDesc { - optional string grain = 1 [ default = "coarse" ]; + optional string strategy = 1 [ default = "Origin" ]; optional int64 cur_rank = 2 [ default = 0 ]; // Rank id of current processor repeated RankInfo cluster_info = 3; optional int32 dp_degree = 4 [ default = 1 ]; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.cc b/paddle/fluid/distributed/fleet_executor/interceptor.cc index 26927f34c6879b..dd7b89c4b81199 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/interceptor.cc @@ -40,34 +40,9 @@ void Interceptor::Join() { void Interceptor::RegisterMsgHandle(MsgHandle handle) { handle_ = handle; } void Interceptor::Handle(const InterceptorMessage& msg) { - if (handle_) { - handle_(msg); - } else { - VLOG(3) << "Interceptor is using default message handler. This handler is " - "only used for test purpose. 
Check whether you init interceptor " - "in the proper way."; - if (msg.message_type() == DATA_IS_READY) { - if (node_->role() != 2) { - VLOG(3) << "Fake handler is sending DATA_IS_READY message to: " - << interceptor_id_ + 1 << "."; - InterceptorMessage data_is_ready_msg; - data_is_ready_msg.set_message_type(DATA_IS_READY); - Send(interceptor_id_ + 1, data_is_ready_msg); - } else { - // NOTE: max run time is reach for last interceptor - StopCarrier(); - } - } else if (msg.message_type() == STOP) { - stop_ = true; - if (node_->role() != 2) { - VLOG(3) << "Fake handler is sending STOP message to: " - << interceptor_id_ + 1 << "."; - InterceptorMessage stop_msg; - stop_msg.set_message_type(STOP); - Send(interceptor_id_ + 1, stop_msg); - } - } - } + PADDLE_ENFORCE_NOT_NULL(handle_, platform::errors::PreconditionNotMet( + "Message handle is not registered.")); + handle_(msg); } void Interceptor::StopCarrier() { diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index b32db6c2294b80..21026ee3f973b7 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -100,11 +100,25 @@ std::vector RuntimeGraph::functionality_order = { RuntimeGraph::RuntimeGraph(const ProgramDesc& program, const FleetExecutorDesc& exe_desc) : exe_desc_(exe_desc) { - if (exe_desc.grain() == "coarse") { + if (exe_desc.strategy() == "1F1B") { SplitProgramBasedFunctionality(program); AssignTaskToIntercepter(); FakeDependence(); FakeRuntimeInfo(); + } else if (exe_desc.strategy() == "Origin") { + int64_t cur_rank = exe_desc_.cur_rank(); + int64_t max_run_times = exe_desc_.num_micro_batches(); + int64_t max_slot_nums = exe_desc_.num_slots(); + auto task_node = std::make_unique(program, cur_rank, + max_run_times, max_slot_nums); + task_node->SetType("Compute"); + task_nodes_.emplace_back(std::move(task_node)); + int64_t task_id = task_nodes_[0]->task_id(); + intercepter_id_to_rank_.insert({task_id, cur_rank}); + intercepter_id_to_node_.insert({task_id, task_nodes_[0].get()}); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Strategy %s is None of 1F1B or Origin.", exe_desc.strategy())); } } diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 07fd091b04d977..00b256da6af383 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" namespace paddle { @@ -30,6 +31,12 @@ TaskNode::TaskNode(const framework::ProgramDesc& program, int64_t rank, // Should be serially invoked, not thread-safe static int64_t task_node_cnt = 0; task_id_ = task_node_cnt++; + for (const auto& op_desc : program.Block(0).AllOps()) { + ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); + } + for (const auto& op : ops_vec_) { + ops_.emplace_back(op.get()); + } } TaskNode::TaskNode(int32_t role, const std::vector& ops, diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 762b46d6230ee0..f5704e6ae0cccb 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -76,10 +76,12 @@ class TaskNode final { private: DISABLE_COPY_AND_ASSIGN(TaskNode); TaskNode() = default; + // ops_ will be removed in the future std::vector ops_; std::unordered_set upstream_; std::unordered_set downstream_; framework::ProgramDesc program_; + std::vector> ops_vec_; int32_t role_; int64_t rank_; int64_t task_id_; diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 115be1b8ba8b4d..40d325fae458ff 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" @@ -32,8 +33,7 @@ void BindFleetExecutor(py::module* m) { py::class_(*m, "FleetExecutor") .def(py::init()) .def("init", &FleetExecutor::Init) - .def("run", &FleetExecutor::Run) - .def("release", &FleetExecutor::Release); + .def("run", &FleetExecutor::Run); py::class_(*m, "TaskNode") .def(py::init()) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index dc4e3ce39af830..be408ea765ddc7 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -682,6 +682,8 @@ def __init__(self, place=None): self._enable_interpreter_core = _is_enable_standalone_executor() self._executor_cache = _ExecutorCache(self.place) + self._fleet_executor_cache = None + def _get_scope_cache(self, program_cache_key): return self.scope_caches.get(program_cache_key, None) @@ -1960,49 +1962,52 @@ def _run_using_fleet_executor(self, print_period=100, fetch_handler=None, use_program_cache=False): - scope, real_fetch_list, trainer_instance = \ - self._prepare_pipeline_ctx(program, dataset, scope, thread, - is_infer, debug, fetch_list, fetch_info, - print_period, fetch_handler, - use_program_cache) - from ..distributed.fleet.proto import fleet_executor_desc_pb2 - from google.protobuf import text_format - cur_rank = os.getenv("PADDLE_TRAINER_ID") - trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS") - fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc() - nrank = 1 - if cur_rank and trainer_endpoints_str: - fleet_exe_desc.cur_rank = int(cur_rank) - trainer_endpoints = trainer_endpoints_str.split(',') - for rank, endpoint in enumerate(trainer_endpoints): + if self._fleet_executor_cache is None: + from ..distributed.fleet.proto import fleet_executor_desc_pb2 + 
from google.protobuf import text_format + cur_rank = os.getenv("PADDLE_TRAINER_ID") + trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS") + fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc() + nrank = 1 + if cur_rank and trainer_endpoints_str: + fleet_exe_desc.cur_rank = int(cur_rank) + trainer_endpoints = trainer_endpoints_str.split(',') + for rank, endpoint in enumerate(trainer_endpoints): + rank_info = fleet_executor_desc_pb2.RankInfo() + rank_info.rank = rank + rank_info.ip_port = endpoint + fleet_exe_desc.cluster_info.append(rank_info) + nrank = len(trainer_endpoints) + else: + fleet_exe_desc.cur_rank = 0 rank_info = fleet_executor_desc_pb2.RankInfo() - rank_info.rank = rank - rank_info.ip_port = endpoint + rank_info.rank = 0 + rank_info.ip_port = '' fleet_exe_desc.cluster_info.append(rank_info) - nrank = len(trainer_endpoints) - else: - fleet_exe_desc.cur_rank = 0 - rank_info = fleet_executor_desc_pb2.RankInfo() - rank_info.rank = 0 - rank_info.ip_port = '' - fleet_exe_desc.cluster_info.append(rank_info) - logging.warning("Fleet Executor will run on single device only.") - fleet_opt = program._pipeline_opt["fleet_opt"] - if "dist_strategy" in fleet_opt: - fleet_exe_desc.dp_degree = fleet_opt["dist_strategy"]["dp_degree"] - fleet_exe_desc.mp_degree = fleet_opt["dist_strategy"]["mp_degree"] - fleet_exe_desc.pp_degree = fleet_opt["dist_strategy"]["pp_degree"] - if "num_micro_batches" in fleet_opt: - fleet_exe_desc.num_micro_batches = fleet_opt["num_micro_batches"] - num_of_gpu = fleet_exe_desc.dp_degree * fleet_exe_desc.mp_degree * fleet_exe_desc.pp_degree - assert nrank == num_of_gpu, "The number of rank is not equal to the number of gpu." - fleet_exe = core.FleetExecutor(fleet_exe_desc.SerializeToString()) - place = core.Place() - place.set_place(self.place) - fleet_exe.init(program._pipeline_opt["section_program"].desc, scope, - place) - fleet_exe.run() - fleet_exe.release() + logging.warning( + "Fleet Executor will run on single device only.") + fleet_opt = program._pipeline_opt["fleet_opt"] + if "dist_strategy" in fleet_opt: + fleet_exe_desc.dp_degree = fleet_opt["dist_strategy"][ + "dp_degree"] + fleet_exe_desc.mp_degree = fleet_opt["dist_strategy"][ + "mp_degree"] + fleet_exe_desc.pp_degree = fleet_opt["dist_strategy"][ + "pp_degree"] + if "num_micro_batches" in fleet_opt: + fleet_exe_desc.num_micro_batches = fleet_opt[ + "num_micro_batches"] + num_of_gpu = fleet_exe_desc.dp_degree * fleet_exe_desc.mp_degree * fleet_exe_desc.pp_degree + assert nrank == num_of_gpu, "The number of rank is not equal to the number of gpu." 
+ fleet_exe = core.FleetExecutor(fleet_exe_desc.SerializeToString()) + place = core.Place() + place.set_place(self.place) + if scope is None: + scope = global_scope() + fleet_exe.init(program._pipeline_opt["section_program"].desc, scope, + place) + self._fleet_executor_cache = fleet_exe + self._fleet_executor_cache.run() return None def _run_pipeline(self, From a710abeecd72d7ac80084eb6003c0ed371d7fc04 Mon Sep 17 00:00:00 2001 From: xiayanming <41795079@qq.com> Date: Thu, 2 Dec 2021 16:05:02 +0800 Subject: [PATCH 046/124] fix fleet elastic bug (#37781) --- python/paddle/distributed/fleet/elastic/collective.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/elastic/collective.py b/python/paddle/distributed/fleet/elastic/collective.py index 82055314b0dc8c..de350e15d35c02 100644 --- a/python/paddle/distributed/fleet/elastic/collective.py +++ b/python/paddle/distributed/fleet/elastic/collective.py @@ -31,9 +31,9 @@ def launch(self): logger.info("collective lauchner launch ...") args = self.args self.tmp_dir = tempfile.mkdtemp() + cluster, pod = paddle.distributed.fleet.launch.get_cluster_info(args) global_envs = paddle.distributed.fleet.launch.get_global_envs( args, self.tmp_dir) - cluster, pod = paddle.distributed.fleet.launch.get_cluster_info(args) self.procs = start_local_trainers( cluster, From 876aa71776c701e458703832a758ee65e0b15124 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Thu, 2 Dec 2021 16:14:55 +0800 Subject: [PATCH 047/124] support distributed graph_split load and query. (#37740) --- .../distributed/service/graph_brpc_client.cc | 36 +++ .../distributed/service/graph_brpc_client.h | 2 + .../distributed/service/graph_brpc_server.cc | 17 ++ .../distributed/service/graph_brpc_server.h | 4 + .../fluid/distributed/service/sendrecv.proto | 1 + .../distributed/table/common_graph_table.cc | 212 ++++++++++++-- .../distributed/table/common_graph_table.h | 14 +- .../distributed/table/graph/graph_node.cc | 3 + paddle/fluid/distributed/test/CMakeLists.txt | 3 + .../distributed/test/graph_node_split_test.cc | 275 ++++++++++++++++++ 10 files changed, 534 insertions(+), 33 deletions(-) create mode 100644 paddle/fluid/distributed/test/graph_node_split_test.cc diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index c5ad4b0099479d..a9682d6a6efcc9 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -514,6 +514,42 @@ std::future GraphBrpcClient::random_sample_nodes( return fut; } +std::future GraphBrpcClient::load_graph_split_config( + uint32_t table_id, std::string path) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + server_size, [&, server_size = this->server_size ](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < server_size; ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG) != 0) { + ++fail_num; + break; + } + } + ret = fail_num == 0 ? 
0 : -1; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + for (size_t i = 0; i < server_size; i++) { + int server_index = i; + closure->request(server_index) + ->set_cmd_id(PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG); + closure->request(server_index)->set_table_id(table_id); + closure->request(server_index)->set_client_id(_client_id); + closure->request(server_index)->add_params(path); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(server_index)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(server_index), + closure->request(server_index), + closure->response(server_index), closure); + } + return fut; +} std::future GraphBrpcClient::use_neighbors_sample_cache( uint32_t table_id, size_t total_size_limit, size_t ttl) { DownpourBrpcClosure *closure = new DownpourBrpcClosure( diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index e3d2ff1d32d722..2e5d5b6ee93cbe 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -93,6 +93,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, size_t ttl); + virtual std::future load_graph_split_config(uint32_t table_id, + std::string path); virtual std::future remove_graph_node( uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 094ecbbd402c03..c1348e4804e2ba 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -204,6 +204,8 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::sample_neighbors_across_multi_servers; _service_handler_map[PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE] = &GraphBrpcService::use_neighbors_sample_cache; + _service_handler_map[PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG] = + &GraphBrpcService::load_graph_split_config; // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); @@ -658,5 +660,20 @@ int32_t GraphBrpcService::use_neighbors_sample_cache( ((GraphTable *)table)->make_neighbor_sample_cache(size_limit, ttl); return 0; } + +int32_t GraphBrpcService::load_graph_split_config( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code(response, -1, + "load_graph_split_configrequest requires at least 1 " + "argument1[file_path]"); + return 0; + } + ((GraphTable *)table)->load_graph_split_config(request.params(0)); + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index d1a6aa63604f36..ecd78d28ca812a 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -126,6 +126,10 @@ class GraphBrpcService : public PsBaseService { PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_graph_split_config(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + private: bool _is_initialize_shard_info; 
std::mutex _initialize_shard_mutex; diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 8ee9b3590721a0..6dfaff1ffa1df5 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -58,6 +58,7 @@ enum PsCmdID { PS_GRAPH_SET_NODE_FEAT = 37; PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38; PS_GRAPH_USE_NEIGHBORS_SAMPLE_CACHE = 39; + PS_GRAPH_LOAD_GRAPH_SPLIT_CONFIG = 40; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index b690d71eab84d7..042a4dee62bda6 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -56,7 +56,7 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { for (auto &p : batch[i]) { size_t index = p.first % this->shard_num - this->shard_start; - this->shards[index].add_graph_node(p.first)->build_edges(p.second); + this->shards[index]->add_graph_node(p.first)->build_edges(p.second); } return 0; })); @@ -79,7 +79,7 @@ int32_t GraphTable::remove_graph_node(std::vector &id_list) { tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { for (auto &p : batch[i]) { size_t index = p % this->shard_num - this->shard_start; - this->shards[index].delete_node(p); + this->shards[index]->delete_node(p); } return 0; })); @@ -97,6 +97,7 @@ void GraphShard::clear() { } GraphShard::~GraphShard() { clear(); } + void GraphShard::delete_node(uint64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; @@ -117,6 +118,14 @@ GraphNode *GraphShard::add_graph_node(uint64_t id) { return (GraphNode *)bucket[node_location[id]]; } +GraphNode *GraphShard::add_graph_node(Node *node) { + auto id = node->get_id(); + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(node); + } + return (GraphNode *)bucket[node_location[id]]; +} FeatureNode *GraphShard::add_feature_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); @@ -134,6 +143,33 @@ Node *GraphShard::find_node(uint64_t id) { return iter == node_location.end() ? 
nullptr : bucket[iter->second]; } +GraphTable::~GraphTable() { + for (auto p : shards) { + delete p; + } + for (auto p : extra_shards) { + delete p; + } + shards.clear(); + extra_shards.clear(); +} + +int32_t GraphTable::load_graph_split_config(const std::string &path) { + VLOG(4) << "in server side load graph split config\n"; + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + size_t index = (size_t)std::stoi(values[0]); + if (index != _shard_idx) continue; + auto dst_id = std::stoull(values[1]); + extra_nodes.insert(dst_id); + } + if (extra_nodes.size() != 0) use_duplicate_nodes = true; + return 0; +} + int32_t GraphTable::load(const std::string &path, const std::string ¶m) { bool load_edge = (param[0] == 'e'); bool load_node = (param[0] == 'n'); @@ -154,7 +190,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( res.clear(); std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { - end = total_size + shards[i].get_size(); + end = total_size + shards[i]->get_size(); start = total_size; while (start < end && index < ranges.size()) { if (ranges[index].second <= start) @@ -169,11 +205,11 @@ int32_t GraphTable::get_nodes_ids_by_ranges( second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [this, first, second, i]() -> std::vector { - return shards[i].get_ids_by_range(first, second); + return shards[i]->get_ids_by_range(first, second); })); } } - total_size += shards[i].get_size(); + total_size += shards[i]->get_size(); } for (size_t i = 0; i < tasks.size(); i++) { auto vec = tasks[i].get(); @@ -217,7 +253,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { size_t index = shard_id - shard_start; - auto node = shards[index].add_feature_node(id); + auto node = shards[index]->add_feature_node(id); node->set_feature_size(feat_name.size()); @@ -245,7 +281,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; - + int extra_alloc_index = 0; for (auto path : paths) { std::ifstream file(path); std::string line; @@ -268,8 +304,24 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { size_t src_shard_id = src_id % shard_num; if (src_shard_id >= shard_end || src_shard_id < shard_start) { - VLOG(4) << "will not load " << src_id << " from " << path - << ", please check id distribution"; + if (use_duplicate_nodes == false || + extra_nodes.find(src_id) == extra_nodes.end()) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + int index; + if (extra_nodes_to_thread_index.find(src_id) != + extra_nodes_to_thread_index.end()) { + index = extra_nodes_to_thread_index[src_id]; + } else { + index = extra_alloc_index++; + extra_alloc_index %= task_pool_size_; + extra_nodes_to_thread_index[src_id] = index; + } + extra_shards[index]->add_graph_node(src_id)->build_edges(is_weighted); + extra_shards[index]->add_neighbor(src_id, dst_id, weight); + valid_count++; continue; } if (count % 1000000 == 0) { @@ -278,36 +330,130 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { } size_t index = src_shard_id - shard_start; - shards[index].add_graph_node(src_id)->build_edges(is_weighted); - shards[index].add_neighbor(src_id, dst_id, weight); + 
shards[index]->add_graph_node(src_id)->build_edges(is_weighted); + shards[index]->add_neighbor(src_id, dst_id, weight); valid_count++; } } VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " << path; + std::vector used(task_pool_size_, 0); // Build Sampler j for (auto &shard : shards) { - auto bucket = shard.get_bucket(); + auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); + used[get_thread_pool_index(bucket[i]->get_id())]++; } } + /*----------------------- + relocate the duplicate nodes to make them distributed evenly among threads. +*/ + for (auto &shard : extra_shards) { + auto bucket = shard->get_bucket(); + for (size_t i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + int size = extra_nodes_to_thread_index.size(); + if (size == 0) return 0; + std::vector index; + for (int i = 0; i < used.size(); i++) index.push_back(i); + sort(index.begin(), index.end(), + [&](int &a, int &b) { return used[a] < used[b]; }); + + std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); + int t = 1, aim = 0, mod = 0; + for (; t < used.size(); t++) { + if ((used[index[t]] - used[index[t - 1]]) * t >= size) { + break; + } else { + size -= (used[index[t]] - used[index[t - 1]]) * t; + } + } + aim = used[index[t - 1]] + size / t; + mod = size % t; + for (int x = t - 1; x >= 0; x--) { + alloc[index[x]] = aim; + if (t - x <= mod) alloc[index[x]]++; + alloc[index[x]] -= used[index[x]]; + } + std::vector vec[index.size()]; + for (auto p : extra_nodes_to_thread_index) { + has_alloc[p.second]++; + vec[p.second].push_back(p.first); + } + sort(index.begin(), index.end(), [&](int &a, int &b) { + return has_alloc[a] - alloc[a] < has_alloc[b] - alloc[b]; + }); + int left = 0, right = index.size() - 1; + while (left < right) { + if (has_alloc[index[right]] - alloc[index[right]] == 0) break; + int x = std::min(alloc[index[left]] - has_alloc[index[left]], + has_alloc[index[right]] - alloc[index[right]]); + has_alloc[index[left]] += x; + has_alloc[index[right]] -= x; + uint64_t id; + while (x--) { + id = vec[index[right]].back(); + vec[index[right]].pop_back(); + extra_nodes_to_thread_index[id] = index[left]; + vec[index[left]].push_back(id); + } + if (has_alloc[index[right]] - alloc[index[right]] == 0) right--; + if (alloc[index[left]] - has_alloc[index[left]] == 0) left++; + } + std::vector extra_shards_copy; + for (int i = 0; i < task_pool_size_; ++i) { + extra_shards_copy.push_back(new GraphShard()); + } + for (auto &shard : extra_shards) { + auto &bucket = shard->get_bucket(); + auto &node_location = shard->get_node_location(); + while (bucket.size()) { + Node *temp = bucket.back(); + bucket.pop_back(); + node_location.erase(temp->get_id()); + extra_shards_copy[extra_nodes_to_thread_index[temp->get_id()]] + ->add_graph_node(temp); + } + } + for (int i = 0; i < task_pool_size_; ++i) { + delete extra_shards[i]; + extra_shards[i] = extra_shards_copy[i]; + } return 0; } Node *GraphTable::find_node(uint64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { - return nullptr; + if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) + return nullptr; + auto iter = extra_nodes_to_thread_index.find(id); + if (iter == extra_nodes_to_thread_index.end()) + return nullptr; + else { + return extra_shards[iter->second]->find_node(id); + } } size_t index = shard_id - shard_start; - Node *node = shards[index].find_node(id); + Node *node 
= shards[index]->find_node(id); return node; } uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { - return node_id % shard_num % shard_num_per_server % task_pool_size_; + if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) + return node_id % shard_num % shard_num_per_server % task_pool_size_; + size_t src_shard_id = node_id % shard_num; + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + auto iter = extra_nodes_to_thread_index.find(node_id); + if (iter != extra_nodes_to_thread_index.end()) { + return iter->second; + } + } + return src_shard_id % shard_num_per_server % task_pool_size_; } uint32_t GraphTable::get_thread_pool_index_by_shard_index( @@ -319,11 +465,16 @@ int32_t GraphTable::clear_nodes() { std::vector> tasks; for (size_t i = 0; i < shards.size(); i++) { tasks.push_back( - _shards_task_pool[get_thread_pool_index_by_shard_index(i)]->enqueue( - [this, i]() -> int { - this->shards[i].clear(); - return 0; - })); + _shards_task_pool[i % task_pool_size_]->enqueue([this, i]() -> int { + this->shards[i]->clear(); + return 0; + })); + } + for (size_t i = 0; i < extra_shards.size(); i++) { + tasks.push_back(_shards_task_pool[i]->enqueue([this, i]() -> int { + this->extra_shards[i]->clear(); + return 0; + })); } for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); return 0; @@ -334,7 +485,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, int &actual_size) { int total_size = 0; for (int i = 0; i < shards.size(); i++) { - total_size += shards[i].get_size(); + total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; int range_num = random_sample_nodes_ranges; @@ -401,8 +552,8 @@ int32_t GraphTable::random_sample_neighbors( size_t node_num = buffers.size(); std::function char_del = [](char *c) { delete[] c; }; std::vector> tasks; - std::vector> seq_id(shard_end - shard_start); - std::vector> id_list(shard_end - shard_start); + std::vector> seq_id(task_pool_size_); + std::vector> id_list(task_pool_size_); size_t index; for (size_t idx = 0; idx < node_num; ++idx) { index = get_thread_pool_index(node_ids[idx]); @@ -524,7 +675,7 @@ int32_t GraphTable::set_node_feat( tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; - auto node = shards[index].add_feature_node(node_id); + auto node = shards[index]->add_feature_node(node_id); node->set_feature_size(this->feat_name.size()); for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; @@ -581,7 +732,7 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, int size = 0, cur_size; std::vector>> tasks; for (size_t i = 0; i < shards.size() && total_size > 0; i++) { - cur_size = shards[i].get_size(); + cur_size = shards[i]->get_size(); if (size + cur_size <= start) { size += cur_size; continue; @@ -590,7 +741,7 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, int end = start + (count - 1) * step + 1; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [this, i, start, end, step, size]() -> std::vector { - return this->shards[i].get_batch(start - size, end - size, step); + return this->shards[i]->get_batch(start - size, end - size, step); })); start += count * step; total_size -= count; @@ -665,7 +816,14 @@ int32_t GraphTable::initialize() { shard_end = shard_start + shard_num_per_server; VLOG(0) << "in init graph table shard idx 
= " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; - shards = std::vector(shard_num_per_server, GraphShard(shard_num)); + for (int i = 0; i < shard_num_per_server; i++) { + shards.push_back(new GraphShard()); + } + use_duplicate_nodes = false; + for (int i = 0; i < task_pool_size_; i++) { + extra_shards.push_back(new GraphShard()); + } + return 0; } } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 9ca59db3bb2683..b76ab0ae950602 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -47,7 +47,6 @@ class GraphShard { public: size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { this->shard_num = shard_num; } ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); @@ -60,18 +59,18 @@ class GraphShard { } GraphNode *add_graph_node(uint64_t id); + GraphNode *add_graph_node(Node *node); FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); void delete_node(uint64_t id); void clear(); void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map get_node_location() { + std::unordered_map &get_node_location() { return node_location; } private: std::unordered_map node_location; - int shard_num; std::vector bucket; }; @@ -355,7 +354,7 @@ class ScaledLRU { class GraphTable : public SparseTable { public: GraphTable() { use_cache = false; } - virtual ~GraphTable() {} + virtual ~GraphTable(); virtual int32_t pull_graph_list(int start, int size, std::unique_ptr &buffer, int &actual_size, bool need_feature, @@ -374,6 +373,7 @@ class GraphTable : public SparseTable { virtual int32_t initialize(); int32_t load(const std::string &path, const std::string ¶m); + int32_t load_graph_split_config(const std::string &path); int32_t load_edges(const std::string &path, bool reverse); @@ -434,7 +434,7 @@ class GraphTable : public SparseTable { } protected: - std::vector shards; + std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; @@ -449,7 +449,9 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; std::shared_ptr> scaled_lru; - bool use_cache; + std::unordered_set extra_nodes; + std::unordered_map extra_nodes_to_thread_index; + bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc index e2311cc307b605..52c708be884884 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -65,6 +65,9 @@ void GraphNode::build_edges(bool is_weighted) { } } void GraphNode::build_sampler(std::string sample_type) { + if (sampler != nullptr) { + return; + } if (sample_type == "random") { sampler = new RandomSampler(); } else if (sample_type == "weighted") { diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 597a08973b957b..62de82832e1336 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -21,6 +21,9 @@ cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_funct 
set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc new file mode 100644 index 00000000000000..3fcddde787f69f --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -0,0 +1,275 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::vector nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc 
hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +std::vector graph_split = {std::string("0\t97")}; +char graph_split_file_name[] = "graph_split.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + 
::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + pserver_ptr_->build_peer2peer_connection(0); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); + pserver_ptr2->build_peer2peer_connection(1); +} + +void RunClient( + std::map>& dense_regions, + int index, paddle::distributed::PsBaseService* service) { + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list_.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, servers_); + worker_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0); + worker_ptr_->set_shard_num(127); + worker_ptr_->set_local_channel(index); + worker_ptr_->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)service); +} + +void RunGraphSplit() { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + prepare_file(edge_file_name, edges); + prepare_file(node_file_name, nodes); + prepare_file(graph_split_file_name, graph_split); + auto ph_host = paddle::distributed::PSHost(ip_, port_, 0); + host_sign_list_.push_back(ph_host.serialize_to_string()); + + // test-start + auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1); + host_sign_list_.push_back(ph_host2.serialize_to_string()); + // test-end + // Srart Server + std::thread* server_thread = new std::thread(RunServer); + + std::thread* server_thread2 = new std::thread(RunServer2); + + sleep(2); + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + + RunClient(dense_regions, 0, pserver_ptr_->get_service()); + + /*-----------------------Test Server Init----------------------------------*/ + + auto pull_status = 
worker_ptr_->load_graph_split_config( + 0, std::string(graph_split_file_name)); + pull_status.wait(); + pull_status = + worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); + srand(time(0)); + pull_status.wait(); + std::vector> _vs; + std::vector> vs; + pull_status = worker_ptr_->batch_sample_neighbors( + 0, std::vector(1, 10240001024), 4, _vs, vs, true); + pull_status.wait(); + ASSERT_EQ(0, _vs[0].size()); + _vs.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighbors( + 0, std::vector(1, 97), 4, _vs, vs, true); + pull_status.wait(); + ASSERT_EQ(3, _vs[0].size()); + std::remove(edge_file_name); + std::remove(node_file_name); + std::remove(graph_split_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); +} + +TEST(RunGraphSplit, Run) { RunGraphSplit(); } \ No newline at end of file From 08a2d0babafa5596cdc97fe35827370f882a66bc Mon Sep 17 00:00:00 2001 From: zhenlin <56170639+zhenlin-work@users.noreply.github.com> Date: Thu, 2 Dec 2021 16:58:05 +0800 Subject: [PATCH 048/124] fix_pass_timeout_bug (#37789) --- python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 58affe1b843fdd..2b45a8c5db33d0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -74,11 +74,11 @@ set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30) -set_tests_properties(test_simplify_with_basic_ops_pass_autoscan PROPERTIES TIMEOUT 60) if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 240) + set_tests_properties(test_simplify_with_basic_ops_pass_autoscan PROPERTIES TIMEOUT 60) endif() if (WITH_MKLDNN) From cc2b466295b95e24d6e1eb007bba733d2d512a7b Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 2 Dec 2021 19:14:24 +0800 Subject: [PATCH 049/124] refine found_inf of loss_scaler (#37770) --- python/paddle/fluid/dygraph/amp/loss_scaler.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 432b178ea67066..f7c2d6be574c4e 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -127,6 +127,10 @@ def __init__(self, self._use_dynamic_loss_scaling = use_dynamic_loss_scaling self._found_inf = to_variable(np.array([0]).astype(np.bool)) + self._temp_found_inf_fp16 = to_variable( + np.array([0]).astype(np.bool)) + self._temp_found_inf_fp32 = to_variable( + np.array([0]).astype(np.bool)) self._scale = to_variable( np.array([self._init_loss_scaling]).astype(np.float32)) self._cache_founf_inf = None @@ -282,17 +286,20 @@ def _unscale(self, optimizer): ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 ) ] - temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) - temp_found_inf_fp32 = 
to_variable(np.array([0]).astype(np.bool)) if len(param_grads_fp16): _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, param_grads_fp16, - temp_found_inf_fp16) + self._temp_found_inf_fp16) if len(param_grads_fp32): _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, param_grads_fp32, - temp_found_inf_fp32) - self._found_inf = temp_found_inf_fp16 or temp_found_inf_fp32 + self._temp_found_inf_fp32) + if len(param_grads_fp16) and len(param_grads_fp32): + self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 + elif len(param_grads_fp16): + self._found_inf = self._temp_found_inf_fp16 + else: + self._found_inf = self._temp_found_inf_fp32 optimizer_state["state"] = OptimizerState.UNSCALED From f306965d57ad0e5b79e90159bf464e0095e89c8a Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 2 Dec 2021 19:38:04 +0800 Subject: [PATCH 050/124] [fleet_executor] Add amplify interceptor info runtime graph (#37783) --- .../fleet_executor/amplifier_interceptor.cc | 22 --------- .../distributed/fleet_executor/carrier.cc | 9 +++- .../fleet_executor/compute_interceptor.cc | 5 ++- .../fleet_executor/runtime_graph.cc | 28 ++++++++---- .../distributed/fleet_executor/task_node.cc | 45 ++++++++++++------- .../distributed/fleet_executor/task_node.h | 17 ++----- 6 files changed, 64 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc index 7d71f8e7b2242e..72c689732b5b7d 100644 --- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc @@ -27,28 +27,6 @@ AmplifierInterceptor::AmplifierInterceptor(int64_t interceptor_id, run_at_offset_ = node->run_at_offset(); reply_up_per_steps_ = node->reply_up_per_steps(); send_down_per_steps_ = node->send_down_per_steps(); - - PADDLE_ENFORCE_GE( - run_per_steps_, 1, - platform::errors::InvalidArgument( - "run_per_steps must >= 1, but now is %ld", run_per_steps_)); - PADDLE_ENFORCE_GE( - run_at_offset_, 0, - platform::errors::InvalidArgument( - "run_at_offset must >= 0, but now is %ld", run_at_offset_)); - PADDLE_ENFORCE_LT(run_at_offset_, run_per_steps_, - platform::errors::InvalidArgument( - "run_at_offset must < run_per_steps, must now " - "run_at_offset=%ld run_per_steps=%ld", - run_at_offset_, run_per_steps_)); - PADDLE_ENFORCE_GE( - reply_up_per_steps_, 1, - platform::errors::InvalidArgument( - "reply_up_per_steps must >= 1, but now is %ld", reply_up_per_steps_)); - PADDLE_ENFORCE_GE(send_down_per_steps_, 1, - platform::errors::InvalidArgument( - "send_down_per_steps must >= 1, but now is %ld", - send_down_per_steps_)); } void AmplifierInterceptor::RunOps() { diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 55878a1000ec45..e3af0de2c89d76 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -199,6 +199,13 @@ void Carrier::CreateInterceptors() { int64_t interceptor_id = item.first; TaskNode* task_node = item.second; + PADDLE_ENFORCE_LT( + task_node->run_at_offset(), task_node->run_per_steps(), + platform::errors::InvalidArgument( + "Interceptor's run_at_offset must < run_per_steps, must now " + "run_at_offset=%ld run_per_steps=%ld", + task_node->run_at_offset(), task_node->run_per_steps())); + std::unique_ptr interceptor; if (task_node->type().empty()) { // TODO(wangxi): delete this in future @@ 
-214,7 +221,7 @@ void Carrier::CreateInterceptors() { SetInterceptor(interceptor_id, std::move(interceptor)); VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id - << "."; + << " with type: " << task_node->type() << "."; if (task_node->upstream().empty()) { source_interceptor_ids_.emplace_back(interceptor_id); diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 09275dc10a136d..0c0411a035fb36 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -161,7 +161,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { } void ComputeInterceptor::RunOps() { - VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops."; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " + << step_ << " time."; for (auto op : node_->ops()) { op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_); } @@ -180,6 +181,8 @@ void ComputeInterceptor::Run() { ReplyCompletedToUpStream(); // Try to stop Carrier if (is_last_ && (step_ % node_->max_run_times() == 0)) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << " is stopping carrier."; StopCarrier(); } } diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 21026ee3f973b7..19afdf7441257f 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -161,22 +161,30 @@ void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) { int64_t num_micro_batches = exe_desc_.num_micro_batches(); int64_t task_id = cur_rank * functionality_order.size(); for (std::size_t i = 0; i < functionality_order.size(); ++i) { + VLOG(3) << "Runtime graph is creating task node for: " << task_id << "."; OpRole role = functionality_order[i]; int32_t role_id = static_cast(role); int64_t max_run_times = num_micro_batches; int64_t max_slot_nums = start_up_steps; - if (IsLRSched(role_id) || IsOptimize(role_id)) { - max_run_times = 1; - max_slot_nums = 1; + // NOTE: use short path, each interceptor should run for max_run_times + std::vector task_ops{}; + if (role_to_ops.find(role_id) != role_to_ops.end()) { + task_ops = role_to_ops.at(role_id); } - if (role_to_ops.find(role_id) == role_to_ops.end()) { - task_nodes_.emplace_back(TaskNode::CreateEmptyTaskNode( - role_id, cur_rank, task_id, max_run_times, max_slot_nums)); + std::unique_ptr task_node = std::make_unique( + role_id, task_ops, cur_rank, task_id, max_run_times, max_slot_nums); + if (IsLRSched(role_id) || IsOptimize(role_id)) { + task_node->SetType("Amplifier"); + if (IsLRSched(role_id)) { + task_node->SetRunPerSteps(max_run_times); + } else { + task_node->SetRunAtOffset(max_run_times - 1); + task_node->SetRunPerSteps(max_run_times); + } } else { - task_nodes_.emplace_back( - TaskNode::CreateTaskNode(role_id, role_to_ops.at(role_id), cur_rank, - task_id, max_run_times, max_slot_nums)); + task_node->SetType("Compute"); } + task_nodes_.emplace_back(std::move(task_node)); ++task_id; } } @@ -227,6 +235,8 @@ void RuntimeGraph::FakeDependence() { void RuntimeGraph::AssignTaskToIntercepter() { for (const auto& task : task_nodes_) { int64_t intercepter_id = task->task_id(); + VLOG(3) << "Runtime graph is assigning task to interceptor: " + << intercepter_id << " with type: " << task->type() << "."; if 
(intercepter_id_to_node_.find(intercepter_id) != intercepter_id_to_node_.end()) { PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 00b256da6af383..f2e785010b7263 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -57,22 +57,6 @@ TaskNode::TaskNode(int32_t role, int64_t rank, int64_t task_id, max_run_times_(max_run_times), max_slot_nums_(max_slot_nums) {} -std::unique_ptr TaskNode::CreateEmptyTaskNode(int32_t role, - int64_t rank, - int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums) { - return std::make_unique(role, rank, task_id, max_run_times, - max_slot_nums); -} - -std::unique_ptr TaskNode::CreateTaskNode( - int32_t role, const std::vector& ops, int64_t rank, - int64_t task_id, int64_t max_run_times, int64_t max_slot_nums) { - return std::make_unique(role, ops, rank, task_id, max_run_times, - max_slot_nums); -} - bool TaskNode::AddUpstreamTask(int64_t task_id) { const auto& ret = upstream_.insert(task_id); return *ret.first == task_id; @@ -92,5 +76,34 @@ std::string TaskNode::DebugString() const { os << "\n"; return os.str(); } + +void TaskNode::SetRunPerSteps(int64_t value) { + PADDLE_ENFORCE_GE(value, 1, + platform::errors::InvalidArgument( + "run_per_steps must >= 1, but received %ld", value)); + run_per_steps_ = value; +} + +void TaskNode::SetRunAtOffset(int64_t value) { + PADDLE_ENFORCE_GE(value, 0, + platform::errors::InvalidArgument( + "run_at_offset must >= 0, but received %ld", value)); + run_at_offset_ = value; +} + +void TaskNode::SetReplyUpPerSteps(int64_t value) { + PADDLE_ENFORCE_GE( + value, 1, platform::errors::InvalidArgument( + "reply_up_per_steps must >= 1, but received %ld", value)); + reply_up_per_steps_ = value; +} + +void TaskNode::SetSendDownPerSteps(int64_t value) { + PADDLE_ENFORCE_GE( + value, 1, platform::errors::InvalidArgument( + "send_down_per_steps must >= 1, but received %ld", value)); + send_down_per_steps_ = value; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index f5704e6ae0cccb..23fb4c0a7dbfcd 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -54,25 +54,16 @@ class TaskNode final { const paddle::framework::ProgramDesc& program() const { return program_; } const std::vector& ops() const { return ops_; } - void SetRunPerSteps(int64_t value) { run_per_steps_ = value; } - void SetRunAtOffset(int64_t value) { run_at_offset_ = value; } - void SetReplyUpPerSteps(int64_t value) { reply_up_per_steps_ = value; } - void SetSendDownPerSteps(int64_t value) { send_down_per_steps_ = value; } + void SetRunPerSteps(int64_t value); + void SetRunAtOffset(int64_t value); + void SetReplyUpPerSteps(int64_t value); + void SetSendDownPerSteps(int64_t value); void SetType(const std::string& type) { type_ = type; } bool AddUpstreamTask(int64_t task_id); bool AddDownstreamTask(int64_t task_id); std::string DebugString() const; - static std::unique_ptr CreateEmptyTaskNode(int32_t role, - int64_t rank, - int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums); - static std::unique_ptr CreateTaskNode( - int32_t role, const std::vector& ops, int64_t rank, - int64_t task_id, int64_t max_run_times, int64_t max_slot_nums); - private: 
DISABLE_COPY_AND_ASSIGN(TaskNode); TaskNode() = default; From c1fd1b1c5ff09532bf49fa5d2275343a294cde2e Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 2 Dec 2021 19:51:05 +0800 Subject: [PATCH 051/124] [PTen]Make inplace_op and vector input compatible with old architecture (#37674) * add inplace op adaptation * optimize inplace logic and fix bugs when run kernel that has args of vector * refactor logic that transform variable to densetensor * update func name --- paddle/fluid/framework/operator.cc | 111 +++++++------------ paddle/fluid/imperative/prepared_operator.cc | 102 ++++++----------- paddle/pten/core/kernel_context.h | 42 +++++-- paddle/pten/core/kernel_utils.h | 40 +++---- paddle/pten/kernels/cpu/manipulation.cc | 33 +++++- paddle/pten/kernels/cuda/manipulation.cu | 34 +++++- 6 files changed, 188 insertions(+), 174 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6ef44fb127afbe..d60fdd90e2a2a4 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } BuildPtenKernelContext(*runtime_ctx, dev_ctx); (*pt_kernel_)(pt_kernel_context_.get()); - WriteBackToOutputs(runtime_ctx); - pt_kernel_context_->ClearData(); } else { (*kernel_func_)( @@ -1814,45 +1812,31 @@ void OperatorWithKernel::BuildPtenKernelContext( size_t start_idx = (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (pt_kernel_context_->InputsSize() == start_idx) { - paddle::SmallVector> tmp_inputs; - for (auto* var : ins_vector) { - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(*var, in_def)); - } - pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs)); - } else if (pt_kernel_context_->InputsSize() > start_idx) { - size_t input_size = pt_kernel_context_->InputsSize(); - for (size_t j = 0; j < ins_vector.size(); ++j) { - if (input_size > start_idx + j) { + auto current_vector_size = pt_kernel_context_->InputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. 
+ for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + auto& input_ptr = + pt_kernel_context_->MutableInputPtrAt(start_idx + offset); + if (input_ptr == nullptr) { + input_ptr = experimental::MakePtenTensorBaseFromVar( + *ins_vector[offset], in_def); + } else { experimental::ReMakePtenDenseTensorFromVar( - *ins_vector[j], in_def, + *ins_vector[offset], in_def, pt_kernel_context_->MutableInputAt(start_idx + - j)); - // TODO(chentianyu03): When multi input kernel, open this code - /* - } else { - pt_kernel_context_->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(*ins_vector[j], - in_def)); - */ + offset)); } + } else { + pt_kernel_context_->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], + in_def)); } - pt_kernel_context_->MutableInputRangeAt(i) = - std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.inputs.size() is " - "`%d`.", - start_idx, pt_kernel_context_->InputsSize())); } + pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -1862,46 +1846,25 @@ void OperatorWithKernel::BuildPtenKernelContext( size_t start_idx = (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (pt_kernel_context_->OutputsSize() == start_idx) { - paddle::SmallVector> tmp_outputs; - for (auto* var : outs_vector) { - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(var, out_def)); - } - pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs)); - } else if (pt_kernel_context_->OutputsSize() > start_idx) { - size_t output_size = pt_kernel_context_->OutputsSize(); - for (size_t j = 0; j < outs_vector.size(); ++j) { - if (output_size > start_idx + j) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[j], out_def, - pt_kernel_context_->MutableOutputAt(start_idx + - j)); - - // TODO(chentianyu03): When multi output kernel, open this code - /* - } else { - pt_kernel_context_->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(outs_vector[j], - out_def)); - */ - } + auto current_vector_size = pt_kernel_context_->OutputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. 
+ for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[offset], out_def, + pt_kernel_context_->MutableOutputAt(start_idx + + offset)); + } else { + pt_kernel_context_->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(outs_vector[offset], + out_def)); } - pt_kernel_context_->MutableOutputRangeAt(i) = - std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.outputs.size() is " - "`%d`.", - start_idx, pt_kernel_context_->OutputsSize())); } + pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx), + i); } for (size_t i = 0; i < attr_names.size(); ++i) { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 604f9d2be9e487..8e61b7d2eed880 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -299,44 +299,28 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (kernel_ctx->InputsSize() == start_idx) { - paddle::SmallVector> tmp_inputs; - for (const auto& var : ins_vector) { - const auto& variable = var->Var(); - tmp_inputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, in_def)); - } - kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs)); - } else if (kernel_ctx->InputsSize() > start_idx) { - size_t input_size = kernel_ctx->InputsSize(); - for (size_t j = 0; j < ins_vector.size(); ++j) { - if (input_size > start_idx + j) { + auto current_vector_size = kernel_ctx->InputsSize(); + + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. 
+ for (size_t offset = 0; offset < ins_vector.size(); ++offset) { + const auto& variable = ins_vector[offset]->Var(); + if (current_vector_size > start_idx + offset) { + auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset); + if (input_ptr == nullptr) { + input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def); + } else { experimental::ReMakePtenDenseTensorFromVar( - ins_vector[j]->Var(), in_def, - kernel_ctx->MutableInputAt(start_idx + j)); - // TODO(chentianyu03): When multi input kernel, open this code - /* - } else { - kernel_ctx->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(), - in_def)); - */ + variable, in_def, kernel_ctx->MutableInputAt( + start_idx + offset)); } + } else { + kernel_ctx->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(variable, in_def)); } - kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.inputs.size() is " - "`%d`.", - start_idx, kernel_ctx->InputsSize())); } + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -345,44 +329,22 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - - // The current size of input/output in pt_kernel_context_ is at least equal - // the start_idx. For the reason of reusing the allocted of inputs or - // outputs in pt_kernel_context_, the current size of input/output can be - // greater then the index of which the tensort wanted to set to, so it will - // use ReMakePtenDenseTensorFromVar to make pten tensor. - if (kernel_ctx->OutputsSize() == start_idx) { - paddle::SmallVector> tmp_outputs; - for (auto& var : outs_vector) { - auto* variable = var->MutableVar(); - tmp_outputs.emplace_back( - experimental::MakePtenTensorBaseFromVar(variable, out_def)); - } - kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs)); - } else if (kernel_ctx->OutputsSize() > start_idx) { - size_t output_size = kernel_ctx->OutputsSize(); - for (size_t j = 0; j < outs_vector.size(); ++j) { - if (output_size > i + j) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[j]->MutableVar(), out_def, - kernel_ctx->MutableOutputAt(i + j)); - // TODO(chentianyu03): When multi output kernel, open this code - /* - } else { - kernel_ctx->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar( - outs_vector[j]->MutableVar(), out_def)); - */ - } + auto current_vector_size = kernel_ctx->OutputsSize(); + // If the memory needed is less than the current memory allocated, we will + // reuse the current memory by using ReMakePtenDenseTensorFromVar. + // Otherwise,we will create new storage. 
+ for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + if (current_vector_size > start_idx + offset) { + experimental::ReMakePtenDenseTensorFromVar( + outs_vector[offset]->MutableVar(), out_def, + kernel_ctx->MutableOutputAt(start_idx + offset)); + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar( + outs_vector[offset]->MutableVar(), out_def)); } - kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Error start index when trying to set new tensor to inputs, start " - "index is `%d`, but current pt_kernel_context_.outputs.size() is " - "`%d`.", - start_idx, kernel_ctx->OutputsSize())); } + kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h index 4f4d673dfe6c4b..8a87a5b735e99e 100644 --- a/paddle/pten/core/kernel_context.h +++ b/paddle/pten/core/kernel_context.h @@ -104,14 +104,18 @@ class KernelContext { return static_cast(*(inputs_.at(idx))); } + std::shared_ptr& MutableInputPtrAt(size_t idx) { + return inputs_.at(idx); + } + template - std::vector InputBetween(size_t start, size_t end) const { + std::vector MoveInputsBetween(size_t start, size_t end) { std::vector v; for (size_t i = start; i < end; ++i) { auto t = std::dynamic_pointer_cast(inputs_.at(i)); v.emplace_back(std::move(*t.get())); + inputs_.at(i) = nullptr; } - return v; } @@ -123,12 +127,32 @@ class KernelContext { return output_range_.at(idx); } - std::pair& MutableInputRangeAt(size_t idx) { - return input_range_[idx]; + void AssignInputRange(std::pair&& range, size_t idx) { + if (idx < input_range_.size()) { + input_range_[idx] = range; + } else if (idx == input_range_.size()) { + input_range_.emplace_back(range); + } else { + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "Invalid idx when trying to set InputRange, " + "index is `%d`, it is greater than the size(%d) of InputRange.", + idx, + input_range_.size())); + } } - std::pair& MutableOutputRangeAt(size_t idx) { - return output_range_[idx]; + void AssignOutputRange(std::pair&& range, size_t idx) { + if (idx < output_range_.size()) { + output_range_[idx] = range; + } else if (idx == output_range_.size()) { + output_range_.emplace_back(range); + } else { + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "Invalid idx when trying to set InputRange, " + "index is `%d`, it is greater than the size(%d) of InputRange.", + idx, + output_range_.size())); + } } template @@ -165,8 +189,10 @@ class KernelContext { // Only deal with DenseTensor now void ClearData() { for (auto& in : inputs_) { - CompatibleDenseTensorUtils::ClearStorage( - static_cast(in.get())); + if (in) { + CompatibleDenseTensorUtils::ClearStorage( + static_cast(in.get())); + } } for (auto& out : outputs_) { CompatibleDenseTensorUtils::ClearStorage( diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index 7e6be1c3914006..dcfc8c55644d99 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -88,26 +88,26 @@ using XPUContext = paddle::platform::XPUDeviceContext; } \ } -#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct KernelCallHelper&, Tail...> { \ - template \ - static void Compute(KernelContext* ctx, PreviousArgs&... 
pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const std::pair range = ctx->InputRangeAt(in_idx); \ - std::vector arg = std::move( \ - ctx->InputBetween(range.first, range.second)); \ - KernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ +#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct KernelCallHelper&, Tail...> { \ + template \ + static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const std::pair range = ctx->InputRangeAt(in_idx); \ + std::vector arg = std::move( \ + ctx->MoveInputsBetween(range.first, range.second)); \ + KernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ } #define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc index 8f559b01b3bcb3..7693e204eaa091 100644 --- a/paddle/pten/kernels/cpu/manipulation.cc +++ b/paddle/pten/kernels/cpu/manipulation.cc @@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CPUContext& dev_ctx, const std::vector& shape, DenseTensor* out) { auto out_meta = InferMetaFromVecValue(x.meta(), shape); - if (&x == out) { + if (x.data() == out->data() && x.numel() == out->numel()) { out->Resize(out_meta.dims); return; } @@ -185,3 +185,34 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid", CPU, ANY, pten::ReshapeFromVectorValWithXShape) {} + +PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host", + CPU, + ANY, + pten::ReshapeFromDT) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} + +PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid", + CPU, + ANY, + pten::ReshapeFromDTWithXShape) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} +PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost", + CPU, + ANY, + pten::ReshapeFromVectorDT) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} + +PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid", + CPU, + ANY, + pten::ReshapeFromVectorDTWithXShape) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu index 22ada75304f245..1a1d5cef300d4b 100644 --- a/paddle/pten/kernels/cuda/manipulation.cu +++ b/paddle/pten/kernels/cuda/manipulation.cu @@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CUDAContext& dev_ctx, const std::vector& shape, DenseTensor* out) { auto out_meta = InferMetaFromVecValue(x.meta(), shape); - if (&x == out) { + if (x.data() == out->data() && x.numel() == out->numel()) { out->Resize(out_meta.dims); return; } @@ -193,3 +193,35 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid", CUDA, ANY, pten::ReshapeFromVectorValWithXShape) {} + +PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host", + CUDA, + ANY, + pten::ReshapeFromDT) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} + 
+PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid", + CUDA, + ANY, + pten::ReshapeFromDTWithXShape) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} + +PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost", + CUDA, + ANY, + pten::ReshapeFromVectorDT) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} + +PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid", + CUDA, + ANY, + pten::ReshapeFromVectorDTWithXShape) { + kernel->InputAt(1).SetBackend(pten::Backend::CPU); + kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); +} From f4138cf9123b8d4876193ba0cc1027dbf6d0f4a2 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Thu, 2 Dec 2021 20:37:35 +0800 Subject: [PATCH 052/124] [Dy2Stat]Fix test_mnist_pure_fp16 (#37791) * fix test_mnist_pure_fp16 * change batch_id --- .../tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py index 4ddc9d1aa08609..1860362896cfb1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_pure_fp16.py @@ -91,13 +91,13 @@ def train(self, to_static=False): loss_data.append(avg_loss.numpy()[0]) # save checkpoint mnist.clear_gradients() - if batch_id % 10 == 0: + if batch_id % 2 == 0: print( "Loss at epoch {} step {}: loss: {:}, acc: {}, cost: {}" .format(epoch, batch_id, avg_loss.numpy(), acc.numpy(), time() - start)) start = time() - if batch_id == 50: + if batch_id == 10: break return loss_data From f1c48f85bfb8f32194dc44b298a3f3539ad2afce Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Thu, 2 Dec 2021 21:15:57 +0800 Subject: [PATCH 053/124] release dataset (#37790) --- paddle/fluid/framework/data_set.cc | 5 +++++ paddle/fluid/framework/data_set.h | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index c511526c3159d6..1b5db8380514d5 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -463,6 +463,11 @@ void DatasetImpl::WaitPreLoadDone() { // release memory data template void DatasetImpl::ReleaseMemory() { + release_thread_ = new std::thread(&DatasetImpl::ReleaseMemoryFun, this); +} + +template +void DatasetImpl::ReleaseMemoryFun() { VLOG(3) << "DatasetImpl::ReleaseMemory() begin"; if (input_channel_) { input_channel_->Clear(); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index b41f701548f3f1..58223a2f28b4f5 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -63,6 +63,7 @@ class Dataset { virtual void SetTrainerNum(int trainer_num) = 0; // set fleet send batch size virtual void SetFleetSendBatchSize(int64_t size) = 0; + virtual void ReleaseMemoryFun() = 0; // set fs name and ugi virtual void SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi) = 0; @@ -168,8 +169,13 @@ template class DatasetImpl : public Dataset { public: DatasetImpl(); - virtual ~DatasetImpl() {} + virtual ~DatasetImpl() { + if (release_thread_ != nullptr) { + release_thread_->join(); 
+ } + } virtual void SetFileList(const std::vector& filelist); + virtual void ReleaseMemoryFun(); virtual void SetThreadNum(int thread_num); virtual void SetTrainerNum(int trainer_num); virtual void SetFleetSendBatchSize(int64_t size); @@ -295,6 +301,7 @@ class DatasetImpl : public Dataset { int64_t fleet_send_batch_size_; int64_t fleet_send_sleep_seconds_; std::vector preload_threads_; + std::thread* release_thread_ = nullptr; bool merge_by_insid_; bool parse_ins_id_; bool parse_content_; From c58c4ede1eecb8de9416f9c76134a3312722a4e0 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 3 Dec 2021 08:27:23 +0800 Subject: [PATCH 054/124] =?UTF-8?q?=E3=80=90PTen=E3=80=91C++=20API=20Code-?= =?UTF-8?q?Generation=20(#37668)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add code-gen file * add api-gen in cmake * adjust the code format * temp save the code * add genen-api module into pten * merge the develop code * fix merge conflict * fix code conflict with develop * support reduce_mean/sum * fix the CI requirement * fix requirement problem of CI * remove original api code * fix bug caused by removing original api --- .gitignore | 1 + paddle/pten/api/CMakeLists.txt | 2 +- paddle/pten/api/all.h | 5 +- paddle/pten/api/include/creation.h | 49 -- paddle/pten/api/include/linalg.h | 30 -- paddle/pten/api/include/manipulation.h | 28 -- paddle/pten/api/include/math.h | 48 -- paddle/pten/api/lib/CMakeLists.txt | 25 +- paddle/pten/api/lib/creation.cc | 135 ------ paddle/pten/api/lib/kernel_dispatch.cc | 41 ++ paddle/pten/api/lib/kernel_dispatch.h | 20 + paddle/pten/api/lib/linalg.cc | 120 ----- paddle/pten/api/lib/manipulation.cc | 140 ------ paddle/pten/api/lib/math.cc | 319 ------------ paddle/pten/api/lib/tensor.cc | 4 +- paddle/pten/tests/api/CMakeLists.txt | 4 +- paddle/pten/tests/api/test_cast_api.cc | 3 +- paddle/pten/tests/api/test_dot_api.cc | 2 +- paddle/pten/tests/api/test_elementwise_api.cc | 2 +- paddle/pten/tests/api/test_fill_api.cc | 2 +- paddle/pten/tests/api/test_flatten_api.cc | 2 +- paddle/pten/tests/api/test_matmul_api.cc | 2 +- paddle/pten/tests/api/test_mean_api.cc | 2 +- paddle/pten/tests/api/test_reshape_api.cc | 2 +- paddle/pten/tests/api/test_scale_api.cc | 3 +- paddle/pten/tests/api/test_slice_api.cc | 2 +- paddle/pten/tests/api/test_sum_api.cc | 4 +- paddle/scripts/musl_build/build_inside.sh | 1 + python/paddle/utils/code_gen/api.yaml | 153 ++++++ python/paddle/utils/code_gen/api_gen.py | 452 ++++++++++++++++++ 30 files changed, 708 insertions(+), 895 deletions(-) delete mode 100644 paddle/pten/api/include/creation.h delete mode 100644 paddle/pten/api/include/linalg.h delete mode 100644 paddle/pten/api/include/manipulation.h delete mode 100644 paddle/pten/api/include/math.h delete mode 100644 paddle/pten/api/lib/creation.cc delete mode 100644 paddle/pten/api/lib/linalg.cc delete mode 100644 paddle/pten/api/lib/manipulation.cc delete mode 100644 paddle/pten/api/lib/math.cc create mode 100644 python/paddle/utils/code_gen/api.yaml create mode 100644 python/paddle/utils/code_gen/api_gen.py diff --git a/.gitignore b/.gitignore index c246a56cf15a4e..6be36bf8c243e7 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ paddle/fluid/API_DEV.spec paddle/fluid/API_PR.spec paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec +paddle/pten/api/*/api* *.DS_Store *.vs diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt index 09df2c01fd97cf..a454ae807bcaae 100644 --- 
a/paddle/pten/api/CMakeLists.txt +++ b/paddle/pten/api/CMakeLists.txt @@ -1,3 +1,3 @@ add_subdirectory(lib) -cc_library(pten_api SRCS all.cc DEPS linalg_api math_api creation_api manipulation_api utils_api) +cc_library(pten_api SRCS all.cc DEPS pten_function_api utils_api) diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h index 2c647786379c09..e853ae331e4c75 100644 --- a/paddle/pten/api/all.h +++ b/paddle/pten/api/all.h @@ -25,10 +25,7 @@ limitations under the License. */ #endif // new pten apis -#include "paddle/pten/api/include/creation.h" -#include "paddle/pten/api/include/linalg.h" -#include "paddle/pten/api/include/manipulation.h" -#include "paddle/pten/api/include/math.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/include/tensor.h" #include "paddle/pten/api/include/utils.h" diff --git a/paddle/pten/api/include/creation.h b/paddle/pten/api/include/creation.h deleted file mode 100644 index b4e4bd0fd05190..00000000000000 --- a/paddle/pten/api/include/creation.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/pten/api/include/tensor.h" -#include "paddle/pten/common/backend.h" -#include "paddle/pten/common/data_type.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/common/scalar_array.h" - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor full(const ScalarArray& shape, - const Scalar& value, - DataType dtype = DataType::FLOAT32, - Backend backend = Backend::CPU, - DataLayout layout = DataLayout::NCHW); - -PD_DLL_DECL Tensor full_like(const Tensor& x, - const Scalar& value, - DataType dtype = DataType::UNDEFINED, - Backend backend = Backend::UNDEFINED, - DataLayout layout = DataLayout::UNDEFINED); - -PD_DLL_DECL Tensor ones_like(const Tensor& x, - DataType dtype = DataType::UNDEFINED, - Backend backend = Backend::UNDEFINED, - DataLayout layout = DataLayout::UNDEFINED); - -PD_DLL_DECL Tensor zeros_like(const Tensor& x, - DataType dtype = DataType::UNDEFINED, - Backend backend = Backend::UNDEFINED, - DataLayout layout = DataLayout::UNDEFINED); - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/include/linalg.h b/paddle/pten/api/include/linalg.h deleted file mode 100644 index 259cf664932038..00000000000000 --- a/paddle/pten/api/include/linalg.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/pten/api/include/tensor.h" - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y); - -PD_DLL_DECL Tensor matmul(const Tensor& x, - const Tensor& y, - bool transpose_x = false, - bool transpose_y = false); - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/include/manipulation.h b/paddle/pten/api/include/manipulation.h deleted file mode 100644 index 579fa5cdf945a4..00000000000000 --- a/paddle/pten/api/include/manipulation.h +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/api/include/tensor.h" - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis); - -PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype); - -PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector& shape); -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h deleted file mode 100644 index 700af6d2d59116..00000000000000 --- a/paddle/pten/api/include/math.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/pten/api/include/tensor.h" -#include "paddle/pten/common/scalar.h" - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y); - -PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y); - -PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y); - -PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y); - -// TODO(chenweihang): move mean API into stat.h/cc -PD_DLL_DECL Tensor mean(const Tensor& x, - const std::vector& axis, - bool keep_dim); - -PD_DLL_DECL Tensor sum(const Tensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim); - -// TODO(chenweihang): Follow-up discussion on the handling of `act` argument -PD_DLL_DECL Tensor scale(const Tensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale); - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index f30a3c89eb69e0..ed2ad801283f5e 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -14,8 +14,25 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor device_conte cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) -cc_library(math_api SRCS math.cc DEPS pten_tensor pten kernel_dispatch) -cc_library(linalg_api SRCS linalg.cc DEPS pten_tensor pten kernel_dispatch) -cc_library(creation_api SRCS creation.cc DEPS pten_tensor pten kernel_dispatch) -cc_library(manipulation_api SRCS manipulation.cc DEPS pten_tensor pten kernel_dispatch) +set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py) +set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) + +set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/include/api.h) +set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc) +set(api_header_file_tmp ${api_header_file}.tmp) +set(api_source_file_tmp ${api_source_file}.tmp) + +add_custom_command( + OUTPUT ${api_header_file} ${api_source_file} + COMMAND python ${api_gen_file} + --api_yaml_path ${api_yaml_file} + --api_header_path ${api_header_file_tmp} + --api_source_path ${api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file} + COMMENT "copy_if_different ${api_header_file} ${api_source_file}" + DEPENDS ${api_yaml_file} + VERBATIM) + cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch) +cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch) diff --git a/paddle/pten/api/lib/creation.cc b/paddle/pten/api/lib/creation.cc deleted file mode 100644 index 40054b5d272bd0..00000000000000 --- a/paddle/pten/api/lib/creation.cc +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/pten/api/include/creation.h" - -#include - -#include "glog/logging.h" - -#include "paddle/pten/api/lib/api_registry.h" -#include "paddle/pten/api/lib/kernel_dispatch.h" -#include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" - -PT_DECLARE_MODULE(CreationCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(CreationCUDA); -#endif - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor full(const ScalarArray& shape, - const Scalar& value, - DataType dtype, - Backend backend, - DataLayout layout) { - // 1. Get kernel signature and kernel - pten::KernelKey kernel_key{backend, layout, dtype}; - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "fill_constant", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - kernel_context.EmplaceBackAttr(pten::ScalarArray(shape)); - kernel_context.EmplaceBackAttr(pten::Scalar(value)); - - // 4. InferMeta - auto out_meta = pten::FullInferMeta(shape, dtype, layout); - - // 5. Prepare outputs - const auto allocator = - std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - Tensor out; - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor full_like(const Tensor& x, - const Scalar& value, - DataType dtype, - Backend backend, - DataLayout layout) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - - DataType kernel_data_type = - dtype == DataType::UNDEFINED ? kernel_key.dtype() : dtype; - Backend kernel_backend = - backend == Backend::UNDEFINED ? kernel_key.backend() : backend; - DataLayout kernel_layout = - layout == DataLayout::UNDEFINED ? kernel_key.layout() : layout; - - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "fill_any_like", {kernel_backend, kernel_layout, kernel_data_type}); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackAttr(pten::Scalar(value)); - - // 4. InferMeta - auto out_meta = FullLikeInferMeta(dense_x->meta(), dtype, layout); - - // 5. Prepare outputs - Tensor out; - const auto allocator = - std::make_shared( - pten::TransToFluidPlace(kernel_backend)); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. 
Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor ones_like(const Tensor& x, - DataType dtype, - Backend backend, - DataLayout layout) { - return full_like(x, 1, dtype, backend, layout); -} - -PD_DLL_DECL Tensor zeros_like(const Tensor& x, - DataType dtype, - Backend backend, - DataLayout layout) { - return full_like(x, 0, dtype, backend, layout); -} - -} // namespace experimental -} // namespace paddle - -PT_REGISTER_API(Creation); diff --git a/paddle/pten/api/lib/kernel_dispatch.cc b/paddle/pten/api/lib/kernel_dispatch.cc index 0205a0d53c319f..97b3bf281fc8ed 100644 --- a/paddle/pten/api/lib/kernel_dispatch.cc +++ b/paddle/pten/api/lib/kernel_dispatch.cc @@ -57,5 +57,46 @@ paddle::platform::DeviceContext* GetDeviceContextByBackend( return pool.Get(pten::TransToFluidPlace(backend)); } +DataType ParseDataType(DataType dtype) { return dtype; } +DataType ParseDataType(const Tensor& tensor) { return tensor.type(); } +DataType ParseDataType(const std::vector& tensors) { + if (tensors.empty()) { + return DataType::UNDEFINED; + } + DataType dtype = tensors[0].type(); + auto n = tensors.size(); + for (size_t i = 1; i < n; ++i) { + if (tensors[i].type() != dtype) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The data_type of input tensor in list isn't consistent, " + "the first tensor is %s, but %dth tensor is %s.", + dtype, + i, + tensors[i].type())); + } + } + return dtype; +} + +DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) { + return dtype != DataType::UNDEFINED ? dtype : ParseDataType(tensor); +} + +Backend ParseBackend(Backend backend) { return backend; } +Backend ParseBackend(const Tensor& tensor) { + return pten::TransToPtenBackend(tensor.inner_place()); +} + +Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) { + return backend != Backend::UNDEFINED ? backend : ParseBackend(tensor); +} + +DataLayout ParseLayout(DataLayout layout) { return layout; } +DataLayout ParseLayout(const Tensor& tensor) { return tensor.layout(); } + +DataLayout ParseLayoutWithInputOrder(DataLayout layout, const Tensor& tensor) { + return layout != DataLayout::UNDEFINED ? layout : ParseLayout(tensor); +} + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/api/lib/kernel_dispatch.h b/paddle/pten/api/lib/kernel_dispatch.h index 2dba88d07eb127..e78e79f27c28bf 100644 --- a/paddle/pten/api/lib/kernel_dispatch.h +++ b/paddle/pten/api/lib/kernel_dispatch.h @@ -129,5 +129,25 @@ KernelKeySet ParseKernelKeyByInputArgs(const Args&... args) { return detail::KernelKeyParser().apply(args...).key_set; } +DataType ParseDataType(DataType dtype); +DataType ParseDataType(const Tensor& tensor); +DataType ParseDataType(const std::vector& tensors); +DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor); + +Backend ParseBackend(Backend backend); +Backend ParseBackend(const Tensor& tensor); +template +Backend ParseBackend(T t, Args... 
args) { + auto backend_set = + BackendSet(ParseBackend(t)) | BackendSet(ParseBackend(args...)); + return static_cast(64 - + detail::CountLeadingZeros(backend_set.bitset())); +} +Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor); + +DataLayout ParseLayout(DataLayout layout); +DataLayout ParseLayout(const Tensor& tensor); +DataLayout ParseLayoutWithInputOrder(DataLayout layout, const Tensor& tensor); + } // namespace experimental } // namespace paddle diff --git a/paddle/pten/api/lib/linalg.cc b/paddle/pten/api/lib/linalg.cc deleted file mode 100644 index 8eae16d9018ad2..00000000000000 --- a/paddle/pten/api/lib/linalg.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/api/include/linalg.h" - -#include - -#include "glog/logging.h" - -#include "paddle/pten/api/lib/api_registry.h" -#include "paddle/pten/api/lib/kernel_dispatch.h" -#include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_context.h" -#include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" - -PT_DECLARE_MODULE(LinalgCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(LinalgCUDA); -#endif - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "dot", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - auto dense_y = std::dynamic_pointer_cast(y.impl()); - kernel_context.EmplaceBackInput(dense_y); - // TODO(chenweihang): add transform impl - - // 4. InferMeta - auto out_meta = DotInferMeta(dense_x->meta(), dense_y->meta()); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor matmul(const Tensor& x, - const Tensor& y, - bool transpose_x, - bool transpose_y) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x, y); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "matmul_v2", kernel_key); - - // 2. 
Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - auto dense_y = std::dynamic_pointer_cast(y.impl()); - kernel_context.EmplaceBackInput(dense_x); - kernel_context.EmplaceBackInput(dense_y); - kernel_context.EmplaceBackAttr(transpose_x); - kernel_context.EmplaceBackAttr(transpose_y); - // TODO(chenweihang): add transform impl - - // 4. InferMeta - auto out_meta = MatmulInferMeta( - dense_x->meta(), dense_y->meta(), transpose_x, transpose_y); - - // 5. Prepare outputs - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - - Tensor out; - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -} // namespace experimental -} // namespace paddle - -PT_REGISTER_API(Linalg); diff --git a/paddle/pten/api/lib/manipulation.cc b/paddle/pten/api/lib/manipulation.cc deleted file mode 100644 index 51a7702d9fc6e7..00000000000000 --- a/paddle/pten/api/lib/manipulation.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/api/include/manipulation.h" - -#include - -#include "glog/logging.h" -#include "paddle/pten/api/lib/api_registry.h" -#include "paddle/pten/api/lib/kernel_dispatch.h" -#include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/infermeta/unary.h" - -PT_DECLARE_MODULE(ManipulationCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(ManipulationCUDA); -#endif - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "flatten_contiguous_range", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - kernel_context.EmplaceBackAttr(start_axis); - kernel_context.EmplaceBackAttr(stop_axis); - - // 4. InferMeta - auto out_meta = FlattenInferMeta(dense_x->meta(), start_axis, stop_axis); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. 
Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "cast", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - kernel_context.EmplaceBackAttr(out_dtype); - kernel_context.EmplaceBackAttr(dense_x->meta().dtype); - - // 4. InferMeta - auto out_meta = CastInferMeta(dense_x->meta(), out_dtype); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector& shape) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "reshape2", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - kernel_context.EmplaceBackAttr(shape); - - // 4. InferMeta - auto out_meta = InferMetaFromVecValue(dense_x->meta(), shape); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -} // namespace experimental -} // namespace paddle - -PT_REGISTER_API(Manipulation); diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc deleted file mode 100644 index a97d78b5a9d6f0..00000000000000 --- a/paddle/pten/api/lib/math.cc +++ /dev/null @@ -1,319 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/pten/api/include/math.h" - -#include - -#include "glog/logging.h" - -#include "paddle/pten/api/lib/api_registry.h" -#include "paddle/pten/api/lib/kernel_dispatch.h" -#include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/infermeta/unary.h" - -PT_DECLARE_MODULE(MathCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(MathCUDA); -#endif - -namespace paddle { -namespace experimental { - -PD_DLL_DECL Tensor mean(const Tensor& x, - const std::vector& axis, - bool keep_dim) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "reduce_mean", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - - // The real value of reduce_all will be get in kernel - // so use default value(false) is OK. - bool reduce_all = false; - - DataType out_dtype = DataType::UNDEFINED; - - kernel_context.EmplaceBackAttr(axis); - kernel_context.EmplaceBackAttr(keep_dim); - kernel_context.EmplaceBackAttr(reduce_all); - kernel_context.EmplaceBackAttr(dense_x->dtype()); - kernel_context.EmplaceBackAttr(out_dtype); - - // 4. InferShape - auto out_meta = ReduceInferMeta(dense_x->meta(), axis, keep_dim); - - // 5. Prepare outputs - Tensor out; - const auto allocator = - std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor sum(const Tensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "reduce_sum", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - - // The real value of reduce_all will be get in kernel - // so use default value(false) is OK. - bool reduce_all = false; - - DataType out_dtype = DataType::UNDEFINED; - if (dense_x->dtype() == DataType::BOOL || - dense_x->dtype() == DataType::INT32 || - dense_x->dtype() == DataType::INT64) { - out_dtype = DataType::INT64; - } - - kernel_context.EmplaceBackAttr(axis); - kernel_context.EmplaceBackAttr(keep_dim); - kernel_context.EmplaceBackAttr(reduce_all); - kernel_context.EmplaceBackAttr(dense_x->dtype()); - kernel_context.EmplaceBackAttr(out_dtype); - - // 4. InferMeta - auto out_meta = ReduceInferMeta(dense_x->meta(), axis, keep_dim); - - // 5. 
Prepare outputs - Tensor out; - const auto allocator = - std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x, y); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "elementwise_add", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - auto dense_y = std::dynamic_pointer_cast(y.impl()); - kernel_context.EmplaceBackInput(dense_y); - kernel_context.EmplaceBackAttr(-1); - - // 4. InferMeta - auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x, y); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "elementwise_sub", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - auto dense_y = std::dynamic_pointer_cast(y.impl()); - kernel_context.EmplaceBackInput(dense_y); - kernel_context.EmplaceBackAttr(-1); - - // 4. InferMeta - auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x, y); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "elementwise_div", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - auto dense_y = std::dynamic_pointer_cast(y.impl()); - kernel_context.EmplaceBackInput(dense_y); - kernel_context.EmplaceBackAttr(-1); - - // 4. InferMeta - auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1); - - // 5. 
Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x, y); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "elementwise_mul", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - auto dense_y = std::dynamic_pointer_cast(y.impl()); - kernel_context.EmplaceBackInput(dense_y); - kernel_context.EmplaceBackAttr(-1); - - // 4. InferMeta - auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -PD_DLL_DECL Tensor scale(const Tensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale) { - // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( - "scale", kernel_key); - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = pten::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x); - kernel_context.EmplaceBackAttr(pten::Scalar(scale)); - kernel_context.EmplaceBackAttr(bias); - kernel_context.EmplaceBackAttr(bias_after_scale); - - // 4. InferMeta - auto out_meta = UnchangedInferMeta(dense_x->meta()); - - // 5. Prepare outputs - Tensor out; - const auto allocator = std::make_shared( - pten::TransToFluidPlace(kernel_key.backend())); - auto dense_out = std::make_shared(allocator, out_meta); - kernel_context.EmplaceBackOutput(dense_out); - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -} // namespace experimental -} // namespace paddle - -PT_REGISTER_API(Math); diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 3f0966d369d0ca..6b4a3b1950a98c 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include #include "glog/logging.h" -#include "paddle/pten/api/include/manipulation.h" #include "paddle/pten/api/include/utils.h" #include "paddle/pten/api/lib/ext_compat_utils.h" #include "paddle/pten/api/lib/utils/allocator.h" @@ -67,6 +66,9 @@ inline bool IsDenseTensor( } // namespace detail +// declare cast api +Tensor cast(const Tensor &x, DataType out_dtype); + /////// Tensor Methods //////// /* Part 1: Construction and destruction methods */ diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index c670d094810198..46f2ef8be7c464 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -1,7 +1,7 @@ if(WITH_ROCM) - hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor utils_api manipulation_api glog) + hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api utils_api glog) else() - cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor utils_api manipulation_api glog) + cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api utils_api glog) endif() cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest) diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc index ef110e8e33c033..c2660a1f800196 100644 --- a/paddle/pten/tests/api/test_cast_api.cc +++ b/paddle/pten/tests/api/test_cast_api.cc @@ -15,8 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/creation.h" -#include "paddle/pten/api/include/manipulation.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc index 972e065596e31f..41c03f8f262019 100644 --- a/paddle/pten/tests/api/test_dot_api.cc +++ b/paddle/pten/tests/api/test_dot_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/linalg.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc index 44033f1c611c44..e5971aae5513fd 100644 --- a/paddle/pten/tests/api/test_elementwise_api.cc +++ b/paddle/pten/tests/api/test_elementwise_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/math.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc index 1ebfc8e6746253..e87d094eec9d34 100644 --- a/paddle/pten/tests/api/test_fill_api.cc +++ b/paddle/pten/tests/api/test_fill_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/creation.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc index 2fcf00efc60366..93c8a50f02a782 100644 --- a/paddle/pten/tests/api/test_flatten_api.cc +++ b/paddle/pten/tests/api/test_flatten_api.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/pten/api/include/manipulation.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index d3652db54ec5c2..01ca4aad642ba2 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/linalg.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc index 59d91672f96779..a8c4c5306dced7 100644 --- a/paddle/pten/tests/api/test_mean_api.cc +++ b/paddle/pten/tests/api/test_mean_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/math.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index 643551ec1cb1d7..b6179f11b1019e 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/manipulation.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_scale_api.cc b/paddle/pten/tests/api/test_scale_api.cc index 2c0cd5cc71d8ee..3541e3b85ccee7 100644 --- a/paddle/pten/tests/api/test_scale_api.cc +++ b/paddle/pten/tests/api/test_scale_api.cc @@ -15,8 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/creation.h" -#include "paddle/pten/api/include/math.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/tests/api/test_slice_api.cc b/paddle/pten/tests/api/test_slice_api.cc index 31a96c392dc1d7..004c085af06e83 100644 --- a/paddle/pten/tests/api/test_slice_api.cc +++ b/paddle/pten/tests/api/test_slice_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/creation.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/include/tensor.h" #include "paddle/pten/core/kernel_registry.h" diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc index 4656f40463960f..d1b7ea33e8b76d 100644 --- a/paddle/pten/tests/api/test_sum_api.cc +++ b/paddle/pten/tests/api/test_sum_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/api/include/math.h" +#include "paddle/pten/api/include/api.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" @@ -50,7 +50,7 @@ TEST(API, sum) { std::vector axis = {0, 1}; // 2. test API - auto out = paddle::experimental::sum(x, axis, DataType::UNDEFINED, false); + auto out = paddle::experimental::sum(x, axis, false); // 3. 
check result ASSERT_EQ(out.dims().size(), 1); ASSERT_EQ(out.dims()[0], 1); diff --git a/paddle/scripts/musl_build/build_inside.sh b/paddle/scripts/musl_build/build_inside.sh index 04dea2086a6780..4c7fa804de5787 100755 --- a/paddle/scripts/musl_build/build_inside.sh +++ b/paddle/scripts/musl_build/build_inside.sh @@ -51,6 +51,7 @@ if [ "$pip_index" ]; then fi if [ "$WITH_REQUIREMENT" ]; then + echo "pyyaml" >> $WITH_REQUIREMENT echo ">>> install python requirement: $WITH_REQUIREMENT"; pip install $PIP_ARGS -r "$WITH_REQUIREMENT"; fi diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml new file mode 100644 index 00000000000000..581aaef62a78f5 --- /dev/null +++ b/python/paddle/utils/code_gen/api.yaml @@ -0,0 +1,153 @@ +- api : add + args : (const Tensor& x, const Tensor& y) + output : Tensor + infer_meta : + func : ElementwiseInferMeta + param : [x, y, -1] + kernel : + func : elementwise_add + param : [x, y, -1] + +- api : cast + args : (const Tensor& x, DataType out_dtype) + output : Tensor + infer_meta : + func : CastInferMeta + kernel : + func : cast + param : [x, out_dtype, x.dtype()] + data_type : x + +- api : divide + args : (const Tensor& x, const Tensor& y) + output : Tensor + infer_meta : + func : ElementwiseInferMeta + param : [x, y, -1] + kernel : + func : elementwise_div + param : [x, y, -1] + +- api : dot + args : (const Tensor& x, const Tensor& y) + output : Tensor + infer_meta : + func : DotInferMeta + kernel : + func : dot + +- api : flatten + args : (const Tensor& x, int start_axis, int stop_axis) + output : Tensor + infer_meta : + func : FlattenInferMeta + kernel : + func : flatten_contiguous_range + +- api : full + args : (const ScalarArray& shape, const Scalar& value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW) + output: Tensor + infer_meta : + func : FullInferMeta + param : [shape, dtype, layout] + kernel : + func : fill_constant + param : [shape, value] + data_type : dtype + backend : place + layout : layout + +- api : full_like + args : (const Tensor& x, const Scalar& value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED) + output: Tensor + infer_meta : + func : FullLikeInferMeta + param : [x, dtype, layout] + kernel : + func : fill_any_like + param : [x, value] + data_type : dtype > x + backend : place > x + layout : layout > x + +- api : matmul + args : (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false) + output : Tensor + infer_meta : + func : MatmulInferMeta + kernel : + func : matmul_v2 + +- api : mean + args : (const Tensor& x, const std::vector& axis, bool keep_dim) + output : Tensor + infer_meta : + func : ReduceInferMeta + kernel : + func : reduce_mean + param : [x, axis, keep_dim, false, x.dtype(), DataType::UNDEFINED] + +- api : multiply + args : (const Tensor& x, const Tensor& y) + output : Tensor + infer_meta : + func : ElementwiseInferMeta + param : [x, y, -1] + kernel : + func : elementwise_mul + param : [x, y, -1] + +- api : ones_like + args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) + output : Tensor + invoke : full_like(x, 1, dtype, place, layout) + +- api : reshape + args : (const Tensor& x, const std::vector& shape) + output : Tensor + infer_meta : + func : InferMetaFromVecValue + kernel : + func : reshape2 + +- api : scale + args : (const Tensor& x, const Scalar& scale, 
float bias, bool bias_after_scale) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : scale + +- api : subtract + args : (const Tensor& x, const Tensor& y) + output : Tensor + infer_meta : + func : ElementwiseInferMeta + param : [x, y, -1] + kernel : + func : elementwise_sub + param : [x, y, -1] + +- api : sum + args : (const Tensor& x, const std::vector& axis, bool keep_dim) + output : Tensor + infer_meta : + func : ReduceInferMeta + kernel : + func : reduce_sum + param : [x, axis, keep_dim, false, x.dtype(), DataType::UNDEFINED] + +- api : zeros_like + args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) + output : Tensor + invoke : full_like(x, 0, dtype, place, layout) + +# - api : full_like +# args : (const Tensor& x, const Scalar& value, DataType dtype, Backend place)->Tensor +# output: {Tensor : dtype} +# kernel : fill_any_like +# T : [dtype, x] +# backend : [place, x] +# layout : [] +# InferMeta : UnchangedInferMeta(x) diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py new file mode 100644 index 00000000000000..cd81d001b8f6b0 --- /dev/null +++ b/python/paddle/utils/code_gen/api_gen.py @@ -0,0 +1,452 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import yaml +import argparse + + +class API: + prefix_tensor_name = 'dense_' + + def __init__(self, api_item_yaml): + self.api = api_item_yaml['api'] + # args: + # inputs: + # names : [], list of input names + # attrs: + # names : [], list of attribute names + # attr_info : { attr_name : (type, default_values)} + self.args = self.parse_args(api_item_yaml['args']) + self.output = api_item_yaml['output'] + self.is_base_api = True + if 'invoke' in api_item_yaml: + self.is_base_api = False + self.invoke = api_item_yaml['invoke'] + else: + self.kernel = api_item_yaml['kernel'] + if 'backend' not in self.kernel or len(self.kernel['backend']) == 0: + self.kernel['backend'] = None + if 'layout' not in self.kernel or len(self.kernel['layout']) == 0: + self.kernel['layout'] = None + if 'data_type' not in self.kernel or len(self.kernel[ + 'data_type']) == 0: + self.kernel['data_type'] = None + if 'param' not in self.kernel or len(self.kernel['param']) == 0: + self.kernel['param'] = None + + self.infer_meta = api_item_yaml['infer_meta'] + if 'param' not in self.infer_meta or len(self.infer_meta[ + 'param']) == 0: + self.infer_meta['param'] = None + + def parse_args(self, args_str): + inputs = {'names': []} + attrs = {'names': [], 'attr_info': {}} + args_str = args_str.strip() + assert args_str.startswith('(') and args_str.endswith(')'), \ + f"Args declaration should start with '(' and end with ')', please check the args of {self.api} in api.yaml." 
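+        # For example, the `scale` entry in api.yaml above declares
+        #   args : (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale)
+        # which is parsed here into one input tensor name ('x') and three
+        # attributes ('scale', 'bias', 'bias_after_scale') with their C++ types.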
+ args_str = args_str[1:-1] + args_list = args_str.split(',') + input_types = ['const Tensor&', 'const Tensor &'] + attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \ + 'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \ + 'const std::vector&', 'Backend', 'DataLayout', 'DataType'] + args_declare_str = "" + args_define_str = "" + for item in args_list: + item = item.strip() + # match the input tensor + has_input = False + for in_type in input_types: + if item.startswith(in_type): + input_name = item[len(in_type):].strip() + assert len(input_name) > 0, \ + f"The input tensor name should not be empty. Please check the args of {self.api} in api.yaml." + inputs['names'].append(input_name) + args_declare_str = args_declare_str + in_type + ' ' + input_name + ', ' + args_define_str = args_define_str + in_type + ' ' + input_name + ', ' + has_input = True + break + if has_input: + continue + + # match the attribute + for attr_type in attr_types: + if item.startswith(attr_type): + attr_name = item[len(attr_type):].strip() + assert len(attr_name) > 0, \ + f"The attribute name should not be empty. Please check the args of {self.api} in api.yaml." + default_value = None + if '=' in attr_name: + attr_infos = attr_name.split('=') + attr_name = attr_infos[0].strip() + default_value = attr_infos[1].strip() + + default_value_str = "" if default_value is None else '=' + default_value + args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', ' + args_define_str = args_define_str + attr_type + ' ' + attr_name + ', ' + attrs['names'].append(attr_name) + attrs['attr_info'][attr_name] = (attr_type, default_value) + break + + args = { + 'inputs': inputs, + 'attrs': attrs, + 'args_declare': args_declare_str[:-2], + 'args_define': args_define_str[:-2] + } + return args + + def gene_api_declaration(self): + return f""" +PD_DLL_DECL {self.output} {self.api}({self.args['args_declare']}); +""" + + def gene_kernel_select(self, input_names, attrs, kernel): + + kernel_key_item_init = """ + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; +""" + # Check the tensor options + attr_backend_count = 0 + attr_layout_count = 0 + attr_data_type_count = 0 + for attr_name in attrs['names']: + if attrs['attr_info'][attr_name][0] == 'Backend': + assert kernel['backend'] is not None, \ + f"{self.api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually." + attr_backend_count = attr_backend_count + 1 + if attrs['attr_info'][attr_name][0] == 'DataLayout': + assert kernel['layout'] is not None, \ + f"{self.api} api: When there is a parameter with 'DataLayout' type in attributes, you must set layout of kernel manually." + attr_layout_count = attr_layout_count + 1 + if attrs['attr_info'][attr_name][0] == 'DataType': + assert kernel['data_type'] is not None, \ + f"{self.api} api: When there is a parameter with 'DataType' type in attributes, you must set data_type of kernel manually." + attr_data_type_count = attr_data_type_count + 1 + + # preprocess kernel configures + kernel_select_code = "" + if kernel['backend'] is not None: + if '>' in kernel['backend']: + vars_list = kernel['backend'].split('>') + assert len( + vars_list + ) == 2, f"{self.api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}." 
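+                # e.g. full_like declares `backend : place > x`, which generates
+                #   kernel_backend = ParseBackendWithInputOrder(place, x);
+                # so an explicit `place` wins and the input tensor `x` supplies
+                # the fallback backend.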
+ assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \ + f"{self.api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type." + kernel_select_code = kernel_select_code + f""" + kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); +""" + + else: + args_str = "" + for ele in kernel['backend'].split(','): + args_str = args_str + ele.strip() + ', ' + kernel_select_code = kernel_select_code + f""" + kernel_backend = ParseBackend({args_str[:-2]}); +""" + + if kernel['layout'] is not None: + if '>' in kernel['layout']: + vars_list = kernel['layout'].split('>') + assert len( + vars_list + ) == 2, f"{self.api} api: The number of params to set layout with '>' only allows 2, but received {len(vars_list)}." + assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataLayout', \ + f"{self.api} api: When use '>' to set kernel layout, the first param should be a attribute with DataLayout type." + kernel_select_code = kernel_select_code + f""" + kernel_layout = ParseLayoutWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); +""" + + else: + vars_list = kernel['layout'].split(',') + assert len( + vars_list + ) == 1, f"{self.api} api: The number of params to set layout must be 1, but received {len(vars_list)}." + kernel_select_code = kernel_select_code + f""" + kernel_layout = ParseLayout({vars_list[0].strip()}); +""" + + if kernel['data_type'] is not None: + if '>' in kernel['data_type']: + vars_list = kernel['data_type'].split('>') + assert len( + vars_list + ) == 2, f"{self.api} api: The number of params to set data_type with '>' only allows 2, but received {len(vars_list)}." + assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataType', \ + f"{self.api} api: When use '>' to set kernel data_type, the first param should be a attribute with DataType type." + kernel_select_code = kernel_select_code + f""" + kernel_data_type = ParseDataTypeWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); +""" + + else: + vars_list = kernel['data_type'].split(',') + assert len( + vars_list + ) == 1, f"{self.api} api: The number of params to set data_type only allows 2, but received {len(vars_list)}." + kernel_select_code = kernel_select_code + f""" + kernel_data_type = ParseDataType({vars_list[0].strip()}); +""" + + if len(input_names) == 0: + assert attr_backend_count > 0 and attr_layout_count > 0 and attr_data_type_count > 0, \ + f"{self.api} api: When there is no input tensor, the args must have 'Backend', 'DataLayout' and 'DataType'." 
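+        # For an api without tensor inputs such as `full`, the yaml supplies
+        # `backend : place`, `layout : layout` and `data_type : dtype`, so the
+        # generated selection block is roughly:
+        #   kernel_backend = ParseBackend(place);
+        #   kernel_layout = ParseLayout(layout);
+        #   kernel_data_type = ParseDataType(dtype);
+        # and the ParseKernelKeyByInputArgs fallback below is skipped.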
+ + kernel_select_args = "" + for input_name in input_names: + kernel_select_args = kernel_select_args + input_name + ", " + + if len(kernel_select_args) > 2: + kernel_select_args = kernel_select_args[:-2] + + kernel_select_code = kernel_key_item_init + kernel_select_code + + if len(input_names) > 0: + kernel_select_code = kernel_select_code + f""" + if (kernel_backend == Backend::UNDEFINED + || kernel_layout == DataLayout::UNDEFINED + || kernel_data_type == DataType::UNDEFINED ) {{ + auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args}); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) {{ + kernel_backend = kernel_key.backend(); + }} + if (kernel_layout == DataLayout::UNDEFINED) {{ + kernel_layout = kernel_key.layout(); + }} + if (kernel_data_type == DataType::UNDEFINED) {{ + kernel_data_type = kernel_key.dtype(); + }} + }}""" + + kernel_select_code = kernel_select_code + f""" + auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( + "{kernel['func']}", {{kernel_backend, kernel_layout, kernel_data_type}}); + VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; + VLOG(6) << "{self.api} API kernel: " << kernel;""" + + return kernel_select_code + + def gene_infer_meta(self, input_names, attr_names, infer_meta) -> str: + infer_meta_params = infer_meta['param'] if infer_meta[ + 'param'] is not None else input_names + attr_names + param_code = "" + for param in infer_meta_params: + if param in input_names: + param_code = param_code + self.prefix_tensor_name + param + "->meta(), " + elif param in attr_names: + param_code = param_code + param + ", " + elif isinstance(param, str): + param_code = param_code + "\"" + param + "\", " + elif isinstance(param, bool): + param_code = param_code + str(param).lower() + ", " + else: + param_code = param_code + str(param) + ", " + + param_code = param_code[:-2] + return f""" + auto out_meta = pten::{infer_meta['func']}({param_code}); +""" + + def gene_kernel_context(self, input_names, attrs, infer_meta, kernel_param): + attr_names = attrs['names'] + if kernel_param is None: + kernel_param = input_names + attr_names + + input_code_str = "" + attr_code_str = "" + for param in kernel_param: + if param in input_names: + # set input for kernel_context + input_code_str = input_code_str + f""" + auto {self.prefix_tensor_name}{param} = std::dynamic_pointer_cast({param}.impl()); + kernel_context.EmplaceBackInput({self.prefix_tensor_name}{param});""" + + elif param in attr_names: + # set attr for kernel_context + if 'ScalarArray' in attrs['attr_info'][param][0]: + param = 'pten::ScalarArray(' + param + ')' + elif 'Scalar' in attrs['attr_info'][param][0]: + param = 'pten::Scalar(' + param + ')' + attr_code_str = attr_code_str + f""" + kernel_context.EmplaceBackAttr({param});""" + + elif isinstance(param, bool): + attr_code_str = attr_code_str + f""" + kernel_context.EmplaceBackAttr({str(param).lower()});""" + + else: + attr_code_str = attr_code_str + f""" + kernel_context.EmplaceBackAttr({param});""" + + return f""" + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + auto kernel_context = pten::KernelContext(dev_ctx); +{input_code_str} +{attr_code_str} +{self.gene_infer_meta(input_names, attr_names, infer_meta)} + const auto allocator = + std::make_shared( + pten::TransToFluidPlace(kernel_backend)); + auto dense_out = std::make_shared(allocator, out_meta); + 
kernel_context.EmplaceBackOutput(dense_out); + + Tensor out; + out.set_impl(dense_out);""" + + def gene_api_code(self): + if self.is_base_api: + return f""" +PD_DLL_DECL {self.output} {self.api}({self.args["args_define"]}) {{ +{self.gene_kernel_select(self.args['inputs']['names'], self.args['attrs'], self.kernel)} +{self.gene_kernel_context(self.args['inputs']['names'], self.args['attrs'], self.infer_meta, self.kernel['param'])} + + kernel(&kernel_context); + return out; +}} +""" + + else: + return f""" +PD_DLL_DECL {self.output} {self.api}({self.args["args_define"]}) {{ + return {self.invoke}; +}} +""" + + +def header_include(): + return """ +#include "paddle/pten/api/include/tensor.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/common/scalar_array.h" +""" + + +def source_include(header_file_path): + return f""" +#include "{header_file_path}" +#include + +#include "glog/logging.h" + +#include "paddle/pten/api/lib/api_registry.h" +#include "paddle/pten/api/lib/kernel_dispatch.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/include/infermeta.h" +""" + + +def module_declare(): + return """ +PT_DECLARE_MODULE(CreationCPU); +PT_DECLARE_MODULE(LinalgCPU); +PT_DECLARE_MODULE(ManipulationCPU); +PT_DECLARE_MODULE(MathCPU); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_MODULE(CreationCUDA); +PT_DECLARE_MODULE(LinalgCUDA); +PT_DECLARE_MODULE(ManipulationCUDA); +PT_DECLARE_MODULE(MathCUDA); +#endif +""" + + +def api_register(): + return """ +PT_REGISTER_API(Creation); +PT_REGISTER_API(Linalg); +PT_REGISTER_API(Manipulation); +PT_REGISTER_API(Math); +""" + + +def api_namespace(): + return (""" +namespace paddle { +namespace experimental { + +""", """ + +} // namespace experimental +} // namespace paddle +""") + + +def generate_api(api_yaml_path, header_file_path, source_file_path): + + with open(api_yaml_path, 'r') as f: + apis = yaml.load(f, Loader=yaml.FullLoader) + header_file = open(header_file_path, 'w') + source_file = open(source_file_path, 'w') + + namespace = api_namespace() + + header_file.write("#pragma once\n") + header_file.write(header_include()) + header_file.write(namespace[0]) + + include_header_file = "paddle/pten/api/include/api.h" + source_file.write(source_include(include_header_file)) + source_file.write(module_declare()) + source_file.write(namespace[0]) + + for api in apis: + api_code = API(api) + print(api_code.gene_api_declaration()) + header_file.write(api_code.gene_api_declaration()) + source_file.write(api_code.gene_api_code()) + + header_file.write(namespace[1]) + source_file.write(namespace[1]) + source_file.write(api_register()) + + header_file.close() + source_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle C++ API files') + parser.add_argument( + '--api_yaml_path', + help='path to yaml file directory', + default='python/paddle/utils/code_gen/api.yaml') + parser.add_argument( + '--api_header_path', + help='output of generated api header code file', + default='paddle/pten/api/include/api.h') + + parser.add_argument( + '--api_source_path', + help='output of generated api source code file', + default='paddle/pten/api/lib/api.cc') + + options = parser.parse_args() + + api_yaml_path = options.api_yaml_path + header_file_path = options.api_header_path + source_file_path = options.api_source_path + + generate_api(api_yaml_path, header_file_path, source_file_path) + 
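+# For example, regenerating the files with the default paths above:
+#   python python/paddle/utils/code_gen/api_gen.py \
+#     --api_yaml_path python/paddle/utils/code_gen/api.yaml \
+#     --api_header_path paddle/pten/api/include/api.h \
+#     --api_source_path paddle/pten/api/lib/api.cc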
+ +if __name__ == '__main__': + main() From 07b4fe93f8d0f70350b68a63aadfff374219f880 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 3 Dec 2021 10:14:18 +0800 Subject: [PATCH 055/124] [Eager] publish python c api for eager (#37550) * refine a test case, test=develop * publish python c api for eager, test=develop * revert modify about test_allclose_layer.py, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * delete numpy includes, use pybind11 numpy.h, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * suport eager error msg, and add grad test case, test=develop * refine, test=develop * refine, test=develop --- paddle/fluid/pybind/CMakeLists.txt | 6 + paddle/fluid/pybind/eager.cc | 132 +++++++ paddle/fluid/pybind/eager.h | 24 ++ paddle/fluid/pybind/eager_functions.cc | 223 ++++++++++++ paddle/fluid/pybind/eager_method.cc | 109 ++++++ paddle/fluid/pybind/eager_properties.cc | 155 ++++++++ paddle/fluid/pybind/eager_utils.cc | 339 ++++++++++++++++++ paddle/fluid/pybind/eager_utils.h | 54 +++ paddle/fluid/pybind/exception.cc | 43 +++ paddle/fluid/pybind/exception.h | 16 + paddle/fluid/pybind/op_function.h | 45 +-- paddle/fluid/pybind/pybind.cc | 48 ++- paddle/pten/core/CMakeLists.txt | 6 +- paddle/pten/core/convert_utils.cc | 126 ++++++- paddle/pten/core/convert_utils.h | 5 + python/paddle/fluid/__init__.py | 4 + python/paddle/fluid/eager/__init__.py | 20 ++ .../fluid/eager/eager_tensor_patch_methods.py | 23 ++ python/paddle/fluid/framework.py | 22 +- .../tests/unittests/test_egr_python_api.py | 103 ++++++ python/paddle/tensor/creation.py | 7 + python/paddle/tensor/to_string.py | 36 ++ python/setup.py.in | 1 + 23 files changed, 1485 insertions(+), 62 deletions(-) create mode 100644 paddle/fluid/pybind/eager.cc create mode 100644 paddle/fluid/pybind/eager.h create mode 100644 paddle/fluid/pybind/eager_functions.cc create mode 100644 paddle/fluid/pybind/eager_method.cc create mode 100644 paddle/fluid/pybind/eager_properties.cc create mode 100644 paddle/fluid/pybind/eager_utils.cc create mode 100644 paddle/fluid/pybind/eager_utils.h create mode 100644 python/paddle/fluid/eager/__init__.py create mode 100644 python/paddle/fluid/eager/eager_tensor_patch_methods.py create mode 100644 python/paddle/fluid/tests/unittests/test_egr_python_api.py diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 588caed5a452ef..41708ef8611e42 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -212,6 +212,12 @@ if(WITH_PYTHON) add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) list(APPEND PYBIND_DEPS interpretercore standalone_executor) + + cc_library(paddle_eager + SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc + DEPS autograd_meta grad_node_info pten global_utils utils eager_api accumulation_node backward python) + list(APPEND PYBIND_DEPS paddle_eager) + cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc new file mode 100644 index 00000000000000..5be000844bcf17 --- /dev/null +++ b/paddle/fluid/pybind/eager.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// disable numpy compile error +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +PyTypeObject* p_eager_tensor_type; + +PyObject* eagertensor_new(PyTypeObject* type, PyObject* args, + PyObject* kwargs) { + PyObject* obj = type->tp_alloc(type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + } + return obj; +} + +static void eagertensor_dealloc(EagerTensorObject* self) { + self->eagertensor.~EagerTensor(); + Py_TYPE(self)->tp_free(reinterpret_cast(self)); +} + +extern struct PyGetSetDef variable_properties[]; + +extern PyMethodDef variable_methods[]; + +PyTypeObject eager_tensor_type = { + PyVarObject_HEAD_INIT(NULL, 0) "core_avx.eager.EagerTensor", /* tp_name */ + sizeof(EagerTensorObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)eagertensor_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | + Py_TPFLAGS_HEAPTYPE, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + variable_methods, /* tp_methods */ + 0, /* tp_members */ + variable_properties, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + eagertensor_new, /* tp_new */ + 0, /* tp_free */ + 0, /* tp_is_gc */ + 0, /* tp_bases */ + 0, /* tp_mro */ + 0, /* tp_cache */ + 0, /* tp_subclasses */ + 0, /* tp_weaklist */ + 0, /* tp_del */ + 0, /* tp_version_tag */ + 0 /* tp_finalize */ +}; + +void BindEager(pybind11::module* module) { + auto m = module->def_submodule("eager"); + + p_eager_tensor_type = &eager_tensor_type; + if (PyType_Ready(&eager_tensor_type) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindEager(PyType_Ready).")); + return; + } + + Py_INCREF(&eager_tensor_type); + if (PyModule_AddObject(m.ptr(), "EagerTensor", + reinterpret_cast(&eager_tensor_type)) < 0) { + Py_DECREF(&eager_tensor_type); + Py_DECREF(m.ptr()); + 
PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindEager(PyModule_AddObject).")); + return; + } + + BindFunctions(m.ptr()); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h new file mode 100644 index 00000000000000..c1a869d9b89faa --- /dev/null +++ b/paddle/fluid/pybind/eager.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +void BindEager(pybind11::module* m); +void BindFunctions(PyObject* module); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc new file mode 100644 index 00000000000000..8c0f9ddf19f120 --- /dev/null +++ b/paddle/fluid/pybind/eager_functions.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +extern PyTypeObject* p_eager_tensor_type; + +size_t PyArray_Size_(PyObject* numpy_data) { + size_t res = 1; + auto dims = pybind11::detail::array_proxy(numpy_data)->dimensions; + auto nd = pybind11::detail::array_proxy(numpy_data)->nd; + while (nd--) { + res *= (*dims++); + } + return res; +} + +class EagerNumpyAllocation : public paddle::memory::allocation::Allocation { + public: + explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) + : Allocation( + static_cast(pybind11::detail::array_proxy(numpy_data)->data), + pten::DataTypeSize(dtype) * PyArray_Size_(numpy_data), + paddle::platform::CPUPlace()), + arr_(numpy_data) { + PADDLE_ENFORCE_NOT_NULL(arr_, platform::errors::InvalidArgument( + "The underlying PyObject pointer of " + "numpy array cannot be nullptr")); + PADDLE_ENFORCE_NE( + arr_, Py_None, + platform::errors::PreconditionNotMet( + "The underlying PyObject pointer of numpy array cannot be None")); + Py_INCREF(arr_); + } + ~EagerNumpyAllocation() override { + py::gil_scoped_acquire gil; + Py_DECREF(arr_); + } + + private: + PyObject* arr_; +}; + +static PyObject* eager_api_set_expected_place(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + egr::Controller::Instance().SetExpectedPlace(place); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_scale(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Sync Tensor and Variable here when we support + egr::EagerTensor ret = + egr::scale(reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eagertensor, + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 1), 1), + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 2), 2), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4)); + return ToPyObject(ret); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_numpy_to_tensor(PyObject* numpy_data, + pten::DataType dtype, + const paddle::platform::Place& place, + bool stop_gradient) { + std::vector vec_dims; + auto numpy_shape = pybind11::detail::array_proxy(numpy_data)->dimensions; + int rank = pybind11::detail::array_proxy(numpy_data)->nd; + for (int i = 0; i < rank; i++) { + vec_dims.push_back(static_cast(numpy_shape[i])); + } + paddle::framework::DDim dims = paddle::framework::make_ddim(vec_dims); + + // TODO(jiabin): Support GPU later + auto meta = pten::DenseTensorMeta(dtype, dims); + 
auto holder = std::make_shared(numpy_data, dtype); + auto shared_storage = + pten::make_intrusive(holder, 0); + std::shared_ptr densetensor( + new pten::DenseTensor(std::move(shared_storage), std::move(meta))); + + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor.set_impl(densetensor); + v->eagertensor.set_name(egr::Controller::Instance().GenerateUniqueName()); + auto meta = egr::EagerUtils::autograd_meta(&(v->eagertensor)); + meta->SetStopGradient(stop_gradient); + + // Created tensor will be leaf tensor + // So we append AccumulationNode to it. + auto accumulation_node = std::make_shared(); + meta->SetGradNode(accumulation_node); + + // TODO(jiabin): Shall we increase ref cnt here to make python ref cnt num + // correctly? + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + + return obj; +} + +static PyObject* eager_api_to_tensor(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Support Kwargs here + PyObject* data = PyTuple_GET_ITEM(args, 0); + auto str_dtype = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + pten::DataType dtype = pten::String2DataType(str_dtype); + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); + bool stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + // TODO(jiabin): Support this when python given name + // auto str_name = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 4), 4); + + if (pybind11::detail::npy_api::get().PyArray_Check_(data)) { + return eager_api_numpy_to_tensor(data, dtype, place, stop_gradient); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Eater to_tensor only support numpy to tensor.")); + Py_INCREF(Py_None); + return Py_None; + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_retain_grad_for_tensor(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + egr::egr_utils_api::RetainGradForTensor( + CastPyArg2EagerTensor(PyTuple_GET_ITEM(args, 0), 0)); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensors = + CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 1), 1); + RunBackward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_functions[] = { + {"to_tensor", (PyCFunction)(void (*)(void))eager_api_to_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"scale", (PyCFunction)(void (*)(void))eager_api_scale, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_expected_place", + (PyCFunction)(void (*)(void))eager_api_set_expected_place, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"retain_grad_for_tensor", + (PyCFunction)(void (*)(void))eager_api_retain_grad_for_tensor, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +void BindFunctions(PyObject* module) { + if (PyModule_AddFunctions(module, variable_functions) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindFunctions(PyModule_AddFunctions).")); + return; + } +} + +} // namespace pybind +} // 
namespace paddle diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc new file mode 100644 index 00000000000000..f040566260c74a --- /dev/null +++ b/paddle/fluid/pybind/eager_method.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +namespace paddle { +namespace pybind { + +extern PyTypeObject* pEagerTensorType; + +static PyObject* eager_tensor_method_numpy(EagerTensorObject* self, + PyObject* args, PyObject* kwargs) { + EAGER_TRY + if (!self->eagertensor.initialized()) { + Py_INCREF(Py_None); + return Py_None; + } + auto tensor_dims = self->eagertensor.shape(); + auto numpy_dtype = pten::TensorDtype2NumpyDtype(self->eagertensor.type()); + auto sizeof_dtype = pten::DataTypeSize(self->eagertensor.type()); + Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; + Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank]; + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + py_dims[i] = static_cast(tensor_dims[i]); + py_strides[i] = sizeof_dtype * numel; + numel *= py_dims[i]; + } + auto& api = pybind11::detail::npy_api::get(); + PyObject* array = api.PyArray_NewFromDescr_( + api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype), + tensor_dims.size(), py_dims, py_strides, nullptr, + pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ | + pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, + nullptr); + + if (self->eagertensor.is_cpu()) { + auto dense_tensor = + std::dynamic_pointer_cast(self->eagertensor.impl()); + platform::CPUPlace place; + // deep copy + paddle::memory::Copy(place, reinterpret_cast( + pybind11::detail::array_proxy(array)->data), + place, dense_tensor->data(), sizeof_dtype * numel); +#if defined(PADDLE_WITH_CUDA) + } else if (self->eagertensor.is_cuda()) { + auto dense_tensor = + std::dynamic_pointer_cast(self->eagertensor.impl()); + + paddle::platform::GpuMemcpySync( + pybind11::detail::array_proxy(array)->data, dense_tensor->data(), + pten::DataTypeSize(dense_tensor->dtype()) * dense_tensor->numel(), + cudaMemcpyDeviceToHost); +#endif + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Tensor.numpy() only support cpu tensor.")); + Py_INCREF(Py_None); + return Py_None; + } + + return array; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_tensor_method_is_initialized(EagerTensorObject* self, 
+ PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->eagertensor.initialized()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_methods[] = { + {"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_initialized", + (PyCFunction)(void (*)(void))eager_tensor_method_is_initialized, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc new file mode 100644 index 00000000000000..a13e4836d141a8 --- /dev/null +++ b/paddle/fluid/pybind/eager_properties.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// disable numpy compile error +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" +#pragma GCC diagnostic ignored "-Wwrite-strings" + +namespace paddle { +namespace pybind { + +extern PyTypeObject* p_eager_tensor_type; + +PyObject* eager_tensor_properties_get_name(EagerTensorObject* self, + void* closure) { + EAGER_TRY + return ToPyObject(self->eagertensor.name()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_name(EagerTensorObject* self, PyObject* value, + void* closure) { + EAGER_TRY + self->eagertensor.set_name(CastPyArg2AttrString(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_stop_gradient(EagerTensorObject* self, + void* closure) { + EAGER_TRY + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + return ToPyObject(meta->StopGradient()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self, + void* closure) { + EAGER_TRY + auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eagertensor); + return ToPyObject(meta->Grad()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_stop_gradient(EagerTensorObject* self, + PyObject* value, void* closure) { + EAGER_TRY + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + meta->SetStopGradient(CastPyArg2AttrBoolean(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_persistable(EagerTensorObject* self, + void* closure) { + EAGER_TRY + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + return ToPyObject(meta->Persistable()); + 
EAGER_CATCH_AND_THROW_RETURN_NULL +} + +int eager_tensor_properties_set_persistable(EagerTensorObject* self, + PyObject* value, void* closure) { + EAGER_TRY + auto meta = egr::EagerUtils::autograd_meta(&self->eagertensor); + meta->SetPersistable(CastPyArg2AttrBoolean(value, 0)); + return 0; + EAGER_CATCH_AND_THROW_RETURN_ZERO +} + +PyObject* eager_tensor_properties_get_shape(EagerTensorObject* self, + void* closure) { + EAGER_TRY + auto ddim = self->eagertensor.shape(); + std::vector value; + size_t rank = static_cast(ddim.size()); + value.resize(rank); + for (size_t i = 0; i < rank; i++) { + value[i] = ddim[i]; + } + + return ToPyObject(value); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_place(EagerTensorObject* self, + void* closure) { + EAGER_TRY + return ToPyObject(self->eagertensor.place()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_place_str(EagerTensorObject* self, + void* closure) { + EAGER_TRY + std::stringstream ostr; + ostr << self->eagertensor.place(); + return ToPyObject(ostr.str()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyObject* eager_tensor_properties_get_dtype(EagerTensorObject* self, + void* closure) { + EAGER_TRY + return ToPyObject(pten::DataType2String(self->eagertensor.type())); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +struct PyGetSetDef variable_properties[] = { + {"grad", (getter)eager_tensor_properties_get_grad, nullptr, nullptr, + nullptr}, + {"name", (getter)eager_tensor_properties_get_name, + (setter)eager_tensor_properties_set_name, nullptr, nullptr}, + {"stop_gradient", (getter)eager_tensor_properties_get_stop_gradient, + (setter)eager_tensor_properties_set_stop_gradient, nullptr, nullptr}, + {"persistable", (getter)eager_tensor_properties_get_persistable, + (setter)eager_tensor_properties_set_persistable, nullptr, nullptr}, + {"shape", (getter)eager_tensor_properties_get_shape, nullptr, nullptr, + nullptr}, + // {"is_leaf", (getter)eager_tensor_properties_get_is_leaf, nullptr, + // nullptr, + // nullptr}, + {"place", (getter)eager_tensor_properties_get_place, nullptr, nullptr, + nullptr}, + {"_place_str", (getter)eager_tensor_properties_get_place_str, nullptr, + nullptr, nullptr}, + {"dtype", (getter)eager_tensor_properties_get_dtype, nullptr, nullptr, + nullptr}, + {nullptr, nullptr, nullptr, nullptr, nullptr}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc new file mode 100644 index 00000000000000..9268fc8e7b976c --- /dev/null +++ b/paddle/fluid/pybind/eager_utils.cc @@ -0,0 +1,339 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include + +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +extern PyTypeObject* p_eager_tensor_type; + +extern PyTypeObject* g_place_pytype; +extern PyTypeObject* g_cudaplace_pytype; +extern PyTypeObject* g_cpuplace_pytype; +extern PyTypeObject* g_xpuplace_pytype; +extern PyTypeObject* g_npuplace_pytype; +extern PyTypeObject* g_cudapinnedplace_pytype; + +bool PyObject_CheckLongOrConvertToLong(PyObject** obj) { + if ((PyLong_Check(*obj) && !PyBool_Check(*obj))) { + return true; + } + + if (std::string((reinterpret_cast((*obj)->ob_type))->tp_name) + .find("numpy") != std::string::npos) { + auto to = PyNumber_Long(*obj); + if (to) { + *obj = to; + return true; + } + } + + return false; +} + +bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj) { + // sometimes users provide PyLong or numpy.int64 but attr is float + if (PyFloat_Check(*obj) || PyLong_Check(*obj)) { + return true; + } + if (std::string((reinterpret_cast((*obj)->ob_type))->tp_name) + .find("numpy") != std::string::npos) { + auto to = PyNumber_Float(*obj); + if (to) { + *obj = to; + return true; + } + } + return false; +} + +bool PyObject_CheckStr(PyObject* obj) { return PyUnicode_Check(obj); } + +bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) { + if (obj == Py_None) { + return false; // To be compatible with QA integration testing. Some + // test case pass in None. 
+ } else if (obj == Py_True) { + return true; + } else if (obj == Py_False) { + return false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "bool, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckLongOrConvertToLong(&obj)) { + return static_cast(PyLong_AsLong(obj)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "int, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckLongOrConvertToLong(&obj)) { + return (int64_t)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "long, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckFloatOrConvertToFloat(&obj)) { + return static_cast(PyFloat_AsDouble(obj)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "float, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + } +} + +std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos) { + if (PyObject_CheckStr(obj)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(obj, &size); + return std::string(data, static_cast(size)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "str, but got %s", + arg_pos + 1, (reinterpret_cast(obj->ob_type))->tp_name)); + return ""; + } +} + +egr::EagerTensor CastPyArg2EagerTensor(PyObject* obj, ssize_t arg_pos) { + if (PyObject_IsInstance(obj, + reinterpret_cast(p_eager_tensor_type))) { + return reinterpret_cast(obj)->eagertensor; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "EagerTensor, but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } +} + +std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, + ssize_t arg_pos) { + std::vector result; + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_IsInstance( + item, reinterpret_cast(p_eager_tensor_type))) { + result.emplace_back( + reinterpret_cast(item)->eagertensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of bool, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, i)); + } + } + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_IsInstance( + item, reinterpret_cast(p_eager_tensor_type))) { + result.emplace_back( + reinterpret_cast(item)->eagertensor); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of bool, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, i)); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list or tuple, but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } + return result; +} + +platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) { + 
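// Map a Python place object (Place / CUDAPlace / CPUPlace / XPUPlace /
+  // NPUPlace / CUDAPinnedPlace, captured as the g_*_pytype globals in
+  // pybind.cc) back to a platform::Place; any other type raises
+  // InvalidArgument.
+ 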
platform::Place place; + if (PyObject_IsInstance(obj, reinterpret_cast(g_place_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cudaplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cpuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_xpuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_npuplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_cudapinnedplace_pytype))) { + place = ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), " + "but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } + return place; +} + +PyObject* ToPyObject(bool value) { + if (value) { + Py_INCREF(Py_True); + return Py_True; + } else { + Py_INCREF(Py_False); + return Py_False; + } +} + +PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } + +PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } + +PyObject* ToPyObject(float value) { return PyFloat_FromDouble(value); } + +PyObject* ToPyObject(double value) { return PyFloat_FromDouble(value); } + +PyObject* ToPyObject(const char* value) { return PyUnicode_FromString(value); } + +PyObject* ToPyObject(const std::string& value) { + return PyUnicode_FromString(value.c_str()); +} + +PyObject* ToPyObject(const egr::EagerTensor& value) { + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor = value; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + return obj; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, (Py_ssize_t)i, ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyList_SET_ITEM(result, static_cast(i), ToPyObject(value[i])); + } + + return result; +} + +PyObject* ToPyObject(const std::vector& value) { + PyObject* result = PyList_New((Py_ssize_t)value.size()); + + for (size_t i = 0; i < value.size(); i++) { + PyObject* obj = p_eager_tensor_type->tp_alloc(p_eager_tensor_type, 0); + if (obj) { + auto v =
reinterpret_cast(obj); + new (&(v->eagertensor)) egr::EagerTensor(); + v->eagertensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); + } + + return result; +} + +PyObject* ToPyObject(const platform::Place& value) { + auto obj = ::pybind11::cast(value); + obj.inc_ref(); + return obj.ptr(); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h new file mode 100644 index 00000000000000..49f56a61c31f1f --- /dev/null +++ b/paddle/fluid/pybind/eager_utils.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +typedef struct { + PyObject_HEAD egr::EagerTensor eagertensor; +} EagerTensorObject; + +bool PyObject_CheckLongOrConvertToLong(PyObject** obj); +bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj); +bool PyObject_CheckStr(PyObject* obj); +bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos); +int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); +int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); +float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); +std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); +egr::EagerTensor CastPyArg2EagerTensor(PyObject* obj, ssize_t arg_pos); +std::vector CastPyArg2VectorOfEagerTensor(PyObject* obj, + ssize_t arg_pos); +platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos); + +PyObject* ToPyObject(int value); +PyObject* ToPyObject(bool value); +PyObject* ToPyObject(int64_t value); +PyObject* ToPyObject(float value); +PyObject* ToPyObject(double value); +PyObject* ToPyObject(const char* value); +PyObject* ToPyObject(const std::string& value); +PyObject* ToPyObject(const egr::EagerTensor& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const platform::Place& value); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 3d07985ff654e6..362a3e44fab625 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -81,5 +81,48 @@ void BindException(pybind11::module* m) { }); } +void ThrowExceptionToPython(std::exception_ptr p) { + static PyObject* EOFExceptionException = + PyErr_NewException("paddle.EOFException", PyExc_Exception, NULL); + static PyObject* EnforceNotMetException = + PyErr_NewException("paddle.EnforceNotMet", PyExc_Exception, NULL); + try { + if (p) std::rethrow_exception(p); + } catch (const platform::EOFException& e) { + 
PyErr_SetString(EOFExceptionException, e.what()); + } catch (const platform::EnforceNotMet& e) { + switch (e.code()) { + case paddle::platform::error::INVALID_ARGUMENT: + PyErr_SetString(PyExc_ValueError, e.what()); + break; + case paddle::platform::error::NOT_FOUND: + case paddle::platform::error::ALREADY_EXISTS: + case paddle::platform::error::PRECONDITION_NOT_MET: + case paddle::platform::error::PERMISSION_DENIED: + case paddle::platform::error::EXECUTION_TIMEOUT: + case paddle::platform::error::UNAVAILABLE: + PyErr_SetString(PyExc_RuntimeError, e.what()); + break; + case paddle::platform::error::OUT_OF_RANGE: + PyErr_SetString(PyExc_IndexError, e.what()); + break; + case paddle::platform::error::RESOURCE_EXHAUSTED: + PyErr_SetString(PyExc_MemoryError, e.what()); + break; + case paddle::platform::error::UNIMPLEMENTED: + PyErr_SetString(PyExc_NotImplementedError, e.what()); + break; + case paddle::platform::error::FATAL: + PyErr_SetString(PyExc_SystemError, e.what()); + break; + case paddle::platform::error::EXTERNAL: + PyErr_SetString(PyExc_OSError, e.what()); + break; + default: + PyErr_SetString(EnforceNotMetException, e.what()); + break; + } + } +} } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h index 5e054267361f2c..cf82f464a11f29 100644 --- a/paddle/fluid/pybind/exception.h +++ b/paddle/fluid/pybind/exception.h @@ -18,10 +18,26 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "pybind11/pybind11.h" +#define EAGER_TRY try { +#define EAGER_CATCH_AND_THROW_RETURN_NULL \ + } \ + catch (...) { \ + ThrowExceptionToPython(std::current_exception()); \ + return nullptr; \ + } + +#define EAGER_CATCH_AND_THROW_RETURN_ZERO \ + } \ + catch (...) 
{ \ + ThrowExceptionToPython(std::current_exception()); \ + return 0; \ + } + namespace paddle { namespace pybind { void BindException(pybind11::module* m); +void ThrowExceptionToPython(std::exception_ptr p); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 324cd4b1b161f5..5535ffd950f37d 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -29,6 +29,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/imperative.h" namespace py = pybind11; @@ -992,50 +993,6 @@ void InitOpsAttrTypeMap() { } } -void ThrowExceptionToPython(std::exception_ptr p) { - static PyObject* EOFExceptionException = - PyErr_NewException("paddle.EOFException", PyExc_Exception, NULL); - static PyObject* EnforceNotMetException = - PyErr_NewException("paddle.EnforceNotMet", PyExc_Exception, NULL); - try { - if (p) std::rethrow_exception(p); - } catch (const platform::EOFException& e) { - PyErr_SetString(EOFExceptionException, e.what()); - } catch (const platform::EnforceNotMet& e) { - switch (e.code()) { - case paddle::platform::error::INVALID_ARGUMENT: - PyErr_SetString(PyExc_ValueError, e.what()); - break; - case paddle::platform::error::NOT_FOUND: - case paddle::platform::error::ALREADY_EXISTS: - case paddle::platform::error::PRECONDITION_NOT_MET: - case paddle::platform::error::PERMISSION_DENIED: - case paddle::platform::error::EXECUTION_TIMEOUT: - case paddle::platform::error::UNAVAILABLE: - PyErr_SetString(PyExc_RuntimeError, e.what()); - break; - case paddle::platform::error::OUT_OF_RANGE: - PyErr_SetString(PyExc_IndexError, e.what()); - break; - case paddle::platform::error::RESOURCE_EXHAUSTED: - PyErr_SetString(PyExc_MemoryError, e.what()); - break; - case paddle::platform::error::UNIMPLEMENTED: - PyErr_SetString(PyExc_NotImplementedError, e.what()); - break; - case paddle::platform::error::FATAL: - PyErr_SetString(PyExc_SystemError, e.what()); - break; - case paddle::platform::error::EXTERNAL: - PyErr_SetString(PyExc_OSError, e.what()); - break; - default: - PyErr_SetString(EnforceNotMetException, e.what()); - break; - } - } -} - } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c06d6961c4fd94..5fc1f27eff36f6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -75,6 +75,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/io.h" #include "paddle/utils/none.h" #ifdef PADDLE_WITH_ASCEND @@ -150,6 +151,14 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); namespace paddle { namespace pybind { + +PyTypeObject *g_place_pytype = nullptr; +PyTypeObject *g_cudaplace_pytype = nullptr; +PyTypeObject *g_cpuplace_pytype = nullptr; +PyTypeObject *g_xpuplace_pytype = nullptr; +PyTypeObject *g_npuplace_pytype = nullptr; +PyTypeObject *g_cudapinnedplace_pytype = nullptr; + bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) return false; @@ -524,6 +533,7 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif + BindEager(&m); BindCudaStream(&m); // Not used, just make sure cpu_info.cc is linked. 
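With BindEager wired into the module init above, the functions registered in variable_functions (to_tensor, scale, retain_grad_for_tensor, run_backward, _set_expected_place) become reachable as core.eager.* from Python. A minimal sketch of that flow, mirroring the calls made by the unit test added later in this patch; argument orders follow eager_api_to_tensor and the test code, and nothing here is a public API yet:

import numpy as np
import paddle
import paddle.fluid.core as core
from paddle.fluid import eager_guard

with eager_guard():
    paddle.set_device("cpu")
    arr = np.ones([4, 16], dtype='float32')
    # to_tensor(data, dtype_str, place, stop_gradient)
    x = core.eager.to_tensor(arr, 'float32', core.CPUPlace(), False)
    core.eager.retain_grad_for_tensor(x)              # keep x's grad after backward
    out = core.eager.scale(x, 2.0, 0.9, True, True)   # same call pattern as the test below
    grad = core.eager.to_tensor(arr, 'float32', core.CPUPlace(), True)
    core.eager.run_backward([out], [grad], False)
    print(x.grad.numpy())                             # populated by run_backward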
@@ -1599,7 +1609,7 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif - py::class_(m, "CUDAPlace", R"DOC( + py::class_ cudaplace(m, "CUDAPlace", R"DOC( CUDAPlace is a descriptor of a device. It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1622,7 +1632,9 @@ All parameter, weight, gradient are variables in Paddle. place = paddle.CUDAPlace(0) - )DOC") + )DOC"); + g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); + cudaplace .def("__init__", [](platform::CUDAPlace &self, int dev_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1680,13 +1692,15 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); - py::class_(m, "XPUPlace", R"DOC( + py::class_ xpuplace(m, "XPUPlace", R"DOC( **Note**: Examples: .. code-block:: python import paddle.fluid as fluid xpu_place = fluid.XPUPlace(0) - )DOC") + )DOC"); + g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); + xpuplace .def("__init__", [](platform::XPUPlace &self, int dev_id) { #ifdef PADDLE_WITH_XPU @@ -1756,7 +1770,7 @@ All parameter, weight, gradient are variables in Paddle. }); #endif - py::class_(m, "CPUPlace", R"DOC( + py::class_ cpuplace(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device on which a tensor will be allocated and a model will run. @@ -1766,8 +1780,9 @@ All parameter, weight, gradient are variables in Paddle. import paddle cpu_place = paddle.CPUPlace() - )DOC") - .def(py::init<>()) + )DOC"); + g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); + cpuplace.def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) @@ -1779,7 +1794,8 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); - py::class_(m, "CUDAPinnedPlace", R"DOC( + py::class_ cudapinnedplace( + m, "CUDAPinnedPlace", R"DOC( CUDAPinnedPlace is a descriptor of a device. It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. The host operating system will not paging and exchanging the memory. @@ -1793,7 +1809,10 @@ All parameter, weight, gradient are variables in Paddle. import paddle place = paddle.CUDAPinnedPlace() - )DOC") + )DOC"); + g_cudapinnedplace_pytype = + reinterpret_cast(cudapinnedplace.ptr()); + cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) @@ -1819,7 +1838,7 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); // NPUPlace - py::class_(m, "NPUPlace", R"DOC( + py::class_ npuplace(m, "NPUPlace", R"DOC( NPUPlace is a descriptor of a device. It represents a NPU device on which a tensor will be allocated and a model will run. @@ -1828,7 +1847,9 @@ All parameter, weight, gradient are variables in Paddle. import paddle npu_place = paddle.NPUPlace(0) - )DOC") + )DOC"); + g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); + npuplace .def("__init__", [](platform::NPUPlace &self, int dev_id) { #ifdef PADDLE_WITH_ASCEND_CL @@ -1879,8 +1900,9 @@ All parameter, weight, gradient are variables in Paddle. 
[](const platform::NPUPlace &self) { return self.GetDeviceId(); }) .def("__str__", string::to_string); - py::class_(m, "Place") - .def(py::init<>()) + py::class_ platformplace(m, "Place"); + g_place_pytype = reinterpret_cast(platformplace.ptr()); + platformplace.def(py::init<>()) .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index e19d0a490cef39..0a2504f50327c1 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -1,9 +1,9 @@ if(WITH_GPU) - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info python) elseif(WITH_ROCM) - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info python) else() - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place python) endif() cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce) diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc index 92709647dac00d..8b54813eadf327 100644 --- a/paddle/pten/core/convert_utils.cc +++ b/paddle/pten/core/convert_utils.cc @@ -11,8 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/pten/core/convert_utils.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/pybind/tensor_py.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/gpu_info.h" @@ -180,4 +181,127 @@ pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod) { return out; } +size_t DataTypeSize(DataType dtype) { + switch (dtype) { + case DataType::UNDEFINED: + return 0; + case DataType::BOOL: + return sizeof(bool); + case DataType::INT8: + return sizeof(int8_t); + case DataType::UINT8: + return sizeof(uint8_t); + case DataType::INT16: + return sizeof(int16_t); + case DataType::INT32: + return sizeof(int); + case DataType::INT64: + return sizeof(int64_t); + case DataType::FLOAT16: + return sizeof(paddle::platform::float16); + case DataType::FLOAT32: + return sizeof(float); + case DataType::FLOAT64: + return sizeof(double); + case DataType::COMPLEX64: + return sizeof(paddle::platform::complex); + case DataType::COMPLEX128: + return sizeof(paddle::platform::complex); + default: + return 0; + } +} + +DataType String2DataType(const std::string& str) { + if (str == "bool") { + return DataType::BOOL; + } else if (str == "float16") { + return DataType::FLOAT16; + } else if (str == "float32") { + return DataType::FLOAT32; + } else if (str == "float64") { + return DataType::FLOAT64; + } else if (str == "int8") { + return DataType::INT8; + } else if (str == "int16") { + return DataType::INT16; + } else if (str == "int32") { + return DataType::INT32; + } else if (str == "int64") { + return DataType::INT64; + } else if (str == "uint8") { + return DataType::UINT8; + } else if (str == "complex64") { + return DataType::COMPLEX64; + } else if (str == "complex128") { + return DataType::COMPLEX128; + } else { + return DataType::UNDEFINED; + } +} + +std::string DataType2String(DataType dtype) { + switch (dtype) { + case DataType::BOOL: + return "bool"; + case DataType::INT8: + return "int8"; + case DataType::UINT8: + return "uint8"; + case DataType::INT16: + return "int16"; + case DataType::INT32: + return "int32"; + case DataType::INT64: + return "int64"; + case DataType::FLOAT16: + return "float16"; + case DataType::FLOAT32: + return "float32"; + case DataType::FLOAT64: + return "float64"; + case DataType::COMPLEX64: + return "complex64"; + case DataType::COMPLEX128: + return "complex128"; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unknow pten::DataType, the int value = %d.", + static_cast(dtype))); + return ""; + } +} + +int TensorDtype2NumpyDtype(pten::DataType dtype) { + switch (dtype) { + case pten::DataType::BOOL: + return pybind11::detail::npy_api::NPY_BOOL_; + case pten::DataType::INT8: + return pybind11::detail::npy_api::NPY_INT8_; + case pten::DataType::UINT8: + return pybind11::detail::npy_api::NPY_UINT8_; + case pten::DataType::INT16: + return pybind11::detail::npy_api::NPY_INT16_; + case pten::DataType::INT32: + return pybind11::detail::npy_api::NPY_INT32_; + case pten::DataType::INT64: + return pybind11::detail::npy_api::NPY_INT64_; + case pten::DataType::FLOAT16: + return pybind11::detail::NPY_FLOAT16_; + case pten::DataType::FLOAT32: + return pybind11::detail::npy_api::NPY_FLOAT_; + case pten::DataType::FLOAT64: + return pybind11::detail::npy_api::NPY_DOUBLE_; + case pten::DataType::COMPLEX64: + return pybind11::detail::NPY_COMPLEX64; + case pten::DataType::COMPLEX128: + return pybind11::detail::NPY_COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unknow pten::DataType, the int value = %d.", + static_cast(dtype))); + return 0; + } +} + } // namespace pten diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/convert_utils.h index 
0b807c48bc1505..e5990eb0a89f03 100644 --- a/paddle/pten/core/convert_utils.h +++ b/paddle/pten/core/convert_utils.h @@ -45,4 +45,9 @@ paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout); paddle::framework::LoD TransToFluidLoD(const pten::LoD& lod); pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod); +size_t DataTypeSize(DataType dtype); +DataType String2DataType(const std::string& str); +std::string DataType2String(DataType dtype); +int TensorDtype2NumpyDtype(pten::DataType dtype); + } // namespace pten diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 5683750c4d8298..5482413dbbc5d1 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -55,6 +55,7 @@ from .initializer import set_global_initializer from . import layers from . import dygraph +from . import eager from . import contrib from . import nets from . import optimizer @@ -90,6 +91,7 @@ from .io import save, load, load_program_state, set_program_state from .dygraph.checkpoint import save_dygraph, load_dygraph from .dygraph.varbase_patch_methods import monkey_patch_varbase +from .eager.eager_tensor_patch_methods import monkey_patch_eagertensor from . import generator from .core import _cuda_synchronize from .generator import Generator @@ -113,6 +115,7 @@ 'contrib', 'data', 'dygraph', + 'eager', 'enable_dygraph', 'disable_dygraph', 'enable_imperative', @@ -211,6 +214,7 @@ def remove_flag_if_exists(name): monkey_patch_variable() __bootstrap__() monkey_patch_varbase() +monkey_patch_eagertensor() # NOTE(zhiqiu): register npu_finalize on the exit of Python, # do some clean up manually. diff --git a/python/paddle/fluid/eager/__init__.py b/python/paddle/fluid/eager/__init__.py new file mode 100644 index 00000000000000..1dc82ef69979c1 --- /dev/null +++ b/python/paddle/fluid/eager/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +# incubate directory is mainly for internal use +# after we have tested incubate APIs in industrial application for a period +# we will move stable functions into fluid + +from . import eager_tensor_patch_methods + +__all__ = [] diff --git a/python/paddle/fluid/eager/eager_tensor_patch_methods.py b/python/paddle/fluid/eager/eager_tensor_patch_methods.py new file mode 100644 index 00000000000000..206c5cf23e6dad --- /dev/null +++ b/python/paddle/fluid/eager/eager_tensor_patch_methods.py @@ -0,0 +1,23 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.core as core + + +def monkey_patch_eagertensor(): + def __str__(self): + from paddle.tensor.to_string import eager_tensor_to_string + return eager_tensor_to_string(self) + + setattr(core.eager.EagerTensor, "__str__", __str__) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 04042eac953ba2..ee7aa4560364e6 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -45,6 +45,8 @@ 'Program', 'default_startup_program', 'default_main_program', + 'eager_guard', + 'in_eager_mode', 'program_guard', 'name_scope', 'cuda_places', @@ -75,6 +77,21 @@ global_prog_seed = 0 _current_pipeline_stage = None _global_flags_ = core.globals() +_eager_mode_ = False + + +@signature_safe_contextmanager +def eager_guard(): + global _eager_mode_ + _eager_mode_ = True + try: + yield + finally: + _eager_mode_ = False + + +def in_eager_mode(): + return _eager_mode_ def require_version(min_version, max_version=None): @@ -340,7 +357,10 @@ def _set_dygraph_tracer_expected_place(place): def _set_expected_place(place): global _global_expected_place_ _global_expected_place_ = place - _set_dygraph_tracer_expected_place(place) + if in_eager_mode(): + return core.eager._set_expected_place(place) + else: + _set_dygraph_tracer_expected_place(place) # TODO(zhiqiu): remove this function. diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py new file mode 100644 index 00000000000000..c497c7f9bd80a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -0,0 +1,103 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
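+# The cases below exercise the eager pybind layer added in this patch:
+# EagerScaleTestCase drives core.eager.scale / retain_grad_for_tensor /
+# run_backward, EagerDtypeTestCase round-trips numpy arrays through
+# to_tensor for each supported dtype, and EagerTensorPropertiesTestCase
+# covers the properties bound in eager_properties.cc (name, shape,
+# persistable, place, stop_gradient).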
+ +import paddle.fluid.core as core +import paddle.fluid.eager.eager_tensor_patch_methods as eager_tensor_patch_methods +import paddle +import numpy as np +from paddle.fluid import eager_guard +import unittest + + +class EagerScaleTestCase(unittest.TestCase): + def test_scale_base(self): + with eager_guard(): + paddle.set_device("cpu") + arr = np.ones([4, 16, 16, 32]).astype('float32') + tensor = paddle.to_tensor(arr, 'float32', core.CPUPlace()) + print(tensor) + tensor = core.eager.scale(tensor, 2.0, 0.9, True, False) + for i in range(0, 100): + tensor = core.eager.scale(tensor, 2.0, 0.9, True, False) + print(tensor) + self.assertEqual(tensor.shape, [4, 16, 16, 32]) + self.assertEqual(tensor.stop_gradient, True) + + def test_retain_grad_and_run_backward(self): + with eager_guard(): + paddle.set_device("cpu") + + input_data = np.ones([4, 16, 16, 32]).astype('float32') + data_eager = paddle.to_tensor(input_data, 'float32', + core.CPUPlace(), False) + + grad_data = np.ones([4, 16, 16, 32]).astype('float32') + grad_eager = paddle.to_tensor(grad_data, 'float32', core.CPUPlace()) + + core.eager.retain_grad_for_tensor(data_eager) + + out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) + self.assertFalse(data_eager.grad._is_initialized()) + core.eager.run_backward([out_eager], [grad_eager], False) + self.assertTrue(data_eager.grad._is_initialized()) + self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) + + +class EagerDtypeTestCase(unittest.TestCase): + def check_to_tesnsor_and_numpy(self, dtype): + with eager_guard(): + arr = np.random.random([4, 16, 16, 32]).astype(dtype) + tensor = paddle.to_tensor(arr, dtype) + self.assertEqual(tensor.dtype, dtype) + self.assertTrue(np.array_equal(arr, tensor.numpy())) + + def test_dtype_base(self): + self.check_to_tesnsor_and_numpy('bool') + self.check_to_tesnsor_and_numpy('int8') + self.check_to_tesnsor_and_numpy('uint8') + self.check_to_tesnsor_and_numpy('int16') + self.check_to_tesnsor_and_numpy('int32') + self.check_to_tesnsor_and_numpy('int64') + self.check_to_tesnsor_and_numpy('float16') + self.check_to_tesnsor_and_numpy('float32') + self.check_to_tesnsor_and_numpy('float64') + self.check_to_tesnsor_and_numpy('complex64') + self.check_to_tesnsor_and_numpy('complex128') + + +class EagerTensorPropertiesTestCase(unittest.TestCase): + def test_properties(self): + with eager_guard(): + paddle.set_device("cpu") + arr = np.ones([4, 16, 16, 32]).astype('float32') + tensor = paddle.to_tensor(arr, 'float32', core.CPUPlace()) + self.assertEqual(tensor.shape, [4, 16, 16, 32]) + tensor.name = 'tensor_name_test' + self.assertEqual(tensor.name, 'tensor_name_test') + self.assertEqual(tensor.persistable, False) + tensor.persistable = True + self.assertEqual(tensor.persistable, True) + tensor.persistable = False + self.assertEqual(tensor.persistable, False) + self.assertTrue(tensor.place.is_cpu_place()) + self.assertEqual(tensor._place_str, 'CPUPlace') + self.assertEqual(tensor.stop_gradient, True) + tensor.stop_gradient = False + self.assertEqual(tensor.stop_gradient, False) + tensor.stop_gradient = True + self.assertEqual(tensor.stop_gradient, True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 72b6bd29fd9e78..812c7e8b5ac04d 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -31,6 +31,7 @@ from ..fluid.layers import linspace # noqa: F401 import paddle from paddle import _C_ops +from ..fluid.framework import 
in_eager_mode __all__ = [] @@ -115,6 +116,12 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): ) != _current_expected_place()._get_device_id(): place = _current_expected_place() + if in_eager_mode(): + if dtype is None: + dtype = paddle.get_default_dtype() + return core.eager.to_tensor(data, + convert_dtype(dtype), place, stop_gradient) + if not isinstance(data, np.ndarray): def _handle_dtype(data, dtype): diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index f640882893034d..6fd20457fe619a 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -255,3 +255,39 @@ def to_string(var, prefix='Tensor'): stop_gradient=var.stop_gradient, indent=' ' * indent, data=data) + + +def eager_tensor_to_string(tensor, prefix='Tensor'): + indent = len(prefix) + 1 + + _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" + + if not tensor._is_initialized(): + return "Tensor(Not initialized)" + + np_tensor = tensor.numpy() + + if len(tensor.shape) == 0: + size = 0 + else: + size = 1 + for dim in tensor.shape: + size *= dim + + sumary = False + if size > DEFAULT_PRINT_OPTIONS.threshold: + sumary = True + + max_width, signed = _get_max_width(_to_summary(np_tensor)) + + data = _format_tensor( + np_tensor, sumary, indent=indent, max_width=max_width, signed=signed) + + return _template.format( + prefix=prefix, + shape=tensor.shape, + dtype=tensor.dtype, + place=tensor._place_str, + stop_gradient=tensor.stop_gradient, + indent=' ' * indent, + data=data) diff --git a/python/setup.py.in b/python/setup.py.in index e01019ed7da778..5690fccf89dda3 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -307,6 +307,7 @@ packages=['paddle', 'paddle.fluid.dygraph', 'paddle.fluid.dygraph.dygraph_to_static', 'paddle.fluid.dygraph.amp', + 'paddle.fluid.eager', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.distributed', From 809ba964619c5e1959691b7b58dff98c4203f7f5 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 3 Dec 2021 11:08:02 +0800 Subject: [PATCH 056/124] add trt_mobilenet demo test in windows (#37667) * add trt_mobilenet demo test * only deliver trt_root in run.sh --- paddle/fluid/inference/api/demo_ci/run.sh | 32 ++++++++++++++++++----- paddle/scripts/paddle_build.bat | 2 +- paddle/scripts/paddle_build.sh | 3 +-- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index c0038f6c3f0387..d5452f82d08b50 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -19,9 +19,8 @@ PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset -TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include -TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib -MSVC_STATIC_CRT=$7 +TENSORRT_ROOT_DIR=$5 # TensorRT root dir, default to /usr +MSVC_STATIC_CRT=$6 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -39,7 +38,7 @@ else fi USE_TENSORRT=OFF -if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then +if [ -d "$TENSORRT_ROOT_DIR" ]; then USE_TENSORRT=ON fi @@ -132,6 +131,28 @@ for WITH_STATIC_LIB in ON OFF; do fi done done + + # --------tensorrt 
mobilenet on windows------ + if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + rm -rf * + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=trt_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln + Release/trt_mobilenet_demo.exe \ + --modeldir=$DATA_DIR/mobilenet/model \ + --data=$DATA_DIR/mobilenet/data.txt \ + --refer=$DATA_DIR/mobilenet/result.txt + if [ $? -ne 0 ]; then + echo "trt demo trt_mobilenet_demo runs fail." + exit 1 + fi + fi else # -----simple_on_word2vec on linux/mac----- rm -rf * @@ -183,8 +204,7 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ - -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 1f5810dc5bcdd8..39b78d2a4b8fb9 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -741,7 +741,7 @@ for /F %%i in ("%libsize%") do ( ) cd /d %work_dir%\paddle\fluid\inference\api\demo_ci -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT% +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %MSVC_STATIC_CRT% goto:eof :test_inference_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 5c46f011a74d15..a80caee028a2b4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2236,8 +2236,7 @@ EOF demo_ci_startTime_s=`date +%s` cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \ - ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \ - ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib} + ${TENSORRT_ROOT_DIR:-/usr} DEMO_EXIT_CODE=$? 
./clean.sh demo_ci_endTime_s=`date +%s` From c077de3cf9b702c70c4b305fdaf4b5629f886fe4 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 3 Dec 2021 14:58:58 +0800 Subject: [PATCH 057/124] Rearranged Eager AutoCodeGen directory structure (#37812) * Rearranged Eager AutoCodeGen directory structure * Removed USE_OP in Eager AutoCodeGen --- .../auto_code_generator/eager_generator.cc | 178 ++++++++++-------- .../generate_file_structures.py | 32 ++-- .../performance_tests/benchmark_eager_cpu.cc | 6 + .../performance_tests/benchmark_eager_cuda.cc | 7 + .../performance_tests/benchmark_fluid_cpu.cc | 1 + .../performance_tests/benchmark_fluid_cuda.cc | 1 + .../eager/tests/task_tests/generated_test.cc | 35 ++++ 7 files changed, 166 insertions(+), 94 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 283153585866ab..0d66d8d96a9b49 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -30,26 +30,63 @@ DEFINE_bool(generate_all, false, "Generate all operators currently registered in Paddle"); +static std::unordered_map + operators_with_attrs = {}; + static std::unordered_set operators_to_skip = { - "fused_elemwise_add_activation", // No Default Attr - "fused_elemwise_activation", // No Default Attr - "reverse", // Attr Error - "flip", // Attr Error - "cast", // Attr Error - "sum", - "minus", // Multiple ops_ - "pull_sparse", - "pull_box_extended_sparse", - "pull_sparse_v2", - "pull_box_sparse", - "fused_attention", - "diag_v2", - "transfer_dtype", + "pull_sparse", "pull_box_extended_sparse", "pull_sparse_v2", + "pull_box_sparse", "fused_attention", "diag_v2", "c_split"}; static std::unordered_set operators_to_codegen = {}; static std::unordered_set skipped_operators = {}; +static void PrepareAttrMapForOps() { + // Handle "fused_elemwise_add_activation" + std::vector functor_list = {"a", "b"}; + operators_with_attrs["fused_elemwise_add_activation"] = {}; + operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = + functor_list; + + // Handle "fused_elemwise_activation" + operators_with_attrs["fused_elemwise_activation"] = {}; + operators_with_attrs["fused_elemwise_activation"]["functor_list"] = + functor_list; + + // Handle "reverse" + std::vector axis = {0}; + operators_with_attrs["reverse"] = {}; + operators_with_attrs["reverse"]["axis"] = axis; + + // Handle "flip" + operators_with_attrs["flip"] = {}; + operators_with_attrs["flip"]["axis"] = axis; + + // Handle "cast" + operators_with_attrs["cast"] = {}; + operators_with_attrs["cast"]["out_dtype"] = 5; + operators_with_attrs["cast"]["in_dtype"] = 5; + + // Handle "transfer_dtype" + operators_with_attrs["transfer_dtype"] = {}; + operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; + operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; +} + +static void CollectOperatorsToCodeGen(const std::string& op_list_path) { + std::string line; + std::ifstream op_list_file(op_list_path); + if (op_list_file.is_open()) { + while (getline(op_list_file, line)) { + operators_to_codegen.insert(line); + } + op_list_file.close(); + } else { + PADDLE_THROW( + paddle::platform::errors::Fatal("Unable to open op_list.txt file")); + } +} + namespace paddle { namespace framework { @@ -573,12 +610,21 @@ static bool CollectInformationFromOpInfo( paddle::framework::AttributeMap default_attrs; auto* attr_checker = op_info.Checker(); if (attr_checker) { + VLOG(6) << "Checking 
AttributeMap Settings"; attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); default_attrs = attr_checker->GetDefaultAttrMap(); + VLOG(6) << "AttributeMap Checking Passed"; } else { VLOG(6) << "Detected Null Attribute Checker, use empty default_attrs"; } + if (operators_with_attrs.count(op_type)) { + VLOG(6) << "Found operator " << op_type << " using special AttributeMap"; + attrs = operators_with_attrs[op_type]; + // default_attrs.insert(operators_with_attrs[op_type].begin(), + // operators_with_attrs[op_type].end()); + } + VLOG(6) << "Prepared Default Attributes Map, size = " << default_attrs.size(); /* ---------------------------- */ @@ -851,18 +897,6 @@ static std::string GenerateGradNodeCreationContent( return grad_node_creation_body_str; } -static std::string AppendUseOp(const std::string& op_type) { - // [Generation] Append USE_OP - const char* USE_OP_TEMPLATE = "USE_OP(%s);\n"; - std::string return_str = paddle::string::Sprintf(USE_OP_TEMPLATE, op_type); - - // Special Ops - if (op_type == "reduce_sum") - return_str += paddle::string::Sprintf(USE_OP_TEMPLATE, "reduce_sum_grad"); - - return return_str; -} - /* -------------------------------- */ /* --------- CodeGen: Forward ----- */ /* -------------------------------- */ @@ -1110,9 +1144,6 @@ static std::pair GenerateForwardFunctionContents( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, dygraph_function_args_str, generated_function_body); - // [Generation] Append USE_OP - fwd_function_str += AppendUseOp(op_type); - // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; std::string dygraph_function_declaration_str = paddle::string::Sprintf( @@ -1480,34 +1511,31 @@ static void GenerateForwardHFile(const std::string& output_dir, forward_header_stream.close(); } -static void GenerateForwardDygraphFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateForwardDygraphFile(const std::string& output_dir, const std::string& fwd_function_str) { std::string forwards_dir = output_dir + "/forwards/"; - std::string node_h_filename = op_type + "_node.h"; - std::string forward_cc_filename = op_type + "_dygraph.cc"; + std::string forward_cc_filename = "dygraph_forward_functions.cc"; std::string forward_cc_path = forwards_dir + forward_cc_filename; const char* FORWARD_INCLUDE_TEMPLATE = "#include " "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/%s\"\n\n" + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" "#include \"paddle/fluid/eager/legacy/op_runner.h\"\n"; std::string forward_cc_include_str = - paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE, node_h_filename); + paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); forward_cc_stream << forward_cc_include_str; forward_cc_stream << fwd_function_str; forward_cc_stream.close(); } -static void GenerateNodeHFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateNodeHFile(const std::string& output_dir, const std::string& grad_node_str) { std::string nodes_dir = output_dir + "/nodes/"; - std::string node_h_filename = op_type + "_node.h"; + std::string node_h_filename = "nodes.h"; std::string node_h_path = nodes_dir + node_h_filename; std::string node_h_include_str = "#pragma once\n" @@ -1520,12 +1548,10 @@ 
static void GenerateNodeHFile(const std::string& op_type, node_h_stream.close(); } -static void GenerateNodeCCFile(const std::string& op_type, - const std::string& output_dir, +static void GenerateNodeCCFile(const std::string& output_dir, const std::string& grad_function_str) { std::string nodes_dir = output_dir + "/nodes/"; - std::string node_h_filename = op_type + "_node.h"; - std::string node_cc_filename = op_type + "_node.cc"; + std::string node_cc_filename = "nodes.cc"; std::string node_cc_path = nodes_dir + node_cc_filename; const char* NODE_CC_INCLUDE_TEMPLATE = "#include \"glog/logging.h\"\n" @@ -1535,9 +1561,9 @@ static void GenerateNodeCCFile(const std::string& op_type, "#include \"paddle/fluid/eager/utils.h\"\n" "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/%s\"\n\n"; + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n"; std::string node_cc_include_str = - paddle::string::Sprintf(NODE_CC_INCLUDE_TEMPLATE, node_h_filename); + paddle::string::Sprintf(NODE_CC_INCLUDE_TEMPLATE); std::ofstream node_cc_stream(node_cc_path, std::ios::out); node_cc_stream << node_cc_include_str; node_cc_stream << grad_function_str; @@ -1558,6 +1584,9 @@ static std::string GenerateDygraphHFileIncludes() { static void DygraphCodeGeneration(const std::string& output_dir) { std::string dygraph_forward_api_str = GenerateDygraphHFileIncludes(); + std::string fwd_function_str = ""; + std::string grad_node_h_str = ""; + std::string grad_node_cc_str = ""; auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); @@ -1603,7 +1632,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* --------------------------- */ /* --------- CodeGen --------- */ /* --------------------------- */ - /* ---- xxx_dygraph.cc ---- */ + /* ---- forward_dygraph_functions.cc ---- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = GenerateForwardFunctionContents( @@ -1611,56 +1640,53 @@ static void DygraphCodeGeneration(const std::string& output_dir) { grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, op_type, in_vars, out_vars); - std::string fwd_function_str = body_and_declaration.first; - GenerateForwardDygraphFile(op_type, output_dir, fwd_function_str); + fwd_function_str += body_and_declaration.first + "\n"; /* ---- dygraph_forward_api.h ---- */ std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; - /* ---- xxx_node.h ---- */ + /* ---- nodes.h ---- */ VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; - std::string grad_node_h_str = GenerateGradNodeHeaderContents( - grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); - GenerateNodeHFile(op_type, output_dir, grad_node_h_str); + grad_node_h_str += + GenerateGradNodeHeaderContents(grad_ins_fwd_slotname_map, op_type, + in_vars, out_vars) + + "\n"; - /* ---- xxx_node.cc ---- */ + /* ---- nodes.cc ---- */ VLOG(6) << "-------- GenerateGradNodeCCContents -------"; - std::string grad_node_cc_str = GenerateGradNodeCCContents( - grad_op_types, fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, - grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, - grad_outs_slotname_map, grad_ins, grad_outs, op_type, in_vars, - out_vars); - GenerateNodeCCFile(op_type, output_dir, grad_node_cc_str); - - VLOG(6) << op_type << ": Finished Generation"; + grad_node_cc_str += 
GenerateGradNodeCCContents( + grad_op_types, fwd_inputs_name_pos_map, + fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, + grad_ins_grad_slotname_map, grad_outs_slotname_map, + grad_ins, grad_outs, op_type, in_vars, out_vars) + + "\n"; + + VLOG(6) << op_type << ": Finished Generating Op: " << op_type; } + /* ---- dygraph_forward_function.cc ---- */ + VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; + GenerateForwardDygraphFile(output_dir, fwd_function_str); /* ---- dygraph_forward_api.h ---- */ VLOG(6) << "-------- GenerateForwardHFile -------"; GenerateForwardHFile(output_dir, dygraph_forward_api_str); + + /* ---- nodes.h ---- */ + VLOG(6) << "-------- GenerateNodeHFile -------"; + GenerateNodeHFile(output_dir, grad_node_h_str); + + /* ---- nodes.cc ---- */ + VLOG(6) << "-------- GenerateNodeCCFile -------"; + GenerateNodeCCFile(output_dir, grad_node_cc_str); } } // namespace framework } // namespace paddle -static void CollectOperatorsToCodeGen(const std::string& op_list_path) { - std::string line; - std::ifstream op_list_file(op_list_path); - if (op_list_file.is_open()) { - while (getline(op_list_file, line)) { - operators_to_codegen.insert(line); - } - op_list_file.close(); - } else { - PADDLE_THROW( - paddle::platform::errors::Fatal("Unable to open op_list.txt file")); - } -} - int main(int argc, char* argv[]) { if (argc != 3) { - std::cerr << "argc must be 2" << std::endl; + std::cerr << "argc must be 3" << std::endl; return -1; } @@ -1668,6 +1694,8 @@ int main(int argc, char* argv[]) { std::string op_list_path = argv[2]; CollectOperatorsToCodeGen(op_list_path); + PrepareAttrMapForOps(); + paddle::framework::DygraphCodeGeneration(eager_root); return 0; diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index af6cf2cec0246b..56ec287561c564 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -18,12 +18,6 @@ if __name__ == "__main__": assert len(sys.argv) == 2 eager_dir = sys.argv[1] - - op_list = [] - with open(f"{eager_dir}/auto_code_generator/op_list.txt", "r") as f: - for line in f: - line = str(line.strip()) - op_list.append(line) """ paddle/fluid/eager |- generated @@ -31,15 +25,15 @@ | | "add_subdirectory(forwards), add_subdirectory(nodes)" | | |- forwards - | |- op_name + "_dygraph.cc" + | |- "dygraph_forward_functions.cc" | |- CMakeLists.txt - | | "cc_library(dygraph_function SRCS op_name+"_dygraph.cc" DEPS ${eager_deps} ${fluid_deps} GLOB_OP_LIB)" + | | "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} GLOB_OP_LIB)" | | |- nodes - | |- op_name + "_node.cc" - | |- op_name + "_node.h" + | |- "nodes.cc" + | |- "nodes.h" | |- CMakeLists.txt - | | "cc_library(dygraph_node SRCS op_name+"_node.cc" DEPS ${eager_deps} ${fluid_deps})" + | | "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})" | | |- dygraph_forward_api.h """ @@ -56,10 +50,10 @@ dygraph_forward_api_h_path = os.path.join(generated_dir, "dygraph_forward_api.h") empty_files = [dygraph_forward_api_h_path] - for op_name in op_list: - empty_files.append(os.path.join(forwards_dir, op_name + "_dygraph.cc")) - empty_files.append(os.path.join(nodes_dir, op_name + "_node.cc")) - empty_files.append(os.path.join(nodes_dir, op_name + "_node.h")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions.cc")) + 
empty_files.append(os.path.join(nodes_dir, "nodes.cc")) + empty_files.append(os.path.join(nodes_dir, "nodes.h")) for path in empty_files: if not os.path.exists(path): @@ -73,14 +67,14 @@ with open(nodes_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_node SRCS %s DEPS ${eager_deps} ${fluid_deps})\n" - % " ".join([op_name + '_node.cc' for op_name in op_list])) + "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})\n" + ) f.write("add_dependencies(dygraph_node eager_codegen)") with open(forwards_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_function SRCS %s DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB})\n" - % " ".join([op_name + '_dygraph.cc' for op_name in op_list])) + "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" + ) f.write("add_dependencies(dygraph_function eager_codegen)") with open(generated_level_cmakelist_path, "w") as f: diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 83185dff9b7812..c100e3b70f3842 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -14,6 +14,7 @@ // Eager Dygraph +#include #include #include "gtest/gtest.h" @@ -178,3 +179,8 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { } } } + +USE_OP(scale); +USE_OP(elementwise_add); +USE_OP(matmul_v2); +USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 9fbed054183029..c8f4b1b32e453d 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -13,6 +13,7 @@ // limitations under the License. 
// Eager Dygraph +#include #include #include "gtest/gtest.h" @@ -188,4 +189,10 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { } } +USE_OP(scale); +USE_OP(matmul_v2); +USE_OP(reduce_sum); +USE_OP(reduce_sum_grad); +USE_OP(elementwise_add); + #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index d98000b71fd2a3..68e7512eedbde3 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -217,5 +217,6 @@ TEST(Benchmark, FluidMLPCPU) { } // namespace paddle USE_OP(scale); +USE_OP(elementwise_add); USE_OP(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 918ebadba0a4c8..50423b5a64fcf9 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -254,5 +254,6 @@ USE_OP(scale); USE_OP(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); +USE_OP(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 9d6e3310678345..a06091247bf7ae 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -89,4 +89,39 @@ TEST(Generated, Matmul_v2) { eager_test::CompareGradVariableWithValue(Y, 3.0 * 4); } +TEST(Generated, ElementwiseAdd) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + auto tracer = std::make_shared(); + paddle::imperative::SetCurrentTracer(tracer); + + // 1. 
Prepare Input + paddle::framework::DDim ddimX = paddle::framework::make_ddim({4, 16}); + egr::EagerTensor X = egr_utils_api::CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 3.0, true); + egr_utils_api::RetainGradForTensor(X); + + paddle::framework::DDim ddimY = paddle::framework::make_ddim({4, 16}); + egr::EagerTensor Y = egr_utils_api::CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 2.0, true); + egr_utils_api::RetainGradForTensor(Y); + + auto output_tensor = elementwise_add_dygraph_function(X, Y, {}); + + eager_test::CompareVariableWithValue(output_tensor, 5); + + std::vector target_tensors = {output_tensor}; + RunBackward(target_tensors, {}); + + eager_test::CompareGradVariableWithValue(X, 1.0); + eager_test::CompareGradVariableWithValue(Y, 1.0); +} + } // namespace egr + +USE_OP(sigmoid); +USE_OP(elementwise_add); +USE_OP(matmul_v2); From 7e9b20b5ec3a6b1941683df79b3bd115c49516a5 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 3 Dec 2021 15:02:16 +0800 Subject: [PATCH 058/124] fix python command in cmake of api-gen (#37818) --- paddle/pten/api/lib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index ed2ad801283f5e..189548880694d9 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -24,7 +24,7 @@ set(api_source_file_tmp ${api_source_file}.tmp) add_custom_command( OUTPUT ${api_header_file} ${api_source_file} - COMMAND python ${api_gen_file} + COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} --api_header_path ${api_header_file_tmp} --api_source_path ${api_source_file_tmp} From b65708a812cac2f3cf4e6d8ed9e03a5dd8278a91 Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 3 Dec 2021 15:24:58 +0800 Subject: [PATCH 059/124] update ci check_op_desc to support op_version and op_compat. (#37600) * update check_op_desc to support op_version and op_compat. --- tools/check_api_approvals.sh | 16 +++- tools/check_op_desc.py | 157 +++++++++++++++++++++++++++++------ tools/print_op_desc.py | 13 ++- 3 files changed, 157 insertions(+), 29 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index dcbe853d8a1bcc..45d4731ba1dbac 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -76,9 +76,21 @@ if [ "$op_type_spec_diff" != "" ]; then fi op_desc_diff=`python ${PADDLE_ROOT}/tools/check_op_desc.py ${PADDLE_ROOT}/paddle/fluid/OP_DESC_DEV.spec ${PADDLE_ROOT}/paddle/fluid/OP_DESC_PR.spec` +inference_approve=`echo "$op_desc_diff" | grep "need inference to review" -` +slim_approve=`echo "$op_desc_diff" | grep "need slim to review" -` if [ "$op_desc_diff" != "" ]; then - echo_line="You must have one RD (cyj1986, Superjomn) approval for the changes of Inputs/Output/Attrs of OPs. The changes of OPs will cause that the new version inference fails to load model trained by the old version. Please modify your code. \n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${op_desc_diff}\n" - check_approval 1 39645414 328693 + echo_line="You must have one RD (inference[ Superjomn(Recommend), Shixiaowei02, cyj1986 ] or slim[ wanghaoshuang(Recommend), qingqing01 ]) approval for the changes of Inputs/Output/Attrs of OPs. 
The changes of OPs will cause that the new version inference fails to load model trained by the old version. Please modify your code. \n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${op_desc_diff}\n" + check_approval 1 39645414 328693 39303645 7534971 7845005 +fi + +if [ "$slim_approve" != "" ]; then + echo_line="You must have one RD (wanghaoshuang(Recommend), qingqing01) approval for the changes of `quant` Inputs/Output/Attrs of OPs. \n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${slim_approve}\n" + check_approval 1 7534971 7845005 +fi + +if [ "$inference_approve" != "" ]; then + echo_line="You must have one RD (Superjomn(Recommend), Shixiaowei02, cyj1986) approval for the changes of `def` Inputs/Output/Attrs of OPs. \n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${inference_approve}\n" + check_approval 1 39645414 328693 39303645 fi DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_DEV.spec diff --git a/tools/check_op_desc.py b/tools/check_op_desc.py index 78abb6f36c6062..19984a55a41af4 100644 --- a/tools/check_op_desc.py +++ b/tools/check_op_desc.py @@ -40,6 +40,11 @@ GENERATED = "generated" DEFAULT_VALUE = "default_value" +# add_with_extra, add_with_quant and add_with_def +EXTRA = "extra" +QUANT = "quant" +DEF = "def" + error = False version_update_map = { @@ -64,6 +69,9 @@ def diff_vars(origin_vars, new_vars): var_add_dispensable_massage = [] var_deleted_error_massage = [] + var_add_quant_message = [] + var_add_def_message = [] + common_vars_name = set(origin_vars.keys()) & set(new_vars.keys()) vars_name_only_in_origin = set(origin_vars.keys()) - set(new_vars.keys()) vars_name_only_in_new = set(new_vars.keys()) - set(origin_vars.keys()) @@ -73,11 +81,12 @@ def diff_vars(origin_vars, new_vars): continue else: error, var_error = True, True - var_changed_error_massage[var_name] = {} for arg_name in origin_vars.get(var_name): new_arg_value = new_vars.get(var_name, {}).get(arg_name) origin_arg_value = origin_vars.get(var_name, {}).get(arg_name) if new_arg_value != origin_arg_value: + if var_name not in var_changed_error_massage.keys(): + var_changed_error_massage[var_name] = {} var_changed_error_massage[var_name][arg_name] = ( origin_arg_value, new_arg_value) @@ -91,6 +100,21 @@ def diff_vars(origin_vars, new_vars): error, var_error = True, True var_add_dispensable_massage.append(var_name) + # if added var is extra, then no need to check. + if new_vars.get(var_name).get(EXTRA): + continue + + # if added var is quant, slim needs to review, needs to register. + if new_vars.get(var_name).get(QUANT): + error, var_error = True, True + var_add_quant_message.append(var_name) + + # if added var is def, inference needs to review, needs to register. 
+ if not new_vars.get(var_name).get(EXTRA) and not new_vars.get( + var_name).get(QUANT): + error, var_error = True, True + var_add_def_message.append(var_name) + var_diff_message = {} if var_add_massage: var_diff_message[ADD] = var_add_massage @@ -100,6 +124,10 @@ def diff_vars(origin_vars, new_vars): var_diff_message[CHANGE] = var_changed_error_massage if var_deleted_error_massage: var_diff_message[DELETE] = var_deleted_error_massage + if var_add_quant_message: + var_diff_message[QUANT] = var_add_quant_message + if var_add_def_message: + var_diff_message[DEF] = var_add_def_message return var_error, var_diff_message @@ -113,6 +141,9 @@ def diff_attr(ori_attrs, new_attrs): attr_added_def_error_massage = [] attr_deleted_error_massage = [] + attr_added_quant_message = [] + attr_added_define_message = [] + common_attrs = set(ori_attrs.keys()) & set(new_attrs.keys()) attrs_only_in_origin = set(ori_attrs.keys()) - set(new_attrs.keys()) attrs_only_in_new = set(new_attrs.keys()) - set(ori_attrs.keys()) @@ -122,11 +153,12 @@ def diff_attr(ori_attrs, new_attrs): continue else: error, attr_error = True, True - attr_changed_error_massage[attr_name] = {} for arg_name in ori_attrs.get(attr_name): new_arg_value = new_attrs.get(attr_name, {}).get(arg_name) origin_arg_value = ori_attrs.get(attr_name, {}).get(arg_name) if new_arg_value != origin_arg_value: + if attr_name not in attr_changed_error_massage.keys(): + attr_changed_error_massage[attr_name] = {} attr_changed_error_massage[attr_name][arg_name] = ( origin_arg_value, new_arg_value) @@ -140,6 +172,17 @@ def diff_attr(ori_attrs, new_attrs): error, attr_error = True, True attr_added_def_error_massage.append(attr_name) + # if added attr is quant, slim needs to review, needs to register + if new_attrs.get(attr_name).get(QUANT): + error, var_error = True, True + attr_added_quant_message.append(attr_name) + + # if added attr is def, inference needs to review, needs to register + if not new_attrs.get(attr_name).get(EXTRA) and not new_attrs.get( + attr_name).get(QUANT): + error, var_error = True, True + attr_added_define_message.append(attr_name) + attr_diff_message = {} if attr_added_error_massage: attr_diff_message[ADD] = attr_added_error_massage @@ -149,6 +192,10 @@ def diff_attr(ori_attrs, new_attrs): attr_diff_message[CHANGE] = attr_changed_error_massage if attr_deleted_error_massage: attr_diff_message[DELETE] = attr_deleted_error_massage + if attr_added_define_message: + attr_diff_message[DEF] = attr_added_define_message + if attr_added_quant_message: + attr_diff_message[QUANT] = attr_added_quant_message return attr_error, attr_diff_message @@ -157,23 +204,49 @@ def check_io_registry(io_type, op, diff): checker = OpLastCheckpointChecker() results = {} for update_type in [ADD]: - for item in diff.get(update_type, {}): + for item in diff.get(update_type, []): infos = checker.filter_updates( op, version_update_map[io_type][update_type], item) if not infos: - results[update_type] = (op, item, io_type) + if update_type not in results.keys(): + results[update_type] = [] + # extra not need to register. 
+ qaunt_ios = diff.get(QUANT, []) + def_ios = diff.get(DEF, []) + if item in qaunt_ios or item in def_ios: + results[update_type].append((op, item, io_type)) + return results -def check_attr_registry(op, diff): +def check_attr_registry(op, diff, origin_attrs): checker = OpLastCheckpointChecker() results = {} + qaunt_attrs = diff.get(QUANT, []) + def_attrs = diff.get(DEF, []) + change_attrs = diff.get(CHANGE, {}) for update_type in [ADD, CHANGE]: for item in diff.get(update_type, {}): infos = checker.filter_updates( op, version_update_map[ATTRS][update_type], item) if not infos: - results[update_type] = (op, item) + if update_type == ADD: + if update_type not in results.keys(): + results[update_type] = [] + # extra not need to register. + if item in qaunt_attrs or item in def_attrs: + results[update_type].append((op, item)) + elif update_type == CHANGE: + if CHANGE not in results.keys(): + results[update_type] = {} + for attr_name, attr_change in change_attrs.items(): + # extra not need to register. + if not origin_attrs.get(attr_name).get(EXTRA): + results[update_type][attr_name] = attr_change + + for update_type in [ADD, CHANGE]: + if update_type in results.keys() and len(results[update_type]) == 0: + del results[update_type] return results @@ -206,13 +279,14 @@ def compare_op_desc(origin_op_desc, new_op_desc): origin_attrs = origin_info.get(ATTRS, {}) new_attrs = new_info.get(ATTRS, {}) attrs_error, attrs_diff = diff_attr(origin_attrs, new_attrs) - attrs_version_errors = check_attr_registry(op_type, attrs_diff) + attrs_version_errors = check_attr_registry(op_type, attrs_diff, + origin_attrs) - if ins_error: + if ins_diff: desc_error_message.setdefault(op_type, {})[INPUTS] = ins_diff - if outs_error: + if outs_diff: desc_error_message.setdefault(op_type, {})[OUTPUTS] = outs_diff - if attrs_error: + if attrs_diff: desc_error_message.setdefault(op_type, {})[ATTRS] = attrs_diff if ins_version_errors: @@ -250,6 +324,14 @@ def print_desc_error_message(error_message): " * The arg '{}' of Input '{}' is changed: from '{}' to '{}'.". format(arg, name, ori_value, new_value)) + for name in Inputs_error.get(QUANT, {}): + print(" * The added Input '{}' is `quant`, need slim to review.". + format(name)) + + for name in Inputs_error.get(DEF, {}): + print(" * The added Input '{}' is `def`, need inference to review.". + format(name)) + # 2. print outputs error message Outputs_error = error_message.get(op_name, {}).get(OUTPUTS, {}) for name in Outputs_error.get(ADD_DISPENSABLE, {}): @@ -266,6 +348,15 @@ def print_desc_error_message(error_message): " * The arg '{}' of Output '{}' is changed: from '{}' to '{}'.". format(arg, name, ori_value, new_value)) + for name in Outputs_error.get(QUANT, {}): + print(" * The added Output '{}' is `quant`, need slim to review.". + format(name)) + + for name in Outputs_error.get(DEF, {}): + print( + " * The added Output '{}' is `def`, need inference to review.". + format(name)) + # 3. print attrs error message attrs_error = error_message.get(op_name, {}).get(ATTRS, {}) for name in attrs_error.get(ADD_WITH_DEFAULT, {}): @@ -283,6 +374,16 @@ def print_desc_error_message(error_message): " * The arg '{}' of attr '{}' is changed: from '{}' to '{}'.". format(arg, name, ori_value, new_value)) + for name in attrs_error.get(QUANT, {}): + # TODO(Wilber): + print(" * The added attr '{}' is `quant`, need slim to review.". + format(name)) + + for name in attrs_error.get(DEF, {}): + # TODO(Wilber): + print(" * The added attr '{}' is `def`, need inference to review.". 
+ format(name)) + def print_version_error_message(error_message): print( @@ -294,28 +395,32 @@ def print_version_error_message(error_message): # 1. print inputs error message inputs_error = error_message.get(op_name, {}).get(INPUTS, {}) - tuple = inputs_error.get(ADD, {}) - if tuple: - print(" * The added input '{}' is not yet registered.".format(tuple[ - 1])) + error_list = inputs_error.get(ADD, []) + if error_list: + for tup in error_list: + print(" * The added input '{}' is not yet registered.".format( + tup[1])) - # 2. print inputs error message + # 2. print outputs error message outputs_error = error_message.get(op_name, {}).get(OUTPUTS, {}) - tuple = outputs_error.get(ADD, {}) - if tuple: - print(" * The added output '{}' is not yet registered.".format( - tuple[1])) + error_list = outputs_error.get(ADD, []) + if error_list: + for tup in error_list: + print(" * The added output '{}' is not yet registered.".format( + tup[1])) #3. print attrs error message attrs_error = error_message.get(op_name, {}).get(ATTRS, {}) - tuple = attrs_error.get(ADD, {}) - if tuple: - print(" * The added attribute '{}' is not yet registered.".format( - tuple[1])) - tuple = attrs_error.get(CHANGE, {}) - if tuple: + error_list = attrs_error.get(ADD, []) + if error_list: + for tup in error_list: + print(" * The added attribute '{}' is not yet registered.". + format(tup[1])) + error_dic = error_message.get(op_name, {}).get(ATTRS, {}).get(CHANGE, + {}) + for key, val in error_dic.items(): print(" * The change of attribute '{}' is not yet registered.". - format(tuple[1])) + format(key)) def print_repeat_process(): diff --git a/tools/print_op_desc.py b/tools/print_op_desc.py index 64445bab3a62c5..b85103a7a25e16 100644 --- a/tools/print_op_desc.py +++ b/tools/print_op_desc.py @@ -18,7 +18,9 @@ {input_name1: {DISPENSABLE: bool, INTERMEDIATE: bool, - DUPLICABLE: bool + DUPLICABLE: bool, + EXTRA: bool, + QUANT: bool, }, input_name2:{} }, @@ -28,6 +30,8 @@ {TYPE: int, GENERATED: bool, DEFAULT_VALUE: int/str/etc, + EXTRA: bool, + QUANT: bool, } } } @@ -55,6 +59,9 @@ GENERATED = "generated" DEFAULT_VALUE = "default_value" +EXTRA = "extra" +QUANT = "quant" + def get_attr_default_value(op_name): return core.get_op_attrs_default_value(cpt.to_bytes(op_name)) @@ -68,6 +75,8 @@ def get_vars_info(op_vars_proto): vars_info[name][DUPLICABLE] = var_proto.duplicable vars_info[name][DISPENSABLE] = var_proto.dispensable vars_info[name][INTERMEDIATE] = var_proto.intermediate + vars_info[name][EXTRA] = var_proto.extra + vars_info[name][QUANT] = var_proto.quant return vars_info @@ -81,6 +90,8 @@ def get_attrs_info(op_proto, op_attrs_proto): attrs_info[attr_name][GENERATED] = attr_proto.generated attrs_info[attr_name][DEFAULT_VALUE] = attrs_default_values[ attr_name] if attr_name in attrs_default_values else None + attrs_info[attr_name][EXTRA] = attr_proto.extra + attrs_info[attr_name][QUANT] = attr_proto.quant return attrs_info From 0633e14dbf9bac4e03bdba12f5cb6a3f1d1de238 Mon Sep 17 00:00:00 2001 From: WangXi Date: Fri, 3 Dec 2021 16:59:16 +0800 Subject: [PATCH 060/124] [fleet_executor] set pipeline 1f1b buffer size (#37807) --- .../fleet_executor/compute_interceptor.cc | 21 ++--- .../fleet_executor/runtime_graph.cc | 77 ++++++++++++++----- .../distributed/fleet_executor/task_node.cc | 12 +-- .../distributed/fleet_executor/task_node.h | 18 +++-- .../test/compute_interceptor_test.cc | 4 +- .../interceptor_pipeline_short_path_test.cc | 33 ++++++-- .../test_fleet_executor_task_node.py | 5 +- 7 files changed, 113 insertions(+), 57 
deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 0c0411a035fb36..084e91c11caa73 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -27,19 +27,15 @@ ComputeInterceptor::ComputeInterceptor(int64_t interceptor_id, TaskNode* node) } void ComputeInterceptor::PrepareDeps() { - auto& upstream = GetTaskNode()->upstream(); - auto& downstream = GetTaskNode()->downstream(); + auto& upstream = node_->upstream(); + auto& downstream = node_->downstream(); - // TODO(wangxi): get from task node - int64_t in_buff_size = std::numeric_limits::max(); - int64_t out_buff_size = 2; - - for (auto up_id : upstream) { - in_readys_.emplace(up_id, std::make_pair(in_buff_size, 0)); - in_stops_.emplace(up_id, false); + for (auto up : upstream) { + in_readys_.emplace(up.first, std::make_pair(up.second, 0)); + in_stops_.emplace(up.first, false); } - for (auto down_id : downstream) { - out_buffs_.emplace(down_id, std::make_pair(out_buff_size, 0)); + for (auto down : downstream) { + out_buffs_.emplace(down.first, std::make_pair(down.second, 0)); } // source compute node, should we add a new SourceInterceptor? @@ -114,8 +110,7 @@ bool ComputeInterceptor::CanWriteOutput() { // only source node need reset bool ComputeInterceptor::ShouldReset() { - if (is_source_ && step_ == node_->max_run_times()) return true; - return false; + return is_source_ && (step_ == node_->max_run_times()); } void ComputeInterceptor::SendDataReadyToDownStream() { diff --git a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc index 19afdf7441257f..9999956223ab15 100644 --- a/paddle/fluid/distributed/fleet_executor/runtime_graph.cc +++ b/paddle/fluid/distributed/fleet_executor/runtime_graph.cc @@ -150,12 +150,14 @@ void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) { } role_to_ops.at(new_op_role_id).emplace_back(op.get()); } + int64_t cur_rank = exe_desc_.cur_rank(); DistCoordSys coord_sys(exe_desc_.dp_degree(), exe_desc_.pp_degree(), exe_desc_.mp_degree()); const auto& coord = coord_sys.RankToCoord(cur_rank); int pipeline_stage = coord.pp_idx; int64_t num_pipeline_stages = exe_desc_.pp_degree(); + // TODO(fleet_executor dev): start up steps should be a config `num_slots` int64_t start_up_steps = num_pipeline_stages - pipeline_stage; int64_t num_micro_batches = exe_desc_.num_micro_batches(); @@ -199,36 +201,69 @@ void RuntimeGraph::FakeDependence() { downstream_coord.pp_idx += 1; int64_t pp_upstream = coord_sys.CoordToRank(upstream_coord); int64_t pp_downstream = coord_sys.CoordToRank(downstream_coord); + bool is_first_stage = (pp_upstream == -1); + bool is_last_stage = (pp_downstream == -1); + int32_t num_of_functionality = functionality_order.size(); - // lr -> forward -> backward -> optimize - // | | - // lr -> forward -> backward -> optimize + // lr(1:m) -> forward -> backward -> (m:1)optimize + // ↑ ↓ + // lr(1:m) -> forward -> backward -> (m:1)optimize + // ↑ ↓ + // lr(1:m) -> forward -> backward -> (m:1)optimize for (std::size_t i = 0; i < task_nodes_.size(); ++i) { - if (i != 0) { - task_nodes_[i]->AddUpstreamTask(cur_rank * num_of_functionality + i - 1); + auto& node = task_nodes_[i]; + bool is_forward = IsForward(node->role()); + bool is_backward = IsBackward(node->role()); + + int64_t cur_id = cur_rank * num_of_functionality + i; + int64_t 
prev_id = cur_id - 1; + int64_t next_id = cur_id + 1; + + int64_t upstream_id = pp_upstream * num_of_functionality + i; + int64_t downstream_id = pp_downstream * num_of_functionality + i; + + // 1F1B, last stage pp_buff_size should be 1, while first stage + // pp_buff_size should be pp_degree + int64_t pp_buff_size = exe_desc_.pp_degree() - coord.pp_idx; + + std::vector> ups; + std::vector> downs; + + if (i != 0) { // not lr + int64_t buff_size = is_backward ? pp_buff_size : 2; + ups.emplace_back(prev_id, buff_size); } - if (i != task_nodes_.size() - 1) { - task_nodes_[i]->AddDownstreamTask(cur_rank * num_of_functionality + i + - 1); + if (i != task_nodes_.size() - 1) { // not optimize + int64_t buff_size = is_forward ? pp_buff_size : 2; + downs.emplace_back(next_id, buff_size); } - if (IsForward(task_nodes_[i]->role())) { - if (pp_upstream != -1) { - task_nodes_[i]->AddUpstreamTask(pp_upstream * num_of_functionality + i); + + if (is_forward) { + if (!is_first_stage) { + ups.emplace_back(upstream_id, 2); } - if (pp_downstream != -1) { - task_nodes_[i]->AddDownstreamTask(pp_downstream * num_of_functionality + - i); + if (!is_last_stage) { + downs.emplace_back(downstream_id, 2); } - } else if (IsBackward(task_nodes_[i]->role())) { - if (pp_downstream != -1) { - task_nodes_[i]->AddUpstreamTask(pp_downstream * num_of_functionality + - i); + } else if (is_backward) { + if (!is_last_stage) { + ups.emplace_back(downstream_id, 2); } - if (pp_upstream != -1) { - task_nodes_[i]->AddDownstreamTask(pp_upstream * num_of_functionality + - i); + if (!is_first_stage) { + downs.emplace_back(upstream_id, 2); } } + + for (auto up : ups) { + VLOG(3) << "Task(" << cur_id << ") AddUpstream Task(" << up.first + << ") with buff_size=" << up.second; + node->AddUpstreamTask(up.first, up.second); + } + for (auto down : downs) { + VLOG(3) << "Task(" << cur_id << ") AddDownstream Task(" << down.first + << ") with buff_size=" << down.second; + node->AddDownstreamTask(down.first, down.second); + } } } diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index f2e785010b7263..e92ab09d481e8f 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -57,14 +57,14 @@ TaskNode::TaskNode(int32_t role, int64_t rank, int64_t task_id, max_run_times_(max_run_times), max_slot_nums_(max_slot_nums) {} -bool TaskNode::AddUpstreamTask(int64_t task_id) { - const auto& ret = upstream_.insert(task_id); - return *ret.first == task_id; +bool TaskNode::AddUpstreamTask(int64_t task_id, int64_t buff_size) { + const auto& ret = upstream_.emplace(task_id, buff_size); + return ret.second; } -bool TaskNode::AddDownstreamTask(int64_t task_id) { - const auto& ret = downstream_.insert(task_id); - return *ret.first == task_id; +bool TaskNode::AddDownstreamTask(int64_t task_id, int64_t buff_size) { + const auto& ret = downstream_.emplace(task_id, buff_size); + return ret.second; } std::string TaskNode::DebugString() const { diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 23fb4c0a7dbfcd..a03ccd4cded18e 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -48,8 +48,12 @@ class TaskNode final { int64_t run_at_offset() const { return run_at_offset_; } int64_t reply_up_per_steps() const { return reply_up_per_steps_; } int64_t send_down_per_steps() const { return 
send_down_per_steps_; } - const std::unordered_set& upstream() const { return upstream_; } - const std::unordered_set& downstream() const { return downstream_; } + const std::unordered_map& upstream() const { + return upstream_; + } + const std::unordered_map& downstream() const { + return downstream_; + } const std::string& type() const { return type_; } const paddle::framework::ProgramDesc& program() const { return program_; } const std::vector& ops() const { return ops_; } @@ -60,8 +64,9 @@ class TaskNode final { void SetSendDownPerSteps(int64_t value); void SetType(const std::string& type) { type_ = type; } - bool AddUpstreamTask(int64_t task_id); - bool AddDownstreamTask(int64_t task_id); + // upstream need buffs? + bool AddUpstreamTask(int64_t task_id, int64_t buff_size = 1); + bool AddDownstreamTask(int64_t task_id, int64_t buff_size = 1); std::string DebugString() const; private: @@ -69,8 +74,9 @@ class TaskNode final { TaskNode() = default; // ops_ will be removed in the future std::vector ops_; - std::unordered_set upstream_; - std::unordered_set downstream_; + // task_id-->buff_size + std::unordered_map upstream_; + std::unordered_map downstream_; framework::ProgramDesc program_; std::vector> ops_vec_; int32_t role_; diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 8f44b2035aea02..44dc0c9bc9b0c9 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -56,8 +56,8 @@ TEST(ComputeInterceptor, Compute) { TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); // a->b->c - node_a->AddDownstreamTask(1); - node_b->AddUpstreamTask(0); + node_a->AddDownstreamTask(1, 3); + node_b->AddUpstreamTask(0, 3); node_b->AddDownstreamTask(2); node_c->AddUpstreamTask(1); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc index db42135040ae74..936a970c05f7c5 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -25,19 +25,34 @@ limitations under the License. 
*/ namespace paddle { namespace distributed { -void LinkNodes(const std::vector& nodes) { +int64_t GetBuffSize( + const std::map, int64_t> buffs, + TaskNode* from, TaskNode* to) { + if (buffs.find({from, to}) != buffs.end()) { + return buffs.at({from, to}); + } + if (buffs.find({to, from}) != buffs.end()) { + return buffs.at({to, from}); + } + return 2; // set default 2 +} + +void LinkNodes(const std::vector& nodes, + const std::map, int64_t> buffs) { size_t size = nodes.size(); if (size <= 1) return; { // i = 0 TaskNode* now = nodes[0]; TaskNode* next = nodes[1]; - now->AddDownstreamTask(next->task_id()); + auto buff_size = GetBuffSize(buffs, now, next); + now->AddDownstreamTask(next->task_id(), buff_size); } { // i = size - 1 TaskNode* prev = nodes[size - 2]; TaskNode* now = nodes[size - 1]; - now->AddUpstreamTask(prev->task_id()); + auto buff_size = GetBuffSize(buffs, prev, now); + now->AddUpstreamTask(prev->task_id(), buff_size); } for (size_t i = 1; i < size - 1; ++i) { @@ -45,8 +60,11 @@ void LinkNodes(const std::vector& nodes) { TaskNode* now = nodes[i]; TaskNode* next = nodes[i + 1]; - now->AddUpstreamTask(prev->task_id()); - now->AddDownstreamTask(next->task_id()); + auto buff_size = GetBuffSize(buffs, prev, now); + now->AddUpstreamTask(prev->task_id(), buff_size); + + buff_size = GetBuffSize(buffs, now, next); + now->AddDownstreamTask(next->task_id(), buff_size); } } @@ -55,7 +73,7 @@ TEST(AmplifierInterceptor, Amplifier) { MessageBus& msg_bus = MessageBus::Instance(); msg_bus.Init({{0, 0}, {1, 0}, {2, 0}, {3, 0}}, {{0, ""}}, ""); - int64_t micro_steps = 3; + int64_t micro_steps = 6; // NOTE: don't delete, otherwise interceptor will use undefined node TaskNode* node_a = @@ -65,7 +83,8 @@ TEST(AmplifierInterceptor, Amplifier) { TaskNode* node_d = new TaskNode(0, 0, 3, micro_steps, 0); // a->b->c->d - LinkNodes({node_a, node_b, node_c, node_d}); + // LR->F->B->U + LinkNodes({node_a, node_b, node_c, node_d}, {{{node_b, node_c}, 1}}); node_a->SetRunPerSteps(micro_steps); node_d->SetRunPerSteps(micro_steps); diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py index 2c944aa5dbc471..3dae8a5bf6b958 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py @@ -28,8 +28,9 @@ def test_task_node(self): self.assertEqual(task_node_0.task_id(), 0) self.assertEqual(task_node_1.task_id(), 1) self.assertEqual(task_node_2.task_id(), 2) - self.assertTrue(task_node_0.add_downstream_task(task_node_1.task_id())) - self.assertTrue(task_node_1.add_upstream_task(task_node_0.task_id())) + self.assertTrue( + task_node_0.add_downstream_task(task_node_1.task_id(), 1)) + self.assertTrue(task_node_1.add_upstream_task(task_node_0.task_id(), 1)) if __name__ == "__main__": From 797d898c351d2965514e9d53ad377f3317233028 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 3 Dec 2021 17:10:05 +0800 Subject: [PATCH 061/124] pre-commit use python3.8 (#37829) * fix * test=document_fix --- paddle/scripts/paddle_build.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a80caee028a2b4..4849fa20e58c3b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -316,6 +316,10 @@ function check_style() { mkdir -p $GOPATH/src/github.com/PaddlePaddle/ ln -sf ${PADDLE_ROOT} 
$GOPATH/src/github.com/PaddlePaddle/Paddle + # pre-commit use python3.8.0 + OLD_PATH=$PATH + export PATH=export PATH=/usr/local/python3.8.0/bin:/usr/local/python3.8.0/include:/usr/local/bin:${PATH} + pre-commit install clang-format --version @@ -325,6 +329,8 @@ function check_style() { commit_files=off fi done + + export PATH=${OLD_PATH} if [ $commit_files == 'off' ];then echo "code format error" From 9ccb622898202fc32bd24c96c119b4e828459960 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 3 Dec 2021 17:36:35 +0800 Subject: [PATCH 062/124] [new-exec] use stream safe allocator in memcpy_h2d (#37777) * use sync h2d copy * use stream safe allocator in memcpy_h2d * remove wait * add guard --- .../framework/new_executor/interpretercore.cc | 1 + paddle/fluid/framework/tensor.cc | 31 +++++++++++++++++++ paddle/fluid/framework/tensor.h | 5 +++ paddle/fluid/operators/memcpy_h2d_op.h | 7 ++++- 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index f954b297510071..9f6e0557815062 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -439,6 +439,7 @@ void InterpreterCore::ExecuteInstructionList( if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught " << exception_holder_.Type(); + async_work_queue_->Cancel(); exception_holder_.ReThrow(); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 8d927b87c9abee..063ede6ffbf319 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" +DECLARE_bool(use_stream_safe_cuda_allocator); + namespace paddle { namespace memory { namespace allocation { @@ -89,6 +91,35 @@ void* Tensor::mutable_data(const platform::Place& place, return mutable_data(place, type_, requested_size); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void* Tensor::mutable_data(const platform::CUDAPlace& place, + proto::VarType::Type type, + const gpuStream_t& stream) { + if (!FLAGS_use_stream_safe_cuda_allocator) { + return mutable_data(place, type); + } + + type_ = type; + PADDLE_ENFORCE_GE( + numel(), 0, + platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), "] now")); + size_t size = numel() * SizeOfType(type); + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + holder_.reset(); + holder_ = memory::AllocShared(place, size, stream); + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} +#endif + Tensor& Tensor::ShareDataWith(const Tensor& src) { src.check_memory_size(); *this = src; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 2efaa3f37f9e9a..494a02878f1a2c 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -149,6 +149,11 @@ class Tensor { void* mutable_data(const platform::Place& place, size_t requested_size = 0); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void* mutable_data(const platform::CUDAPlace& place, + proto::VarType::Type type, const gpuStream_t& stream); +#endif + /** * @brief Return a pointer to mutable memory block. 
* diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 3998db6731b3d2..43ac5984bc8c84 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -41,7 +41,12 @@ class MemcpyH2DFunctor { void operator()(const framework::LoDTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + out_tensor.mutable_data( + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()), + lod_tensor.type(), + static_cast(&dev_ctx_)->stream()); +#endif if (dst_place_type_ == 0 || dst_place_type_ == 1) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); From a6d2fddb601f4c068ba8d2640b381909e2e7587d Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 3 Dec 2021 18:01:13 +0800 Subject: [PATCH 063/124] refine structure for cuda and rocm (#37202) * refine structure for cuda and rocm * update * update * update * update --- paddle/fluid/framework/conv_search_cache.h | 6 +- .../framework/details/all_reduce_op_handle.cc | 10 +- .../framework/details/all_reduce_op_handle.h | 2 +- .../framework/details/broadcast_op_handle.cc | 2 +- .../framework/details/broadcast_op_handle.h | 2 +- .../details/broadcast_op_handle_test.h | 2 +- .../fluid/framework/details/build_strategy.h | 2 +- .../details/eager_deletion_op_handle.cc | 16 +- .../details/fused_all_reduce_op_handle.cc | 24 +- .../details/fused_all_reduce_op_handle.h | 2 +- .../details/fused_broadcast_op_handle.h | 2 +- .../details/gather_op_handle_test.cc | 2 +- .../details/grad_merge_all_reduce_op_handle.h | 2 +- .../framework/details/nan_inf_utils_detail.cu | 6 +- .../fluid/framework/details/nccl_op_handle.h | 43 +- .../fluid/framework/details/op_handle_base.cc | 30 +- .../framework/details/reduce_op_handle.cc | 2 +- .../framework/details/reduce_op_handle.h | 2 +- .../details/reduce_op_handle_test.cc | 2 +- .../details/sparse_all_reduce_op_handle.cc | 4 +- .../details/sparse_all_reduce_op_handle.h | 2 +- paddle/fluid/framework/device_worker.h | 2 +- paddle/fluid/framework/dlpack_tensor_test.cc | 2 +- paddle/fluid/framework/fleet/ascend_wrapper.h | 2 +- paddle/fluid/framework/fleet/box_wrapper.cc | 2 +- paddle/fluid/framework/fleet/box_wrapper.cu | 2 +- paddle/fluid/framework/fleet/box_wrapper.h | 6 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 4 +- paddle/fluid/framework/fleet/fleet_wrapper.h | 2 +- .../framework/fleet/heter_ps/hashtable.h | 2 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 58 +- .../fleet/heter_ps/heter_resource.cc | 14 +- paddle/fluid/framework/fleet/nccl_wrapper.cc | 10 +- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 2 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 4 +- paddle/fluid/framework/garbage_collector.cc | 13 +- paddle/fluid/framework/generator.cc | 4 +- paddle/fluid/framework/heterxpu_trainer.cc | 16 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 8 +- .../framework/ir/fuse_bn_add_act_pass.cc | 7 +- paddle/fluid/framework/mixed_vector_test.cu | 4 +- .../framework/new_executor/interpretercore.cc | 9 +- .../fluid/framework/new_executor/profiler.h | 4 +- paddle/fluid/framework/operator.cc | 8 +- paddle/fluid/framework/parallel_executor.h | 2 +- paddle/fluid/framework/var_type_traits.cc | 6 +- .../fluid/framework/var_type_traits_test.cc | 6 +- paddle/fluid/imperative/all_reduce.cc | 28 +- paddle/fluid/imperative/nccl_context.cc | 16 +- paddle/fluid/imperative/nccl_context.h | 2 +- paddle/fluid/imperative/prepared_operator.cc | 10 
+- paddle/fluid/inference/api/analysis_config.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 2 +- .../inference/api/paddle_infer_contrib.cc | 4 +- .../tensorrt/convert/io_converter.cc | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 2 +- .../plugin/deformable_conv_op_plugin.cu | 8 +- .../plugin/instance_norm_op_plugin.cu | 2 +- .../tensorrt/plugin/matmul_op_int8_plugin.cu | 330 ++++---- .../tensorrt/plugin/split_op_plugin.cu | 6 +- .../inference/tensorrt/trt_int8_calibrator.cc | 2 +- .../memory/allocation/allocator_facade.cc | 29 +- .../memory/allocation/allocator_facade.h | 2 +- ...o_growth_best_fit_allocator_facade_test.cc | 2 +- .../fluid/memory/allocation/cuda_allocator.cc | 10 +- .../cuda_device_context_allocator.h | 17 +- .../allocation/cuda_virtual_mem_allocator.cc | 32 +- .../allocation/naive_best_fit_allocator.cc | 2 +- .../memory/allocation/pinned_allocator.cc | 8 +- .../allocation/stream_safe_cuda_allocator.cc | 16 +- .../allocation/thread_local_allocator.h | 2 +- paddle/fluid/memory/detail/buddy_allocator.h | 2 +- .../memory/detail/buddy_allocator_test.cc | 2 +- .../fluid/memory/detail/system_allocator.cc | 8 +- .../memory/detail/system_allocator_test.cc | 9 +- paddle/fluid/memory/memcpy.h | 2 +- paddle/fluid/memory/pinned_memory_test.cu | 2 +- .../memory/stream_safe_cuda_alloc_test.cu | 40 +- paddle/fluid/operators/activation_cudnn.cu.cc | 6 +- .../fluid/operators/activation_cudnn_op.cu.cc | 14 +- paddle/fluid/operators/activation_op.cu | 2 +- paddle/fluid/operators/affine_channel_op.cu | 2 +- .../operators/affine_grid_cudnn_op.cu.cc | 4 +- paddle/fluid/operators/affine_grid_op.cc | 7 +- paddle/fluid/operators/affine_grid_op.cu | 6 +- paddle/fluid/operators/argsort_op.cu | 8 +- .../fluid/operators/average_accumulates_op.cu | 2 +- paddle/fluid/operators/batch_fc_op.cu | 4 +- paddle/fluid/operators/batch_norm_op.cu | 75 +- paddle/fluid/operators/bce_loss_op.cu | 4 +- paddle/fluid/operators/bilateral_slice_op.cu | 4 +- paddle/fluid/operators/bincount_op.cu | 4 +- paddle/fluid/operators/cast_op.cu | 2 +- paddle/fluid/operators/center_loss_op.cu | 4 +- paddle/fluid/operators/cholesky_op.cu | 21 +- paddle/fluid/operators/cinn_launch_op.cu.cc | 6 +- .../fluid/operators/class_center_sample_op.cu | 24 +- .../fluid/operators/collective/allreduce_op.h | 10 +- .../operators/collective/alltoall_op.cu.cc | 10 +- .../operators/collective/barrier_op.cu.cc | 10 +- .../operators/collective/broadcast_op.cu.cc | 10 +- .../operators/collective/c_allgather_op.cu.cc | 4 +- .../operators/collective/c_allreduce_op.h | 4 +- .../operators/collective/c_broadcast_op.cu.cc | 6 +- .../collective/c_comm_init_all_op.cc | 2 +- .../collective/c_comm_init_multitrainer_op.cc | 2 +- .../operators/collective/c_concat_op.cu.cc | 4 +- .../operators/collective/c_embedding_op.cu | 2 +- .../operators/collective/c_gen_nccl_id_op.cc | 2 +- .../fluid/operators/collective/c_reduce_op.h | 4 +- .../collective/c_reducescatter_op.cu.cc | 4 +- .../operators/collective/c_scatter_op.cu.cc | 6 +- .../c_softmax_with_cross_entropy_op.cu | 8 +- .../fluid/operators/collective/c_split_op.cu | 2 +- .../collective/c_sync_calc_stream_op.cc | 6 +- .../collective/c_sync_comm_stream_op.cc | 8 +- .../operators/collective/c_wait_comm_op.cc | 8 +- .../operators/collective/c_wait_compute_op.cc | 8 +- .../operators/collective/gen_nccl_id_op.cc | 4 +- .../collective/global_gather_op.cu.cc | 10 +- .../collective/global_scatter_op.cu.cc | 10 +- .../collective/partial_allgather_op.cu.cc | 4 +- .../collective/partial_recv_op.cu.cc | 4 +- 
.../collective/partial_send_op.cu.cc | 4 +- .../operators/collective/recv_v2_op.cu.cc | 6 +- .../operators/collective/send_v2_op.cu.cc | 6 +- .../operators/controlflow/get_places_op.cc | 4 +- paddle/fluid/operators/conv_cudnn_helper.h | 45 +- paddle/fluid/operators/conv_cudnn_op.cu | 37 +- paddle/fluid/operators/conv_cudnn_op_cache.h | 6 +- paddle/fluid/operators/conv_miopen_helper.h | 14 +- paddle/fluid/operators/conv_op.cc | 10 +- paddle/fluid/operators/conv_shift_op.cu | 2 +- .../operators/conv_transpose_cudnn_op.cu | 41 +- paddle/fluid/operators/cudnn_lstm_cache.h | 14 +- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 38 +- paddle/fluid/operators/cudnn_rnn_cache.h | 96 +-- paddle/fluid/operators/cumsum_op.cu | 2 +- paddle/fluid/operators/cvm_op.cu | 2 +- paddle/fluid/operators/data_norm_op.cu | 16 +- paddle/fluid/operators/deformable_conv_op.cu | 2 +- .../fluid/operators/deformable_conv_v1_op.cu | 2 +- .../operators/deformable_psroi_pooling_op.cu | 2 +- paddle/fluid/operators/dequantize_log_op.cu | 2 +- .../fluid/operators/detection/bbox_util.cu.h | 3 +- .../fluid/operators/detection/box_clip_op.cu | 2 +- .../fluid/operators/detection/box_coder_op.cu | 2 +- .../detection/box_decoder_and_assign_op.cu | 2 +- .../detection/collect_fpn_proposals_op.cu | 2 +- .../detection/distribute_fpn_proposals_op.cu | 2 +- .../detection/polygon_box_transform_op.cu | 4 +- .../detection/roi_perspective_transform_op.cu | 2 +- .../detection/sigmoid_focal_loss_op.cu | 2 +- .../fluid/operators/detection/yolo_box_op.cu | 2 +- paddle/fluid/operators/diagonal_op.cu | 2 +- paddle/fluid/operators/dropout_impl.cu.h | 10 +- paddle/fluid/operators/edit_distance_op.cu | 4 +- .../elementwise/elementwise_op_function.h | 6 +- .../test_elementwise_add_op_inplace.cc | 5 +- paddle/fluid/operators/fake_quantize_op.cu | 2 +- .../fluid/operators/fused/attn_bias_add.cu.h | 6 +- .../fluid/operators/fused/conv_fusion_op.cc | 4 +- .../fluid/operators/fused/conv_fusion_op.cu | 48 +- .../fused/cudnn_bn_stats_finalize.cu.h | 3 +- .../operators/fused/cudnn_fusion_helper.h | 28 +- .../operators/fused/cudnn_norm_conv.cu.h | 7 +- .../fused/cudnn_scale_bias_add_relu.cu.h | 3 +- .../operators/fused/fused_attention_op.cu | 4 +- .../operators/fused/fused_bn_activation_op.cu | 42 +- .../fused/fused_bn_add_activation_op.cu | 42 +- .../operators/fused/fused_dropout_common.h | 6 +- .../fused_fc_elementwise_layernorm_op.cu | 2 +- .../fused_layernorm_residual_dropout_bias.h | 2 +- .../fused/fusion_conv_inception_op.cc | 2 +- .../fused/fusion_conv_inception_op.cu | 65 +- .../fusion_transpose_flatten_concat_op.cu.cc | 16 +- paddle/fluid/operators/gather.cu.h | 4 +- paddle/fluid/operators/graph_send_recv_op.cu | 2 +- .../operators/grid_sampler_cudnn_op.cu.cc | 17 +- paddle/fluid/operators/grid_sampler_op.cc | 7 +- paddle/fluid/operators/grid_sampler_op.cu | 6 +- paddle/fluid/operators/group_norm_op.cu | 4 +- paddle/fluid/operators/histogram_op.cu | 4 +- paddle/fluid/operators/index_sample_op.cu | 4 +- paddle/fluid/operators/index_select_op.cu | 26 +- paddle/fluid/operators/instance_norm_op.cu | 69 +- paddle/fluid/operators/interpolate_op.cu | 4 +- paddle/fluid/operators/interpolate_v2_op.cu | 6 +- .../kernel_primitives/compute_primitives.h | 2 +- paddle/fluid/operators/layer_norm_kernel.cu.h | 9 +- paddle/fluid/operators/linspace_op.cu | 2 +- paddle/fluid/operators/lite/lite_engine_op.h | 2 +- paddle/fluid/operators/log_softmax_op.cu | 2 +- paddle/fluid/operators/lookup_table_op.cu | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 2 +- 
.../operators/margin_cross_entropy_op.cu | 10 +- paddle/fluid/operators/math/beam_search.cu | 2 +- paddle/fluid/operators/math/blas_impl.cu.h | 94 ++- paddle/fluid/operators/math/blas_impl.hip.h | 99 ++- .../fluid/operators/math/concat_and_split.cu | 2 +- .../fluid/operators/math/cos_sim_functor.cu | 2 +- paddle/fluid/operators/math/cross_entropy.cu | 4 +- paddle/fluid/operators/math/depthwise_conv.cu | 4 +- .../operators/math/detail/gru_gpu_kernel.h | 2 +- .../operators/math/detail/lstm_gpu_kernel.h | 2 +- .../operators/math/eigen_values_vectors.h | 20 +- paddle/fluid/operators/math/im2col.cu | 4 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/math/maxouting.cu | 2 +- paddle/fluid/operators/math/pooling.cu | 6 +- paddle/fluid/operators/math/prelu.h | 6 +- paddle/fluid/operators/math/sample_prob.cu | 12 +- .../fluid/operators/math/segment_pooling.cu | 4 +- .../operators/math/selected_rows_functor.cu | 2 +- .../fluid/operators/math/sequence_pooling.cu | 2 +- paddle/fluid/operators/math/sequence_scale.cu | 2 +- paddle/fluid/operators/math/softmax.cu | 14 +- paddle/fluid/operators/math/unpooling.cu | 2 +- paddle/fluid/operators/math/vol2col.cu | 4 +- paddle/fluid/operators/matrix_rank_op.cu | 32 +- paddle/fluid/operators/mean_iou_op.cu | 4 +- paddle/fluid/operators/mean_op.cu | 6 +- paddle/fluid/operators/metrics/accuracy_op.cu | 4 +- paddle/fluid/operators/metrics/auc_op.cu | 2 +- paddle/fluid/operators/miopen_lstm_cache.h | 12 +- paddle/fluid/operators/miopen_rnn_cache.h | 96 +-- paddle/fluid/operators/mish_op.cu | 4 +- paddle/fluid/operators/mv_op.cu | 2 +- .../fluid/operators/nccl/nccl_gpu_common.cc | 2 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 8 +- .../fluid/operators/nccl/nccl_op_test.cu.cc | 4 +- paddle/fluid/operators/nll_loss_op.cu | 2 +- paddle/fluid/operators/norm_utils.cu.h | 6 +- paddle/fluid/operators/one_hot_op.cu | 4 +- paddle/fluid/operators/one_hot_v2_op.cu | 4 +- .../fluid/operators/optimizers/adagrad_op.cu | 2 +- paddle/fluid/operators/optimizers/sgd_op.cu | 2 +- .../operators/optimizers/sparse_momentum_op.h | 4 +- paddle/fluid/operators/pad2d_op.cu | 4 +- paddle/fluid/operators/pad3d_op.cu | 4 +- paddle/fluid/operators/pool_cudnn_op.cu.cc | 25 +- paddle/fluid/operators/pool_op.cc | 7 +- paddle/fluid/operators/prelu_op.cu | 2 +- paddle/fluid/operators/prroi_pool_op.h | 2 +- paddle/fluid/operators/psroi_pool_op.cu | 2 +- .../operators/pull_box_extended_sparse_op.cu | 4 +- paddle/fluid/operators/pull_box_sparse_op.cu | 4 +- paddle/fluid/operators/qr_op.cu | 16 +- paddle/fluid/operators/range_op.cu | 2 +- paddle/fluid/operators/rank_attention_op.cu | 4 +- .../fluid/operators/reader/buffered_reader.cc | 21 +- .../fluid/operators/reader/buffered_reader.h | 4 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 11 +- paddle/fluid/operators/rnn_op.cu.cc | 80 +- paddle/fluid/operators/roi_align_op.cu | 4 +- paddle/fluid/operators/roi_pool_op.cu | 2 +- paddle/fluid/operators/roll_op.cu | 2 +- paddle/fluid/operators/row_conv_op.cu | 2 +- paddle/fluid/operators/scatter.cu.h | 2 +- paddle/fluid/operators/segment_pool_op.cu | 4 +- paddle/fluid/operators/segment_pool_op.h | 4 +- .../sequence_ops/sequence_enumerate_op.cu | 2 +- .../sequence_ops/sequence_erase_op.cu | 2 +- .../sequence_ops/sequence_expand_as_op.cu | 2 +- .../sequence_ops/sequence_expand_op.cu | 2 +- paddle/fluid/operators/shard_index_op.cu | 4 +- paddle/fluid/operators/shuffle_channel_op.cu | 4 +- .../sigmoid_cross_entropy_with_logits_op.cu | 2 +- 
paddle/fluid/operators/softmax_cudnn_op.cu.h | 24 +- paddle/fluid/operators/softmax_op.cc | 8 +- .../softmax_with_cross_entropy_op.cu | 10 +- paddle/fluid/operators/spectral_helper.h | 16 +- paddle/fluid/operators/spectral_op.cu | 22 +- paddle/fluid/operators/stack_op.cu | 2 +- paddle/fluid/operators/svd_op.cu | 16 +- .../fluid/operators/sync_batch_norm_op.cu.h | 9 +- paddle/fluid/operators/temporal_shift_op.cu | 4 +- paddle/fluid/operators/top_k_function_cuda.h | 2 +- paddle/fluid/operators/transpose_op.cu.h | 4 +- paddle/fluid/operators/trunc_op.cu | 4 +- paddle/fluid/operators/warpctc_op.cc | 7 +- paddle/fluid/operators/where_index_op.cu | 2 +- paddle/fluid/operators/where_op.cu | 2 +- paddle/fluid/platform/CMakeLists.txt | 16 +- paddle/fluid/platform/collective_helper.cc | 14 +- paddle/fluid/platform/cuda_device_guard.h | 2 +- .../platform/cuda_graph_with_memory_pool.h | 2 +- paddle/fluid/platform/device/CMakeLists.txt | 3 + .../fluid/platform/device/gpu/CMakeLists.txt | 15 + .../platform/device/gpu/cuda/CMakeLists.txt | 5 + .../gpu/cuda}/cuda_device_function.h | 93 +-- .../{ => device/gpu/cuda}/cuda_graph.cc | 22 +- .../{ => device/gpu/cuda}/cuda_graph.h | 6 +- .../{ => device/gpu/cuda}/cuda_helper.h | 33 - .../platform/device/gpu/cuda/cuda_info.cc | 268 +++++++ .../{ => device/gpu/cuda}/cuda_profiler.cc | 8 +- .../{ => device/gpu/cuda}/cuda_profiler.h | 0 .../{ => device/gpu/cuda}/cudnn_desc.h | 44 +- .../{ => device/gpu/cuda}/cudnn_helper.h | 70 +- .../gpu/cuda}/cudnn_helper_test.cc | 2 +- .../{ => device/gpu}/cuda_helper_test.cu | 6 +- .../{ => device/gpu}/cudnn_desc_test.cc | 6 +- .../gpu/gpu_device_function.h} | 24 +- paddle/fluid/platform/device/gpu/gpu_dnn.h | 27 + paddle/fluid/platform/device/gpu/gpu_helper.h | 26 + paddle/fluid/platform/device/gpu/gpu_info.cc | 356 +++++++++ .../platform/{ => device/gpu}/gpu_info.h | 76 +- .../{ => device/gpu}/gpu_launch_config.h | 1 + .../gpu/gpu_primitives.h} | 0 .../gpu/gpu_resource_pool.cc} | 24 +- .../gpu/gpu_resource_pool.h} | 0 paddle/fluid/platform/device/gpu/gpu_types.h | 94 +++ .../platform/{ => device/gpu}/nccl_helper.h | 4 +- .../platform/device/gpu/rocm/CMakeLists.txt | 3 + .../{ => device/gpu/rocm}/miopen_desc.h | 32 +- .../{ => device/gpu/rocm}/miopen_helper.h | 62 +- .../gpu/rocm}/miopen_helper_test.cc | 2 +- .../device/gpu/rocm/rocm_device_function.h | 160 ++++ .../platform/device/gpu/rocm/rocm_helper.h | 102 +++ .../platform/device/gpu/rocm/rocm_info.cc | 269 +++++++ .../fluid/platform/device/npu/hccl_helper.h | 4 +- paddle/fluid/platform/device_context.cc | 20 +- paddle/fluid/platform/device_context.h | 20 +- paddle/fluid/platform/device_context_test.cu | 6 +- .../fluid/platform/device_memory_aligment.h | 4 +- paddle/fluid/platform/dynload/miopen.h | 6 + paddle/fluid/platform/enforce.h | 6 +- paddle/fluid/platform/enforce_test.cc | 4 +- paddle/fluid/platform/event.h | 10 +- paddle/fluid/platform/for_range.h | 2 +- paddle/fluid/platform/gpu_info.cc | 734 ------------------ paddle/fluid/platform/init_test.cc | 2 +- paddle/fluid/platform/profiler.cu | 18 +- paddle/fluid/platform/profiler.h | 2 +- paddle/fluid/platform/profiler_helper.h | 8 +- paddle/fluid/platform/stream/cuda_stream.cc | 14 +- paddle/fluid/platform/stream/cuda_stream.h | 24 +- .../fluid/platform/stream_callback_manager.cc | 17 +- .../fluid/platform/test_limit_gpu_memory.cu | 63 +- paddle/fluid/pybind/cuda_streams_py.cc | 6 +- paddle/fluid/pybind/imperative.cc | 2 +- paddle/fluid/pybind/pybind.cc | 21 +- paddle/pten/api/lib/ext_compat_utils.cc | 2 +- 
paddle/pten/core/convert_utils.cc | 2 +- .../kernels/functions/cuda/cast_kernel_impl.h | 4 +- .../functions/cuda/reduce/reduce_cuda_impl.h | 10 +- tools/check_file_diff_approvals.sh | 2 +- 347 files changed, 3043 insertions(+), 2885 deletions(-) create mode 100644 paddle/fluid/platform/device/gpu/CMakeLists.txt create mode 100644 paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt rename paddle/fluid/platform/{ => device/gpu/cuda}/cuda_device_function.h (67%) rename paddle/fluid/platform/{ => device/gpu/cuda}/cuda_graph.cc (90%) rename paddle/fluid/platform/{ => device/gpu/cuda}/cuda_graph.h (96%) rename paddle/fluid/platform/{ => device/gpu/cuda}/cuda_helper.h (78%) create mode 100644 paddle/fluid/platform/device/gpu/cuda/cuda_info.cc rename paddle/fluid/platform/{ => device/gpu/cuda}/cuda_profiler.cc (85%) rename paddle/fluid/platform/{ => device/gpu/cuda}/cuda_profiler.h (100%) rename paddle/fluid/platform/{ => device/gpu/cuda}/cudnn_desc.h (84%) rename paddle/fluid/platform/{ => device/gpu/cuda}/cudnn_helper.h (88%) rename paddle/fluid/platform/{ => device/gpu/cuda}/cudnn_helper_test.cc (98%) rename paddle/fluid/platform/{ => device/gpu}/cuda_helper_test.cu (98%) rename paddle/fluid/platform/{ => device/gpu}/cudnn_desc_test.cc (90%) rename paddle/fluid/platform/{type_defs.h => device/gpu/gpu_device_function.h} (50%) create mode 100644 paddle/fluid/platform/device/gpu/gpu_dnn.h create mode 100644 paddle/fluid/platform/device/gpu/gpu_helper.h create mode 100644 paddle/fluid/platform/device/gpu/gpu_info.cc rename paddle/fluid/platform/{ => device/gpu}/gpu_info.h (70%) rename paddle/fluid/platform/{ => device/gpu}/gpu_launch_config.h (98%) rename paddle/fluid/platform/{cuda_primitives.h => device/gpu/gpu_primitives.h} (100%) rename paddle/fluid/platform/{cuda_resource_pool.cc => device/gpu/gpu_resource_pool.cc} (84%) rename paddle/fluid/platform/{cuda_resource_pool.h => device/gpu/gpu_resource_pool.h} (100%) create mode 100644 paddle/fluid/platform/device/gpu/gpu_types.h rename paddle/fluid/platform/{ => device/gpu}/nccl_helper.h (99%) create mode 100644 paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt rename paddle/fluid/platform/{ => device/gpu/rocm}/miopen_desc.h (88%) rename paddle/fluid/platform/{ => device/gpu/rocm}/miopen_helper.h (89%) rename paddle/fluid/platform/{ => device/gpu/rocm}/miopen_helper_test.cc (98%) create mode 100644 paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h create mode 100644 paddle/fluid/platform/device/gpu/rocm/rocm_helper.h create mode 100644 paddle/fluid/platform/device/gpu/rocm/rocm_info.cc delete mode 100644 paddle/fluid/platform/gpu_info.cc diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index db8dc22f68663b..51446f287e94b7 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -17,11 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator_kernel_configs.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 3429677a2403e4..b1573093ec333f 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -291,13 +291,9 @@ void AllReduceOpHandle::SyncNCCLAllReduce() { nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_); auto &nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); -#endif + + platform::GpuStreamSync(stream); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); } } } diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index 033d9396e9bf23..02e35895205b76 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -33,7 +33,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/framework/details/bkcl_op_handle.h" #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 36b840e4945a0a..a11a244214d4fc 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -111,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, numel, static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 0b062b1a3f49a4..055c7e63863b37 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -44,7 +44,7 @@ struct BKCLContextMap; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 6ca4baa6d8b040..2e82fe22dba731 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -95,7 +95,7 @@ struct TestBroadcastOpHandle { 
#endif } else if (use_device_ == p::kCUDA) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 68c5daaac5d780..f9c28cbee50c3e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -40,7 +40,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 07f7bbdb97a8d4..bcdd6129230b01 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -49,10 +49,10 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( platform::CUDADeviceGuard guard( BOOST_GET_CONST(platform::CUDAPlace, place).device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); #endif PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument( @@ -75,9 +75,9 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } #endif @@ -160,12 +160,12 @@ void EagerDeletionOpHandle::ClearGarbages( reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(callback_stream, event_, 0)); #endif }; diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 94507140a81d61..bd153f24fa318a 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -55,9 +55,9 @@ FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); 
#endif }; destroy_event(start_event_); @@ -87,10 +87,10 @@ void FusedAllReduceOpHandle::RunImpl() { auto create_event = [](gpuEvent_t *event) { if (*event) return; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); #endif }; @@ -109,12 +109,12 @@ void FusedAllReduceOpHandle::RunImpl() { auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); nccl_stream = nccl_ctx.stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(nccl_stream, start_event_, 0)); #endif } else { @@ -169,12 +169,12 @@ void FusedAllReduceOpHandle::RunImpl() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(compute_stream, end_event_, 0)); #endif } diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index 31336b92c4dfb6..d522981c77fa1a 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -35,7 +35,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 2fd1e0e7e98894..e08a768f8ce074 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -37,7 +37,7 @@ struct NCCLContextMap; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 98c37ca3c406a5..38e20127f1612e 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -48,7 +48,7 @@ struct TestGatherOpHandle { void InitCtxOnGpu(bool use_gpu) { if (use_gpu) { #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h index c59f61347303d4..9cfc3ada6ac3d7 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h @@ -35,7 +35,7 @@ class NCCLCommunicator; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index a9ea336e425457..82557076544163 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -40,7 +40,7 @@ static std::vector& multi_op_var2gpu_str_mutex() { } static void InitMultiGPUOpVarMap() { - int dev_count = platform::GetCUDADeviceCount(); + int dev_count = platform::GetGPUDeviceCount(); PADDLE_ENFORCE_GT(dev_count, 0, platform::errors::NotFound( "cuda device must > 0, now dev_count=%d", dev_count)); @@ -161,11 +161,11 @@ void TensorCheckerVisitor::apply( op_var)); #ifdef __HIPCC__ - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, hipMemcpyHostToDevice, dev_ctx->stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, cudaMemcpyHostToDevice, dev_ctx->stream())); #endif diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 762f4071b5cabd..324d39ed8bb77a 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -27,7 +27,7 @@ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/rccl.h" #endif -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" DECLARE_bool(sync_nccl_allreduce); @@ -52,16 +52,16 @@ class NCCLOpHandleBase : public OpHandleBase { virtual ~NCCLOpHandleBase() { for (auto& ev : inter_events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } for (auto& ev : exter_events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } } @@ -109,14 +109,14 @@ class NCCLOpHandleBase : public OpHandleBase { platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &inter_events_[dev_id], hipEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( 
&exter_events_[dev_id], hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &inter_events_[dev_id], cudaEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &exter_events_[dev_id], cudaEventDisableTiming)); #endif VLOG(10) << "Create events on dev_id:" << dev_id @@ -142,7 +142,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } @@ -192,7 +192,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); #ifdef PADDLE_WITH_HIP @@ -202,11 +202,7 @@ class NCCLOpHandleBase : public OpHandleBase { #endif if (FLAGS_sync_nccl_allreduce) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } @@ -230,26 +226,21 @@ class NCCLOpHandleBase : public OpHandleBase { #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); hipEventRecord(exter_events_.at(dev_id), stream); - - if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - } #else cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); cudaEventRecord(exter_events_.at(dev_id), stream); - +#endif if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + platform::GpuStreamSync(stream); } -#endif } void InterBroadCast(platform::Place place, void* sendbuff, size_t count, @@ -269,7 +260,7 @@ class NCCLOpHandleBase : public OpHandleBase { #else cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( sendbuff, count, datatype, 0, comm, stream)); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4b5d0563d73946..25b5eefc05cda3 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -35,9 +35,9 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif } } @@ -50,10 +50,10 @@ void OpHandleBase::InitCUDA() { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; platform::SetDeviceId(dev_id); #ifdef 
PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); #endif } @@ -182,9 +182,9 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { static_cast(waited_ctx)->stream(); for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif } } @@ -221,10 +221,10 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { static_cast(dev_ctxes_.at(place)) ->stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #endif #else @@ -250,11 +250,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { auto stream = static_cast(pool.Get(place)) ->stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with CUDA.")); @@ -279,10 +275,10 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #endif #else @@ -319,10 +315,10 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { auto *cuda_dev_ctx = static_cast(p.second); VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #endif } diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index a485838a959425..bbc458804a195f 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -193,7 +193,7 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( buffer, recvbuffer, numel, static_cast(type), ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index d56b6b3663003c..4b9f289eaa787a 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -41,7 +41,7 @@ struct NCCLContextMap; } // 
namespace platform } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 82f5ea6a66891a..35dba488454725 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -59,7 +59,7 @@ struct TestReduceOpHandle { use_gpu_ = use_gpu; if (use_gpu) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " "device count is " diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 37399e5ddc09d9..d916b9bc262765 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_device_guard.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(sync_nccl_allreduce); @@ -182,7 +182,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { << ", k:" << k << ", place:" << place << ", dtype:" << dtype; all_gather_calls.emplace_back([=] { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( in_tensor_buf, gather_buff, 2 * k, static_cast(dtype), comm, stream)); }); diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h index 8bfea0f1ae8b8a..5c3aef71ec40ea 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/details/dgc_const_values.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 600d75db53c7e7..15acedf3cf50a8 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -54,7 +54,7 @@ class DeviceContext; } // namespace paddle #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 4e2d7bb979b617..9b8bdebe706ebf 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -115,7 +115,7 @@ void TestMainLoop() { std::vector places{platform::CPUPlace(), platform::CUDAPlace(0), platform::CUDAPinnedPlace()}; - if (platform::GetCUDADeviceCount() > 1) { + if (platform::GetGPUDeviceCount() > 1) { 
places.emplace_back(platform::CUDAPlace(1)); } #else diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index f749ee8cfa0baa..82ce3b28776f12 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 37fbf47f854ade..8564a42165961b 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -19,7 +19,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index c9b5abf7a9befc..c91d371f5a1559 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -19,7 +19,7 @@ #include #include "paddle/fluid/framework/fleet/box_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 645d725871a061..b043edca138a84 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -40,7 +40,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" @@ -397,7 +397,7 @@ class BoxWrapper { if (nullptr != s_instance_) { VLOG(3) << "Begin InitializeGPU"; std::vector stream_list; - for (int i = 0; i < platform::GetCUDADeviceCount(); ++i) { + for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) { VLOG(3) << "before get context i[" << i << "]"; platform::CUDADeviceContext* context = dynamic_cast( @@ -416,7 +416,7 @@ class BoxWrapper { slot_name_omited_in_feedpass_.insert(slot_name); } slot_vector_ = slot_vector; - keys_tensor.resize(platform::GetCUDADeviceCount()); + keys_tensor.resize(platform::GetGPUDeviceCount()); } } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 66c043e137a247..225c2656fbfd1d 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -740,10 +740,10 @@ void FleetWrapper::PushDenseVarsAsync( BOOST_GET_CONST(platform::CUDAPlace, place), g_data, sizeof(float) * count, stream); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); #endif diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 6fddedccf02585..deb2b90c933532 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -35,7 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 646a2e97d319fb..e7f098320c6c75 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -28,7 +28,7 @@ limitations under the License. 
*/ // #include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index ec852ec83ca09c..c293b07e8995c1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -347,7 +347,7 @@ void HeterComm::build_ps(int num, KeyType* h_keys, gpuStream_t streams[stream_num]; for (int i = 0; i < stream_num; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(streams[i]))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType)); auto d_v_buf = memory::AllocShared(place, chunk_size * sizeof(ValType)); d_key_bufs.push_back(d_k_buf); @@ -360,11 +360,11 @@ void HeterComm::build_ps(int num, KeyType* h_keys, while (cur_len < len) { cur_stream = cur_stream % stream_num; int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, streams[cur_stream])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len, sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice, streams[cur_stream])); @@ -378,7 +378,7 @@ void HeterComm::build_ps(int num, KeyType* h_keys, for (int i = 0; i < stream_num; ++i) { cudaStreamSynchronize(streams[i]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); } } @@ -402,14 +402,14 @@ void HeterComm::merge_grad(int gpu_num, GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); void* d_buff = NULL; auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); temp_storage_bytes = 0; @@ -417,7 +417,7 @@ void HeterComm::merge_grad(int gpu_num, auto d_num_runs_out_mem = memory::AllocShared(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); @@ -426,13 +426,13 @@ void HeterComm::merge_grad(int gpu_num, d_temp_storage = memory::AllocShared(place, temp_storage_bytes); } - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); 
cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } template @@ -461,12 +461,12 @@ void HeterComm::split_input_to_shard( size_t temp_storage_bytes; const int num_bits = 1 + log2i(total_gpu); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); calc_shard_offset<<>>(d_shard_index_ptr, @@ -720,12 +720,12 @@ int HeterComm::gather_one_node_grad( cudaMemcpyHostToDevice); // allgather grad len - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, ncclInt, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, cudaMemcpyDeviceToHost); @@ -737,15 +737,15 @@ int HeterComm::gather_one_node_grad( storage.alloc(max_size * total_gpu); // allgather keys and grads - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, nccl_inner_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int h_left[total_gpu]; int h_right[total_gpu]; @@ -802,11 +802,11 @@ int HeterComm::gather_multi_node_grad( cudaMemcpy(d_node_len, h_node_len, sizeof(int), cudaMemcpyHostToDevice); // allgather grad len - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_node_len, d_node_len, 1, ncclInt, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * 
node_size_, cudaMemcpyDeviceToHost); @@ -818,15 +818,15 @@ int HeterComm::gather_multi_node_grad( storage.alloc(max_size * node_size_); // allgather keys and grads - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, nccl_inter_comm, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int merge_num = 0; for (int i = 0; i < node_size_; ++i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index a369a612d4935d..ccdb6c5cdd64e4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -30,11 +30,11 @@ GPUResource::GPUResource(std::vector& dev_ids, int index) { remote_streams_.resize(dev_ids_.size()); for (size_t i = 0; i < dev_ids_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&local_streams_[i], cudaStreamNonBlocking)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&comm_streams_[i], cudaStreamNonBlocking)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&remote_streams_[i], cudaStreamNonBlocking)); } } @@ -42,13 +42,13 @@ GPUResource::GPUResource(std::vector& dev_ids, int index) { GPUResource::~GPUResource() { platform::CUDADeviceGuard guard(dev_id_); for (size_t i = 0; i < local_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(local_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(local_streams_[i])); } for (size_t i = 0; i < comm_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(comm_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(comm_streams_[i])); } for (size_t i = 0; i < remote_streams_.size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(remote_streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(remote_streams_[i])); } } @@ -58,7 +58,7 @@ void HeterPsResource::enable_p2p() { for (size_t j = 0; j < dev_ids_.size(); ++j) { if (i != j) { int p2p_flag; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceCanAccessPeer(&p2p_flag, dev_ids_[i], dev_ids_[j])); if (p2p_flag == 1) { cudaError_t ret = cudaDeviceEnablePeerAccess(dev_ids_[j], 0); diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index 3ac95632de6bf6..cbd06deeafc75a 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -22,7 +22,7 @@ bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitRank( &(nccl_info_.comm_), nccl_info_.global_ranks_, 
nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); #endif @@ -38,7 +38,7 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { NCCLInfo NCCLWrapper::GetNCCLId() { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; @@ -52,9 +52,9 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, nccl_info_.global_ranks_ = ranks; platform::SetDeviceId(local_rank); #ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif #endif return; @@ -67,7 +67,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, auto var = scope.FindVar(name); LoDTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); #ifdef PADDLE_WITH_RCCL diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 6519a514ff3b69..a0954ef0709dc3 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b726a629586e18..c163c2de110191 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -37,8 +37,8 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/nccl.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE @@ -230,7 +230,7 @@ class PSGPUWrapper { ? 
1.0 : config["mf_max_bound"]; for (size_t i = 0; i < heter_devices_.size(); i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(heter_devices_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i])); this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, initial_range); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 9ab6b5d8c178b9..8b6a5747dbfced 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -83,9 +83,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, : GarbageCollector(place, max_memory_size) { platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_.reset( new platform::StreamCallbackManager(stream_)); #endif @@ -94,13 +94,8 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, StreamGarbageCollector::~StreamGarbageCollector() { auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(place.device); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); -#endif + platform::GpuStreamSync(stream_); + platform::GpuDestroyStream(stream_); } gpuStream_t StreamGarbageCollector::stream() const { return stream_; } diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 154154fc795179..a020bda8231670 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace framework { @@ -33,7 +33,7 @@ const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { static std::vector> default_cuda_generators; std::call_once(num_devices_init_flag, []() { - num_cuda_devices = paddle::platform::GetCUDADeviceCount(); + num_cuda_devices = paddle::platform::GetGPUDeviceCount(); cuda_device_flags.resize(num_cuda_devices); default_cuda_generators.resize(num_cuda_devices); }); diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 8049a1c9424beb..93b7869cc1d250 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -51,11 +51,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDADeviceGuard guard(place.device); cudaStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); copy_streams_.push_back(stream); places_.push_back(place); cudaEvent_t event; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); events_.push_back(event); #endif @@ -104,7 +104,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, // platform::CUDAPlace place = platform::CUDAPlace(num); // platform::CUDADeviceGuard guard(place.device); // cudaStream_t stream; - // PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + // PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); // copy_streams_.push_back(stream); // places_.push_back(place); // } @@ -157,7 +157,7 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { } } #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); #endif } @@ -287,7 +287,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { #ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); #endif object_pool_.Push(context); @@ -441,7 +441,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); #endif } @@ -461,7 +461,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #endif } #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, copy_streams_[context->place_num_])); while (cudaEventQuery(context->event_) != cudaSuccess) { VLOG(3) << "wait for kernel"; @@ -481,7 +481,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, #ifdef PADDLE_WITH_CUDA auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, dev_ctx->stream())); // cudaEventSynchronize(context->event_); { diff --git 
a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index ae662c64af3319..f12273e94dddd5 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -24,12 +24,8 @@ class Node; } // namespace ir } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index ec014d331fa447..005f006ab04788 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -15,13 +15,8 @@ #include "paddle/fluid/framework/ir/fuse_bn_add_act_pass.h" #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 8fb59d682e40fb..10e7ed0fb60219 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -24,7 +24,7 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" template using vec = paddle::framework::Vector; @@ -63,7 +63,7 @@ TEST(mixed_vector, GPU_VECTOR) { } TEST(mixed_vector, MultiGPU) { - if (paddle::platform::GetCUDADeviceCount() < 2) { + if (paddle::platform::GetGPUDeviceCount() < 2) { LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " "GPUs in your machine."; return; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 9f6e0557815062..dcbdd12f88fb7a 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -398,13 +398,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << op->Type() - << "): context wait and get last error"; -#endif -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 51c9e3d66a6f00..8df8db35592bb3 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include
"paddle/fluid/platform/timer.h" namespace paddle { @@ -45,7 +45,7 @@ class ProfilerGuard { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); cost_info_->device_memory_bytes = - platform::RecordedCudaMallocSize(cuda_place.device); + platform::RecordedGpuMallocSize(cuda_place.device); #endif } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d60fdd90e2a2a4..4236fcf8dc1343 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1212,14 +1212,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, /*For profiling/benchmark only*/ if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); #endif -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; -#endif } if (FLAGS_check_nan_inf) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 78774f04896389..18d0ee78ffbbc7 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 1d5e638729361d..eb8a1e4cea9fbd 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -22,7 +22,7 @@ #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include #include "paddle/fluid/operators/conv_cudnn_op_cache.h" @@ -30,8 +30,8 @@ #endif #ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/nccl_helper.h" // NOLINT +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index ae7ae85207d849..9a9b90cd811790 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -23,15 +23,15 @@ #ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif #ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/nccl_helper.h" // NOLINT
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT #include "paddle/fluid/operators/miopen_rnn_cache.h" diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index b922811b4f1045..31da214fbc39ab 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -28,8 +28,8 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { @@ -64,7 +64,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, dst->Resize(src.dims()); auto *dst_ptr = dst->mutable_data(src.place(), src.type()); auto nccl_dtype = platform::ToNCCLDataType(src.type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( src_ptr, dst_ptr, src.numel(), nccl_dtype, ncclSum, comm->comm(), stream)); } @@ -100,16 +100,12 @@ static void AllReduce(const framework::SelectedRows &src, if (!use_calc_stream) { dev_ctx->Wait(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( gpu_rows_num_ptr + strategy.local_rank_, gpu_rows_num_ptr, 1, ncclInt64, comm->comm(), stream)); if (!use_calc_stream) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } const auto *cpu_rows_num_ptr = rows_num_vector.data(); @@ -146,11 +142,11 @@ static void AllReduce(const framework::SelectedRows &src, // allgather is used to speed up the allreduce by replacing broadcast. auto row_sendcount = cpu_rows_num_ptr[0]; VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_rows_ptr, dst_rows_ptr, row_sendcount, ncclInt64, comm->comm(), stream)); auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, comm->comm(), stream)); return; @@ -158,13 +154,13 @@ static void AllReduce(const framework::SelectedRows &src, for (int i = 0; i < strategy.nranks_; ++i) { if (cpu_rows_num_ptr[i] > 0) { // 2. Broadcast the rows of SelectedRows - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBroadcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], ncclInt64, i, comm->comm(), stream)); // 3. 
Broadcast the tensor data of SelectedRows auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + row_offset * feature_size * sizeof_dtype; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBroadcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, nccl_dtype, i, comm->comm(), stream)); row_offset += cpu_rows_num_ptr[i]; @@ -209,12 +205,8 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, AllReduce(src.Get(), tmp_dst.GetMutable(), strategy, stream, comm); -// stream must synchronize to ensure accuracy of the move operation -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + // stream must synchronize to ensure accuracy of the move operation + platform::GpuStreamSync(stream); *dst = std::move(tmp_dst); } #endif diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 32becda4edc95a..0eb06983f409b1 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -153,11 +153,11 @@ void NCCLParallelContext::WaitCompute(int ring_id) { // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); #endif } @@ -179,11 +179,11 @@ void NCCLParallelContext::WaitComm(int ring_id) { // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); #endif } diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 1eee393aa714bb..1938fa08312f61 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -18,7 +18,7 @@ #include #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/cuda_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif #ifdef PADDLE_WITH_NCCL diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8e61b7d2eed880..8875ef74bce14e 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -24,6 +24,8 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + DECLARE_bool(check_nan_inf); 
DECLARE_bool(run_pten_kernel); DECLARE_bool(benchmark); @@ -523,12 +525,8 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); - VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; -#endif -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index ceca7e8146a790..49c4b8d7372e27 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -19,8 +19,8 @@ #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/table_printer.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #ifdef PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/helper.h" diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b1408995fa157b..2293b702468532 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -41,8 +41,8 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/pten/api/ext/op_meta_info.h" diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index 57b5167337e252..d27f20a93b3a4b 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -27,7 +27,7 @@ using paddle::PaddleDType; void* TensorUtils::CudaMallocPinnedMemory(size_t size) { #if defined(PADDLE_WITH_CUDA) void* ptr = nullptr; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMallocHost(&ptr, size)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMallocHost(&ptr, size)); return ptr; #else return nullptr; @@ -36,7 +36,7 @@ void* TensorUtils::CudaMallocPinnedMemory(size_t size) { void TensorUtils::CudaFreePinnedMemory(void* ptr) { #if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(ptr)); #endif } diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index d9cf9e2e860018..b468518fa5a3cb 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -45,7 +45,7 @@ class DefaultIOConverter : public EngineIOConverter { "the input max_size. 
But in's memory_size = %u, max_size = %u.", size, max_size)); if (is_cpu_place(place)) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( out, in.data(), size, cudaMemcpyHostToDevice, *stream_)); } else if (is_gpu_place(place)) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 64116b7973e710..2addff52829c80 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu index 0f32183c0fbc15..70e5a7bcc7b4f2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/deformable_conv_op_plugin.cu @@ -43,16 +43,16 @@ nvinfer1::Weights DeformableConvPlugin::copyToDevice(const void* hostData, size_t count) { int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 4 : 2); void* deviceData; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&deviceData, count * num_bytes)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy( - deviceData, hostData, count * num_bytes, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(&deviceData, count * num_bytes)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(deviceData, hostData, count * num_bytes, + cudaMemcpyHostToDevice)); return nvinfer1::Weights{data_type_, deviceData, int64_t(count)}; } void DeformableConvPlugin::serializeFromDevice( void** hostBuffer, const nvinfer1::Weights& deviceWeights) const { int num_bytes = (data_type_ == nvinfer1::DataType::kFLOAT ? 
4 : 2); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(static_cast(*hostBuffer), deviceWeights.values, deviceWeights.count * num_bytes, cudaMemcpyDeviceToHost)); hostBuffer += deviceWeights.count * num_bytes; diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index a9a50543e7bb70..a4880a9997a539 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu index 88e075386d0935..7cab12b625d231 100644 --- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu @@ -33,31 +33,31 @@ void Ltgemm_int8_linear( cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* alpha_one, void* workspace, cudaStream_t stream) { if (transA_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescT, alpha_one, A, Adesc, alpha_zero, nullptr, nullptr, Atransform, AtransformDesc, stream)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, A, Adesc, alpha_zero, nullptr, nullptr, Atransform, AtransformDesc, stream)); } if (transB_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, B, Bdesc, alpha_zero, nullptr, nullptr, Btransform, BtransformDesc, stream)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescT, alpha_one, B, Bdesc, alpha_zero, nullptr, nullptr, Btransform, BtransformDesc, stream)); } - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, Atransform, AtransformDesc, Btransform, BtransformDesc, nullptr, Ctransform, CtransformDesc, Ctransform, CtransformDesc, nullptr, workspace, 0, stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransform( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransform( ltHandle, transformDescN, alpha_one, Ctransform, CtransformDesc, alpha_zero, nullptr, nullptr, C, Cdesc, stream)); } @@ -69,7 +69,7 @@ void Ltgemm_fp32_linear(cublasLtHandle_t ltHandle, const float* A, cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); } @@ -81,7 +81,7 @@ void Ltgemm_fp16_linear(cublasLtHandle_t ltHandle, const half* A, cublasLtMatmulDesc_t matmulDesc, void* alpha_scale, void* alpha_zero, void* workspace, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmul( + 
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmul( ltHandle, matmulDesc, alpha_scale, A, Adesc, B, Bdesc, alpha_zero, C, Cdesc, C, Cdesc, nullptr, workspace, 0, stream)); } @@ -182,98 +182,98 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, int const ldatransform = 32 * n_; int const ldbtransform = 32 * ((m_ + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Atransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Btransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Ctransform_, sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc_, cudadataTypeIO, n_, k_, ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc_, cudadataTypeIO, m_, k_, ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, sizeof(COL4_4R2_8C))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc_, cudadataTypeIO, n_, m_, ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc_, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); cublasOperation_t Transpose = CUBLAS_OP_T; cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE; - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( + 
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescT_, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, sizeof(Transpose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescCreate( &transformDescN_, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN_, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); @@ -282,20 +282,20 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, sizeof(ATranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, sizeof(BTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -303,17 +303,16 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, for (int i = 0; i < n_; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, n_ * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_ * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), 
cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { @@ -324,70 +323,69 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + 
PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); half alpha_tem = static_cast(alpha_); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast(zero); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { @@ -398,71 +396,70 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc_, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n_ : k_, AopTranspose == CUBLAS_OP_N ? k_ : n_, AopTranspose == CUBLAS_OP_N ? n_ : k_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc_, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k_ : m_, BopTranspose == CUBLAS_OP_N ? m_ : k_, BopTranspose == CUBLAS_OP_N ? 
k_ : m_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc_, cudadataTypeIO, n_, m_, n_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch_), sizeof(batch_))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc_, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc_, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc_, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); float alpha_tem = alpha_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -613,13 +610,13 @@ void MatmulPluginDynamic::configurePlugin( int const ldatransform = 32 * n_max; int const ldbtransform = 32 * ((m_max + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_max; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Atransform_, sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Btransform_, sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldbtransform)); - 
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( (void**)&Ctransform_, sizeof(int8_t) * ((m_max + 32 - 1) / 32 * 32) / 32 * ldctransform)); @@ -628,38 +625,35 @@ void MatmulPluginDynamic::configurePlugin( for (int i = 0; i < n_max; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, n_max * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_max * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { half alpha_tem = static_cast(alpha_); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast(zero); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { float alpha_tem = alpha_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMalloc((void**)&alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -766,88 +760,88 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, cublasLtOrder_t COL32 = CUBLASLT_ORDER_COL32; cublasLtOrder_t COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C; - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? 
k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &AtransformDesc, cudadataTypeIO, n, k, ldatransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &BtransformDesc, cudadataTypeIO, m, k, ldbtransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL4_4R2_8C, sizeof(COL4_4R2_8C))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &CtransformDesc, cudadataTypeIO, n, m, ldctransform)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &COL32, sizeof(COL32))); cublasOperation_t Transpose = CUBLAS_OP_T; cublasLtPointerMode_t transform_model = CUBLASLT_POINTER_MODE_DEVICE; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescT, cudaDataTypeS)); - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA, &Transpose, sizeof(Transpose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescT, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixTransformDescCreate(&transformDescN, cudaDataTypeS)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE, &cudaDataTypeS, sizeof(cudaDataTypeS))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixTransformDescSetAttribute( transformDescN, CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE, &transform_model, sizeof(transform_model))); @@ -856,20 +850,20 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &ATranspose, sizeof(ATranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BTranspose, sizeof(BTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -889,60 +883,60 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_16F; #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? 
n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); @@ -959,60 +953,60 @@ int MatmulPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, #else cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F_FAST_16F; #endif - 
PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Adesc, cudadataTypeIO, AopTranspose == CUBLAS_OP_N ? n : k, AopTranspose == CUBLAS_OP_N ? k : n, AopTranspose == CUBLAS_OP_N ? n : k)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridea), sizeof(stridea))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( &Bdesc, cudadataTypeIO, BopTranspose == CUBLAS_OP_N ? k : m, BopTranspose == CUBLAS_OP_N ? m : k, BopTranspose == CUBLAS_OP_N ? k : m)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(strideb), sizeof(strideb))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatrixLayoutCreate(&Cdesc, cudadataTypeIO, n, m, n)); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &cudadataTypeIO, sizeof(cudadataTypeIO))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &(batch), sizeof(batch))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutSetAttribute( Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &(stridec), sizeof(stridec))); cublasLtPointerMode_t matmul_model = CUBLASLT_POINTER_MODE_DEVICE; #if CUBLAS_VER_MAJOR < 11 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dyl::cublasLtMatmulDescCreate(&matmulDesc, cudaComputeType)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescCreate( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescCreate( &matmulDesc, cudaComputeType, cudaDataTypeS)); #endif - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &AopTranspose, sizeof(AopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &BopTranspose, sizeof(BopTranspose))); - PADDLE_ENFORCE_CUDA_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( + PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatmulDescSetAttribute( matmulDesc, 
CUBLASLT_MATMUL_DESC_POINTER_MODE, &matmul_model, sizeof(matmul_model))); diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 091680ff672d0e..ec4fcca6d74d0c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -136,7 +136,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, float const* input_ptr = reinterpret_cast(inputs[0]); float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*), cudaMemcpyHostToDevice, stream)); @@ -263,7 +263,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, float* const* h_odatas = reinterpret_cast(outputs); float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*), cudaMemcpyHostToDevice, stream)); @@ -279,7 +279,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, half* const* h_odatas = reinterpret_cast(outputs); half** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(half*), cudaMemcpyHostToDevice, stream)); diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 86666950bc36e6..c330867607f8e4 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -85,7 +85,7 @@ bool TRTInt8Calibrator::setBatch( engine_name_, it.first)); } const auto& d = dataptr->second; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice)); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index b7b238bd0bf534..8314a1df931cac 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -30,13 +30,10 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #ifdef PADDLE_WITH_CUDA -#include -#include "paddle/fluid/platform/cuda_graph.h" -#else -#include +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #endif #if CUDA_VERSION >= 10020 @@ -145,8 +142,7 @@ class AllocatorFacadePrivate { "naive_best_fit strategy"; FLAGS_use_stream_safe_cuda_allocator = false; } - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); @@ -172,13 +168,13 @@ class AllocatorFacadePrivate { if (FLAGS_use_stream_safe_cuda_allocator) { // TODO(Ruibiao): Support multi-stream allocator for other strategies default_stream_ = nullptr; - for (int dev_id 
= 0; dev_id < platform::GetCUDADeviceCount(); + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), default_stream_); } } else { - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), allow_free_idle_chunk_); @@ -208,8 +204,7 @@ class AllocatorFacadePrivate { FLAGS_use_stream_safe_cuda_allocator = false; } - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); @@ -399,10 +394,10 @@ class AllocatorFacadePrivate { CUdevice device; int val; try { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGetAttribute( &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device)); @@ -476,10 +471,10 @@ class AllocatorFacadePrivate { CUdevice device; int val; try { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuDeviceGetAttribute( &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device)); @@ -599,7 +594,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int i = 0; i < device_count; ++i) { platform::CUDAPlace p(i); system_allocators_[p] = std::make_shared(p); @@ -612,7 +607,7 @@ class AllocatorFacadePrivate { std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 4cd8b4e91e614e..0d9f1043d9e86a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -19,7 +19,7 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index 193ef5a0cb922d..4469673b305bfe 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -19,7 +19,7 @@ #include // NOLINT #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
DECLARE_double(fraction_of_gpu_memory_to_use); diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index b1a45afa99d9a5..4242083f2e617a 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -25,8 +25,8 @@ #include #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { @@ -37,8 +37,8 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) { BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. This may be a bug")); - platform::RecordedCudaFree(allocation->ptr(), allocation->size(), - place_.device); + platform::RecordedGpuFree(allocation->ptr(), allocation->size(), + place_.device); delete allocation; } @@ -46,13 +46,13 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; - auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device); + auto result = platform::RecordedGpuMalloc(&ptr, size, place_.device); if (LIKELY(result == gpuSuccess)) { return new Allocation(ptr, size, platform::Place(place_)); } size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); size_t allocated = total - avail; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 3d6f1d7bcbea64..9e04fd3f0619e3 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -81,10 +81,10 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); #endif } @@ -93,9 +93,9 @@ class CUDADeviceContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } } @@ -111,12 +111,11 @@ class CUDADeviceContextAllocator : public Allocator { new CUDADeviceContextAllocation(memory::Alloc(place_, size)); // Wait for the event on stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, 
default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); #endif return allocation; } diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index e3780f2f11359c..f4baca8288f03c 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -23,8 +23,8 @@ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/cuda_driver.h" -#include "paddle/fluid/platform/gpu_info.h" #endif #if CUDA_VERSION >= 10020 @@ -49,10 +49,10 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // Prepare the access descriptor array indicating where and how the backings // should be visible. - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { if (place.device != dev_id) { int capable = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceCanAccessPeer(&capable, place.device, dev_id)); if (!capable) { VLOG(1) << "device(" << place.device @@ -73,10 +73,10 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // Get the minimum granularity needed for all devices // (the max of the minimum granularity of each participating device) granularity_ = 0; - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { size_t granularity; prop.location.id = dev_id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cuMemGetAllocationGranularity( &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); granularity_ = std::max(granularity, granularity_); @@ -84,7 +84,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( size_t actual_avail, actual_total; paddle::platform::CUDADeviceGuard guard(place.device); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); virtual_mem_size_ = AlignedSize(actual_total, granularity_); @@ -93,7 +93,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( // GPU, // so the virtual address space size we reserve is equal to the GPU video // memory size - PADDLE_ENFORCE_CUDA_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( &virtual_mem_base_, virtual_mem_size_, 0, 0, 0)); virtual_mem_alloced_offset_ = 0; @@ -123,11 +123,11 @@ void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { auto result = paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second); if (result != CUDA_ERROR_DEINITIALIZED) { - PADDLE_ENFORCE_CUDA_SUCCESS(result); + PADDLE_ENFORCE_GPU_SUCCESS(result); } if (result != CUDA_ERROR_DEINITIALIZED) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::RecordedCuMemRelease( + PADDLE_ENFORCE_GPU_SUCCESS(platform::RecordedGpuMemRelease( iter->second.first, iter->second.second, place_.device)); } @@ -166,12 +166,12 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { // Create physical memory backing allocation. 
auto result = - platform::RecordedCuMemCreate(&handle, size, &prop_, 0, place_.device); + platform::RecordedGpuMemCreate(&handle, size, &prop_, 0, place_.device); if (result != CUDA_SUCCESS) { if (result == CUDA_ERROR_OUT_OF_MEMORY) { size_t actual_avail, actual_total; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); size_t actual_allocated = actual_total - actual_avail; PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -186,7 +186,7 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { string::HumanReadableSize(actual_allocated), string::HumanReadableSize(actual_avail), place_.device)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(result); + PADDLE_ENFORCE_GPU_SUCCESS(result); } return nullptr; } @@ -197,8 +197,8 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0); if (result != CUDA_SUCCESS) { - platform::RecordedCuMemRelease(handle, size, place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(result); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); return nullptr; } @@ -208,8 +208,8 @@ Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { if (result != CUDA_SUCCESS) { paddle::platform::dynload::cuMemUnmap(ptr, size); - platform::RecordedCuMemRelease(handle, size, place_.device); - PADDLE_ENFORCE_CUDA_SUCCESS(result); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); return nullptr; } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 3bdd856759dc11..6de32335c62b22 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,8 +20,8 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5aa0514432844c..c56a7235c109ca 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,18 +20,18 @@ namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostFree(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif delete allocation; } Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif return new Allocation(ptr, size, 
platform::CUDAPinnedPlace()); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index b2e13af6ef956e..d11240bc844870 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -112,13 +112,13 @@ void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( for (gpuStream_t stream : *recorded_streams) { gpuEvent_t event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); #endif outstanding_events->emplace_back(event); VLOG(9) << "Record event " << event << " in stream " << stream; @@ -162,8 +162,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { outstanding_events.erase(outstanding_events.begin(), deque_it); break; } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(*deque_it)); #else gpuError_t err = hipEventQuery(*deque_it); if (err == hipErrorNotReady) { @@ -173,8 +173,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { outstanding_events.erase(outstanding_events.begin(), deque_it); break; } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it)); + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(*deque_it)); #endif ++deque_it; } diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 654fb3fe7bc044..c55f579981b005 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -20,7 +20,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 88dbec2bcfd0c1..b7be895b358308 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -25,8 +25,8 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 8b3d776cef2102..cd152843553a9f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 75b93088e55028..b300f936f7a683 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -27,9 +27,9 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" @@ -115,7 +115,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { if (size <= 0) return nullptr; void* p; - auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_); + auto result = platform::RecordedGpuMalloc(&p, size, gpu_id_); if (result == gpuSuccess) { *index = 0; @@ -123,7 +123,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { return p; } else { size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); size_t allocated = total - avail; @@ -166,7 +166,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { size, gpu_alloc_size_)); gpu_alloc_size_ -= size; - platform::RecordedCudaFree(p, size, gpu_id_); + platform::RecordedGpuFree(p, size, gpu_id_); } bool GPUAllocator::UseGpu() const { return true; } diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index ead188341dac46..bb7f47f9d30ec4 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif DECLARE_bool(use_pinned_memory); @@ -77,11 +80,7 @@ TEST(GPUAllocator, AllocFailure) { allocator.Alloc(&index, alloc_size); ASSERT_TRUE(false); } catch (paddle::memory::allocation::BadAlloc&) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); -#endif + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::GpuGetLastError()); } } #endif diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index c630437224cd09..7d2d2526ab1245 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 76a880755e21b8..837c964e2ad32c 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/place.h" // This unit test is an example comparing the performance between using pinned diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 6a5818fd9603be..a0293e8410c586 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -26,7 +26,7 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { @@ -53,9 +53,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t i = 1; i < stream_num_; ++i) { gpuStream_t stream; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); #endif streams_.emplace_back(stream); } @@ -65,10 +65,10 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { std::shared_ptr allocation = AllocShared(place_, allocation_size, streams_[i]); #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemset(allocation->ptr(), 0, allocation->size())); #endif allocations_.emplace_back(allocation); @@ -111,13 +111,13 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { // tricky code, the allocations are still accessible even though // allocations_.clear() has been called #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(host_x.get(), allocations_[i]->ptr(), data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpy(host_x.get(), allocations_[i]->ptr(), - data_num_ * sizeof(int), hipMemcpyDeviceToHost)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), + hipMemcpyDeviceToHost)); #endif for (int j = 0; j < data_num_; ++j) { EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); @@ -127,9 +127,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { void TearDown() override { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif for (gpuStream_t stream : streams_) { Release(place_, stream); @@ -137,14 +137,14 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t i = 1; i < stream_num_; ++i) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); + 
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); #endif } uint64_t cuda_malloc_size = - platform::RecordedCudaMallocSize(place_.GetDeviceId()); + platform::RecordedGpuMallocSize(place_.GetDeviceId()); ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size << " bytes memory that not released yet," << " there may be a memory leak problem"; @@ -192,11 +192,11 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream1)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream2)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream1)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream2)); #endif size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size @@ -216,9 +216,9 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { allocation2.reset(); #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif Release(place, stream1); diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc index 38499783eb492a..2ad92e36272b30 100644 --- a/paddle/fluid/operators/activation_cudnn.cu.cc +++ b/paddle/fluid/operators/activation_cudnn.cu.cc @@ -14,11 +14,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index b197d3511f96b9..2776fe9c131329 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -14,11 +14,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace platform { @@ -64,13 +60,13 @@ struct CudnnActivationFunctor { x_desc.set(x); out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation")); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationForward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), out->mutable_data(ctx_.GetPlace()))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnActivationForward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), 
x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), @@ -108,14 +104,14 @@ struct CudnnActivationGradFunctor { dout_desc.set(dout); dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad")); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), dx_desc.desc(), dx->mutable_data(ctx_.GetPlace()))); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 0294bfd5b05d5a..07cf516c476e89 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 5fa1e18553bd53..cf4041f721af25 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -23,7 +23,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index b8ce52387b9592..31801b14564d3e 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel { const T* output_grad_data = output_grad->data(); T* theta_grad_data = theta_grad->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSpatialTfGridGeneratorBackward( handle, cudnn_st_desc, output_grad_data, theta_grad_data)); } diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 3125e005174de2..d1da11028c05c5 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -18,12 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index 58b56bdcf5614e..bcf7deefc98f03 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/affine_grid_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index f50d5e619ebea7..6236a07de4bc61 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -26,8 +26,8 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/argsort_op.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #ifdef __HIPCC__ namespace rocprim { @@ -169,7 +169,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); Tensor temp_storage; temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); @@ -188,7 +188,7 @@ void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, cu_stream); } - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); } template diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 2796a6b2239b98..3bffe0a05a8f75 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/average_accumulates_op.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index b686c766e0f8b9..c326929a146809 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index b4cf9c48df2a80..e3dc54e17cd7fd 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -197,18 +197,18 @@ class BatchNormKernel // miopenTensorDescriptor_t bn_param_desc_; // miopenBatchNormMode_t mode_; -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); #endif @@ -251,23 +251,22 @@ class BatchNormKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, // x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), // const_cast(strides.data()))); // Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor( // bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, + test_mode ? 
CUDNN_BATCHNORM_SPATIAL : mode_)); #endif const auto *scale = ctx.Input("Scale"); @@ -341,7 +340,7 @@ class BatchNormKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationForwardInference( // handle, miopenBNSpatial, // const_cast( @@ -364,7 +363,7 @@ class BatchNormKernel // est_var->template data>())), // epsilon)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( handle, // Note: PERSISTENT not implemented for inference @@ -426,7 +425,7 @@ class BatchNormKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -440,7 +439,7 @@ class BatchNormKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, @@ -454,7 +453,7 @@ class BatchNormKernel ctx.GetPlace(), transformed_x.type(), reserve_space_size); workspace_ptr = workspace_tensor.mutable_data( ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, @@ -508,7 +507,7 @@ class BatchNormKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationForwardTraining( // handle, mode_, const_cast(static_cast( // CudnnDataType::kOne())), @@ -537,7 +536,7 @@ class BatchNormKernel // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, @@ -568,15 +567,15 @@ class BatchNormKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN // clean when exit. -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #endif } @@ -981,18 +980,18 @@ class BatchNormGradKernel // miopenTensorDescriptor_t bn_param_desc_; // miopenBatchNormMode_t mode_; -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); #endif if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { @@ -1022,18 +1021,18 @@ class BatchNormGradKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( // data_desc_, CudnnDataType::type, // x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), // const_cast(strides.data()))); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, // data_desc_, mode_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, data_desc_, mode_)); #endif @@ -1063,7 +1062,7 @@ class BatchNormGradKernel Tensor workspace_tensor; auto reserve_space_size = reserve_space->memory_size(); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), @@ -1081,7 +1080,7 @@ class BatchNormGradKernel workspace_ptr = workspace_tensor.mutable_data( ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -1151,7 +1150,7 @@ class BatchNormGradKernel } // TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenBatchNormalizationBackward( // dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), // CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -1166,7 +1165,7 @@ class BatchNormGradKernel // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -1231,15 +1230,15 @@ class BatchNormGradKernel #ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN // clean when exit. -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_CUDA_SUCCESS( +// PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); #endif } else { diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 8bd2b7fe2d127c..73f73a81c088eb 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/bce_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 3c64ed1acc847d..3fd8995745acb4 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -12,8 +12,8 @@ #include #include #include "paddle/fluid/operators/bilateral_slice_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu index 757f7286291069..34facf1ea1fa90 100644 --- a/paddle/fluid/operators/bincount_op.cu +++ b/paddle/fluid/operators/bincount_op.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index bb4246e3e9b845..6b393b5666bb29 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index f15d1fe5e02ac2..549bb5ae75affe 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/center_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu index 4426057305249b..0bfddf8b5f386e 100644 --- a/paddle/fluid/operators/cholesky_op.cu +++ b/paddle/fluid/operators/cholesky_op.cu @@ -131,27 +131,26 @@ class CholeskyGPUKernel : public framework::OpKernel { int lda, int* info) const { \ auto handle = dev_ctx.cusolver_dn_handle(); \ int workspace_size = 0; \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ + PADDLE_ENFORCE_GPU_SUCCESS( \ platform::dynload::cusolverDn##C##potrf_bufferSize( \ handle, uplo, n, A, lda, &workspace_size)); \ auto workspace = memory::Alloc(dev_ctx, workspace_size); \ T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ } FUNC_WITH_TYPES(POTRF_INSTANCE); #if CUDA_VERSION >= 9020 && !defined(_WIN32) -#define POTRF_BATCH_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::PotrfBatched( \ - const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ - int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ - platform::dynload::cusolverDn##C##potrfBatched( \ - handle, uplo, n, Aarray, lda, info_array, batch_size)); \ +#define POTRF_BATCH_INSTANCE(T, C) \ + template <> \ + void CholeskyGPUKernel::PotrfBatched( \ + const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ + int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ + auto handle = dev_ctx.cusolver_dn_handle(); \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrfBatched( \ + handle, uplo, n, Aarray, lda, info_array, batch_size)); \ } FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); diff --git a/paddle/fluid/operators/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn_launch_op.cu.cc index d557cfc7c08927..fae2d6ddb487d9 100644 --- a/paddle/fluid/operators/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn_launch_op.cu.cc @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/type_defs.h" #ifdef PADDLE_WITH_CUDA #include @@ -45,9 +45,9 @@ void CUDART_CB ReleaseBuffers(void* data) { template <> void ReleaseResource( const std::vector& resources, void* stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( static_cast(stream), ReleaseScope, resources[0])); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaLaunchHostFunc( + PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( static_cast(stream), ReleaseBuffers, resources[1])); } diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index cfcfd04e6fc7c2..29286be0dd6b20 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -30,7 +30,7 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -335,7 +335,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { static_cast( platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) ->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, @@ -346,13 +346,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // step 2: Determine temporary device storage requirements int num_buffer_ele = std::max(batch_size, num_classes); size_t cub_sort_temp_store_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( nullptr, cub_sort_temp_store_size, nullptr, nullptr, nullptr, nullptr, num_buffer_ele, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); size_t cub_sum_temp_store_size = 0; NotEqualToPreviousAdjacentIterator unique_counting_iter_temp(nullptr, 0); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceScan::InclusiveSum, T*>( nullptr, cub_sum_temp_store_size, unique_counting_iter_temp, @@ -360,7 +360,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { size_t cub_scan_temp_store_size = 0; ActualNumSampledFunctor actual_num_sampled_op_temp(num_samples); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveScan( nullptr, cub_scan_temp_store_size, num_classes_per_device_ptr, num_classes_per_device_ptr, actual_num_sampled_op_temp, nranks + 1, ctx.cuda_device_context().stream()))); @@ -384,7 +384,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { void* cub_temp_storage_ptr = memory_buffer.cub_temp_storage_ptr(); // step 4: Calculate class interval among nranks - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum( cub_temp_storage_ptr, cub_temp_storage_bytes, num_classes_per_device_ptr, class_interval_ptr, nranks + 1, ctx.cuda_device_context().stream()))); @@ -415,13 
+415,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // step 7: sort class center by ascending, so that positive class center // always be sampled. - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( cub_temp_storage_ptr, cub_temp_storage_bytes, cub_sort_keys_ptr, cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_values_out_ptr, num_classes, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); // step 8: sort input label ascending - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceRadixSort::SortPairs( cub_temp_storage_ptr, cub_temp_storage_bytes, label->data(), cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_keys_ptr, batch_size, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); @@ -430,8 +430,8 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // label NotEqualToPreviousAdjacentIterator unique_counting_iter( cub_sort_keys_out_ptr, 0); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum< - NotEqualToPreviousAdjacentIterator, T*>( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum< + NotEqualToPreviousAdjacentIterator, T*>( cub_temp_storage_ptr, cub_temp_storage_bytes, unique_counting_iter, cub_sort_values_ptr, batch_size, ctx.cuda_device_context().stream()))); @@ -445,13 +445,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { // Since maybe num_positive_class_center > num_samples, // we need to ensure all positive class center per device are sampled. ActualNumSampledFunctor actual_num_sampled_op(num_samples); - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveScan( cub_temp_storage_ptr, cub_temp_storage_bytes, bound_value_ptr, num_classes_per_device_ptr, actual_num_sampled_op, nranks + 1, ctx.cuda_device_context().stream()))); // step 12: Calculate actual sampled class interval among nranks - PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + PADDLE_ENFORCE_GPU_SUCCESS((cub::DeviceScan::InclusiveSum( cub_temp_storage_ptr, cub_temp_storage_bytes, num_classes_per_device_ptr, class_interval_ptr, nranks + 1, ctx.cuda_device_context().stream()))); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 157924f08546bf..4e6d86d49e8632 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,15 +69,11 @@ class AllReduceOpKernel : public framework::OpKernel { red_type = ncclMin; break; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, static_cast(dtype), red_type, comm, stream)); if (ctx.Attr("sync_mode")) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 1bcb47fc686cfe..02b10f17da5a3d 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -62,15 +62,15 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); size_t offset = 0; send_numel /= nranks; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < nranks; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); offset += send_numel; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); #else PADDLE_THROW( platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index b8631b44f14caa..c9aef237699f3d 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -41,13 +41,9 @@ class BarrierOpCUDAKernel : public framework::OpKernel { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto stream = static_cast(dev_ctx)->stream(); ncclRedOp_t nccl_red_type = ncclSum; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::Unavailable( "PaddlePaddle should compile with NCCL.")); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index fa4d7ee4cce5d1..daaaf8b7a2e410 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace ops = paddle::operators; @@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { auto comm = dev_ctx.nccl_comm(); auto stream = dev_ctx.stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, static_cast(in->numel()), platform::ToNCCLDataType(in->type()), root_dev_id, comm, stream)); @@ -62,11 +62,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { << " From " << root_dev_id << " to " << dev_id; if (ctx.Attr("sync_mode")) { -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 597e4321d66bdb..f174473c049ece 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -56,7 +56,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 6d569b454e6916..714dc4e19f9b13 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -29,7 +29,7 @@ limitations under the License. */ #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -386,7 +386,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { "Invalid reduce type: %d", red_type)); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index b37bd250c15583..6deb837069761d 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -46,7 +46,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { int root = ctx.Attr("root"); if (root == comm->rank()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent " @@ -59,7 +59,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { static_cast(out)); } } else { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index 60a9b1ee44fcc2..db9a8428e3d033 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/threadpool.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index aee10dcdc27323..f69fe8f1e3f1fa 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -26,7 +26,7 @@ limitations under the License. */ // #include "paddle/fluid/operators/distributed/request_handler_impl.h" #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index bfdc49c440aae7..738ed162861317 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -19,7 +19,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -71,7 +71,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 858ca79f85b0e6..9b343b34a3e51f 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/collective/c_embedding_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 0a0a824b775866..d392beb3a48345 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -30,7 +30,7 @@ namespace operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i])); } } diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 74f41bff9dc865..b950339bd22be0 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -30,7 +30,7 @@ limitations under the License. 
*/ #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -316,7 +316,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { "kRedMax, kRedMin, kRedProd.")); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 4d19ee42641f4d..141fa760413b35 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -57,7 +57,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( send_buff, recv_buff, recv_numel, static_cast(dtype), ncclSum, comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index 0c9dc2af14f394..4d4dc0c12af55c 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -66,7 +66,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { framework::Tensor temp; auto out_ptr = temp.mutable_data(out_dims, place); if (root_id == comm->rank()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, root_id, comm->comm(), stream)); @@ -74,7 +74,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { *platform::DeviceContextPool::Instance().Get(place), static_cast(&temp)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out_ptr, numel, dtype, root_id, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 77db86e7111112..6371d523cfa4a2 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { @@ -119,7 +119,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { Eigen::DSizes along_axis(1); eigen_logits_max.device(*dev_ctx.eigen_device()) = eigen_logits.maximum(along_axis); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), stream)); @@ -160,7 +160,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { } void* predict_logits_buff = predicted_logits.mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( predict_logits_buff, predict_logits_buff, predicted_logits.numel(), platform::ToNCCLDataType(predicted_logits.type()), ncclSum, comm->comm(), stream)); @@ -178,7 +178,7 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) = eigen_softmax.sum(along_axis); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 034accbb480c78..a8c4eafede41ba 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 72faf4298cf601..72339bbd487527 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -55,11 +55,7 @@ class CSyncCalcStreamKernel : public framework::OpKernel { auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif + platform::GpuStreamSync(dev_ctx->stream()); #elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 03894b24a913b4..21bad096c2d493 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif #if defined(PADDLE_WITH_ASCEND_CL) @@ -67,11 +67,7 @@ class CSyncCommStreamKernel : public framework::OpKernel { auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #elif defined(PADDLE_WITH_ASCEND_CL) PADDLE_ENFORCE_EQ(is_npu_place(place), true, diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index d0dfc3bb1c2e5f..dfa4dcd0fac59b 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -54,11 +54,11 @@ class CWaitCommOp : public framework::OperatorBase { // comm_stream-->event-->compute_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); #endif #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 12a28040ef1c5b..e038617bf3d6a9 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -57,11 +57,11 @@ class CWaitComputeOp : public framework::OperatorBase { // compute_stream-->event-->comm_stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); #endif #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 99a92469e8502b..7a5b6b5f429b2e 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -20,9 +20,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -37,7 +37,7 @@ namespace operators { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclGetUniqueId(&(*nccl_ids)[i])); } } diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 70b5d0244d3852..e2ff823420aefd 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -103,24 +103,24 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(send_buf + send_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); send_ptr += cpu_global_count_data[idx]; } if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); } } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } #else PADDLE_THROW( diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index bec984c6b57e19..c47d27366c5f27 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -102,24 +102,24 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { auto recv_buf = out->mutable_data(out_dims, place); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(send_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); } if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(recv_buf + recv_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, j, comm->comm(), stream)); recv_ptr += cpu_global_count_data[idx]; } } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } #else diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 8c32f8c41bbf25..094847beca214b 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -67,7 +67,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 49eafa5c7c4f53..d59c062a31b8c8 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -80,7 +80,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { int recv_numel = numel / num; int offset = recv_numel * id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclRecv(out->data() + offset, recv_numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 2463f208746ed6..8a4f7f750a15b3 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -74,7 +74,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { int send_numel = numel / num; int offset = send_numel * id; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data() + offset, send_numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << send_numel << " from offset[" << offset << "] to " << peer; diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index df94fee5223c6c..18d6af4c2aaa11 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,7 +69,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto out_dims = out->dims(); out->mutable_data(out_dims, place, 0); auto numel = out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << framework::product(out_dims) << " from " << peer; @@ -83,7 +83,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto numel = out->numel(); out->mutable_data(out_dims, place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << framework::product(out->dims()) << " from " << peer; diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index dc28910e9ec9cb..952fcf2065d596 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -62,7 +62,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto& x = x_array.at(idx); int numel = x.numel(); ncclDataType_t dtype = platform::ToNCCLDataType(x.type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x.data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << framework::product(x.dims()) << " to " << peer; @@ -73,7 +73,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { int numel = x->numel(); ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << framework::product(x->dims()) << " to " << peer; diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index dec0e789776a43..55bd4879ab7947 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -27,7 +27,7 @@ class OpBase; } // namespace imperative } // namespace paddle #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif namespace paddle { @@ -35,7 +35,7 @@ namespace operators { static size_t CUDADevCount() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return platform::GetCUDADeviceCount(); + return platform::GetGPUDeviceCount(); #else return 0UL; #endif diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index f4183bf570926d..a783a619473ef2 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -25,7 +25,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + namespace paddle { namespace operators { @@ -98,7 +99,7 @@ std::ostream& operator<<(std::ostream& out, const std::vector& v) { inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { int max_algos = 0; #if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnn_handle, &max_algos)); #endif @@ -176,22 +177,22 @@ static void SetConvMathType(const framework::ExecutionContext& ctx, #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dev_ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_DEFAULT_MATH)); VLOG(5) << "NOT use cudnn_tensor_op_math"; } @@ -245,7 +246,7 @@ struct SearchAlgorithm { int perf_count; int best_algo_idx = 0; std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, @@ -264,7 +265,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), @@ -273,7 +274,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), @@ -306,7 +307,7 @@ struct SearchAlgorithm { std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -332,7 +333,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { size_t 
workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size)); @@ -362,7 +363,7 @@ struct SearchAlgorithm { int best_algo_idx = 0; std::unique_ptr perf_results( new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, @@ -395,7 +396,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), @@ -404,7 +405,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), @@ -435,7 +436,7 @@ struct SearchAlgorithm { std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.w->data(), @@ -464,7 +465,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); @@ -496,7 +497,7 @@ struct SearchAlgorithm { int best_algo_idx = 0; std::unique_ptr perf_results( new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, @@ -515,7 +516,7 @@ struct SearchAlgorithm { "the workspace size request(" << workspace_size << ") exceeds the limit(" << workspace_size_limit << ")"; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), @@ -524,7 +525,7 @@ struct SearchAlgorithm { #endif } #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), @@ -553,7 +554,7 @@ struct SearchAlgorithm { int returned_algo_count; std::array perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.x->data(), @@ -584,7 +585,7 @@ struct SearchAlgorithm { algo_t chosen_algo; std::vector perf_results(max_algos); int actual_algos = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnFindConvolutionBackwardFilterAlgorithm( args.handle, 
args.idesc.desc(), args.odesc.desc(), @@ -605,7 +606,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 275e81fc7f33a0..566e99c357fbed 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -261,9 +261,8 @@ class CUDNNConvOpKernel : public framework::OpKernel { // cudnn 7 can support groups, no need to do it manually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), - groups)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); groups = 1; #endif #ifdef PADDLE_WITH_HIP @@ -328,7 +327,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP workspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args.idesc.desc(), input_data, args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, @@ -340,7 +339,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args.idesc.desc(), input_data + i * group_offset_in, args.wdesc.desc(), @@ -718,7 +717,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), output_grad_data, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), @@ -726,7 +725,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_workspace_ptr, workspace_size)); }, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), transformed_input_grad_data, &alpha, args1.idesc.desc(), temp_tensor_data, &beta, args1.idesc.desc(), @@ -734,7 +733,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), output_grad_data, args1.wdesc.desc(), filter_data, args1.cdesc.desc(), @@ -749,7 +748,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args1.wdesc.desc(), filter_data + i * group_offset_filter, args1.odesc.desc(), @@ -796,7 +795,7 @@ class CUDNNConvGradOpKernel : public 
framework::OpKernel { #ifdef PADDLE_WITH_HIP workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args2.odesc.desc(), output_grad_data, args2.idesc.desc(), input_data, args2.cdesc.desc(), @@ -808,7 +807,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args2.idesc.desc(), input_data + i * group_offset_in, args2.odesc.desc(), @@ -1228,7 +1227,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args1.idesc.desc(), ddx, args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, @@ -1240,7 +1239,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args1.idesc.desc(), ddx + i * group_offset_in, args1.wdesc.desc(), @@ -1258,7 +1257,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { // MIOPEN ONLY support beta to be 0.0f wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), ddw, args2.cdesc.desc(), fwd_algo2, &beta, @@ -1270,7 +1269,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args2.idesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), @@ -1294,7 +1293,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args3.odesc.desc(), transformed_dy_channel, args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, @@ -1306,7 +1305,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args3.idesc.desc(), ddx + i * group_offset_in, args3.odesc.desc(), @@ -1325,7 +1324,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args4.odesc.desc(), transformed_dy_channel, args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, @@ -1337,7 +1336,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( 
platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args4.wdesc.desc(), ddw + i * group_offset_filter, args4.odesc.desc(), diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 23a471cfa00674..291e5f92f322cb 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -18,11 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/operator.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_uint64(conv_workspace_size_limit); DECLARE_bool(cudnn_exhaustive_search); diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index befe09c8e6beb3..9c9795143eb78d 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/miopen_desc.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -137,7 +137,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionForwardAlgorithm( args.handle, args.idesc.desc(), args.x->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -154,7 +154,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.odesc.desc(), &workspace_size)); @@ -179,7 +179,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( args.handle, args.odesc.desc(), args.o->data(), args.wdesc.desc(), args.w->data(), args.cdesc.desc(), @@ -196,7 +196,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.idesc.desc(), &workspace_size)); @@ -221,7 +221,7 @@ struct SearchAlgorithm { int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( args.handle, args.odesc.desc(), args.o->data(), args.idesc.desc(), args.x->data(), args.cdesc.desc(), @@ -238,7 +238,7 @@ struct SearchAlgorithm { static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), 
args.cdesc.desc(), args.wdesc.desc(), &workspace_size)); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 1610705c4694cb..41f6f75200697a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -20,13 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -222,7 +216,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { PADDLE_ENFORCE_GE( - platform::CudnnVersion(), 8100, + platform::DnnVersion(), 8100, platform::errors::InvalidArgument( "bfloat16 can only be used when CUDNN_VERSION >= 8100")); } diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 314d33310588ed..2289104d2dbfbf 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index c4cd5854c0f78a..19c0be44a1d0b7 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -265,7 +265,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args.odesc.desc(), input_data + input_offset * g, args.wdesc.desc(), @@ -275,7 +275,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args.wdesc.desc(), filter_data + filter_offset * g, args.odesc.desc(), @@ -549,7 +549,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args1.idesc.desc(), output_grad_data + output_grad_offset * g, args1.wdesc.desc(), @@ -560,13 +560,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - output_grad_data + output_grad_offset * g, args1.wdesc.desc(), - filter_data + filter_offset * g, args1.cdesc.desc(), - data_algo, cudnn_workspace, workspace_size, &beta, - args1.odesc.desc(), input_grad_data + input_offset * g)); + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( + handle, &alpha, args1.idesc.desc(), + output_grad_data + output_grad_offset * g, args1.wdesc.desc(), + filter_data + filter_offset * g, args1.cdesc.desc(), data_algo, + cudnn_workspace, workspace_size, &beta, args1.odesc.desc(), + input_grad_data + input_offset * g)); }; #endif // PADDLE_WITH_HIP workspace_handle.RunFunc(cudnn_func, workspace_size); @@ -598,7 +597,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { for (int g = 0; g < groups; g++) { #ifdef PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args2.odesc.desc(), input_data + input_offset * g, args2.idesc.desc(), @@ -609,7 +608,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { }; #else // PADDLE_WITH_HIP auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args2.idesc.desc(), output_grad_data + output_grad_offset * g, args2.odesc.desc(), @@ -1054,7 +1053,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args1.odesc.desc(), ddx + i * group_offset_in, args1.wdesc.desc(), @@ -1067,7 +1066,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args1.wdesc.desc(), w + i * group_offset_filter, args1.odesc.desc(), @@ -1089,7 +1088,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardData( handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), @@ -1099,7 +1098,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { workspace_size)); }, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), transformed_ddy_channel + i * group_offset_out, &alpha, args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, @@ -1108,7 +1107,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( handle, &alpha, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.odesc.desc(), @@ -1152,7 +1151,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionBackwardWeights( handle, &alpha, args3.odesc.desc(), ddx + i * group_offset_in, args3.idesc.desc(), @@ -1165,7 +1164,7 @@ class 
CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, args3.idesc.desc(), transformed_dy_channel + i * group_offset_out, @@ -1185,7 +1184,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForward( handle, &alpha, args4.idesc.desc(), transformed_dy_channel + i * group_offset_out, @@ -1198,7 +1197,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionForward( handle, &alpha, args4.idesc.desc(), transformed_dy_channel + i * group_offset_out, diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index b7859237e737a1..5451cf815cae33 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/dynload/cudnn.h" namespace paddle { @@ -77,7 +77,7 @@ class ScopedRNNBase { // ------------------- cudnn dropout descriptors --------------------- size_t state_size; if (!initialized_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -86,7 +86,7 @@ class ScopedRNNBase { dropout_state, seed_, state_size); // ------------------- cudnn rnn descriptors --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, @@ -94,14 +94,14 @@ class ScopedRNNBase { #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif // ------------------- cudnn weights_size --------------------- size_t weights_size_; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); PADDLE_ENFORCE_EQ( weights_size_, sizeof(T) * weight_numel_, @@ -113,10 +113,10 @@ class ScopedRNNBase { std::vector dim_w = {dim_tmp, 1, 1}; weight_desc_.descriptor(layout, dim_w); // ------------------- cudnn workspace, reserve size --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 27f64b41948be9..6f696afa23886a 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -111,14 +111,14 @@ void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, // for inference // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, workspace_data->data(), workspace_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, @@ -129,7 +129,7 @@ void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for inference // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_seq_desc(), out_data, rnn->last_h_desc(), last_h_data, @@ -277,7 +277,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { // for train // This interface is used when the input/output is unpadded. 
#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -285,7 +285,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, reserve_data, reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -297,15 +297,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNForwardTrainingEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, - rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, - rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, - rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_data_.data(), workspace_size, - reserve_data, reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.init_h_desc(), + init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), + w_data, rnn.y_seq_desc(), out_data, rnn.last_h_desc(), last_h_data, + rnn.last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data_.data(), + workspace_size, reserve_data, reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( "The padded input is supported by " @@ -433,7 +431,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { if (!has_seq_length) { // This interface is used when the input/output is unpadded. 
#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -442,13 +440,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), rnn.weight_desc(), weight_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -457,7 +455,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), @@ -467,7 +465,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { #if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -477,7 +475,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_seq_desc(), out->data(), workspace_data_.data(), workspace_size, diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index a6a23a91c76c02..6c059257b94e8b 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -92,15 +92,15 @@ struct CudnnRNNCache { std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( x_desc_[i], cudnn_type, 3, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( y_desc_[i], cudnn_type, 3, dims_y.data(), strides_y.data())); } @@ -108,78 +108,78 @@ struct CudnnRNNCache { hidden_size_}; std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( hx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( cx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( hy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( cy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dhx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dcx_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dhy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( dcy_desc_, cudnn_type, 3, dims_hx.data(), strides_hx.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); size_t state_size; if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state_->Resize({static_cast(state_size)}); uint8_t *dropout_state_data = dropout_state_->mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, seed_)); } else { uint8_t *dropout_state_data = dropout_state_->data(); auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnRestoreDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type)); PADDLE_ENFORCE_EQ( @@ -191,14 +191,14 @@ struct CudnnRNNCache { dim_w[0] = weights_size_ / cudnn_size; dim_w[1] = 1; dim_w[2] = 1; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( w_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( dw_desc_, cudnn_type, CUDNN_TENSOR_NCHW, 3, dim_w)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_, seq_length_, x_desc_, reserve_size_)); @@ -208,40 +208,40 @@ struct CudnnRNNCache { void release() { for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); } delete[] x_desc_; delete[] y_desc_; - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); } }; diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu index d9e19eb7f61a6a..977e301f13663b 100644 --- a/paddle/fluid/operators/cumsum_op.cu +++ b/paddle/fluid/operators/cumsum_op.cu @@ -24,7 +24,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" using Tensor = paddle::framework::Tensor; using LoDTensor = paddle::framework::LoDTensor; diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 75976c968c9e8b..ad96dc24b9206c 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cvm_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 1043faa56f01bc..5d157a77b3dd16 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -176,23 +176,19 @@ class DataNormGradKernel if (need_sync_stats) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType(x->type()), ncclSum, comm->comm(), stream)); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU, and need_sync_stats connot be " diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 67f5ee332eeb2f..924adafa4b8d80 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -27,7 +27,7 @@ #include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu index e399a1fafdb71d..c252700528c492 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cu +++ b/paddle/fluid/operators/deformable_conv_v1_op.cu @@ -30,7 +30,7 @@ #include "paddle/fluid/operators/deformable_conv_v1_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index c1d4cc9d17ab4b..6489c1f9784cf3 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -32,7 +32,7 @@ #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include 
"paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 9f63f8ed6f5201..39f4fdb71b69dd 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dequantize_log_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 725983f8153e4f..6f5137be620110 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -18,15 +18,14 @@ limitations under the License. */ #include #ifdef __NVCC__ #include "cub/cub.cuh" -#include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef __HIPCC__ #include -#include "paddle/fluid/platform/miopen_helper.h" namespace cub = hipcub; #endif #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index e02f99a613c019..17013efcc98b7f 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 0693029eaea9c2..6e5fa1e2933533 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/box_coder_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 70767f1d7b1152..ed97559aa8bb56 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -11,7 +11,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ffd9ac6b2af806..bd5703022db900 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -26,7 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 7ccb354e1773a3..1df7dcbe670c05 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -26,7 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 5977a434a6023f..5ff479eac8df0e 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 7b34e197ffe214..2ddcc7a06f6797 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" using paddle::platform::PADDLE_CUDA_NUM_THREADS; diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index ed1676200dc470..10c402e5a4078a 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 83a0eb87d02dd5..23bd6af6bd2e80 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/diagonal_op.cu b/paddle/fluid/operators/diagonal_op.cu index e2b5f24d6619e1..b1268e903df190 100644 --- a/paddle/fluid/operators/diagonal_op.cu +++ b/paddle/fluid/operators/diagonal_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diagonal_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index bd4d690577a6fa..c97a523caa7673 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -167,14 +167,14 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #endif return; diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 80490af33a1f95..f28fa4d6338d70 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -17,8 +17,8 @@ limitations under the License.
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index d3ab8ad9d69858..ad5a55aede751e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/elementwise/elementwise_functor.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" // only can include the headers in paddle/pten/include dirs @@ -43,8 +43,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index ab45b6f4de276b..706475bc82fade 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -30,10 +30,9 @@ namespace operators { static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) { if (copy_to_gpu) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); #elif defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, n, hipMemcpyHostToDevice)); #else PADDLE_THROW( platform::errors::InvalidArgument("Check your paddle version, current " diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 8f2235c7e3d21f..b95bbc775a0d76 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index f7478364cdfc51..990ac8dbc81216 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -22,11 +22,7 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index 6b94f4ea5bdd2f..f2ce0bccd2fb57 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -15,9 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/conv_op.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index f5ee7f55991845..38326e7560c0df 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -18,11 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/math/padding.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -169,7 +165,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP miopenConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenSetConvolutionGroupCount(cudnn_conv_desc, groups)); // Now only support NCHW @@ -194,14 +190,14 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto f_dims = framework::vectorize(filter->dims()); size_t workspace_size = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, &workspace_size)); int find_count; miopenConvAlgoPerf_t find_result; auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenFindConvolutionForwardAlgorithm( handle, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, @@ -215,23 +211,23 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { { ScalingParamType alpha = 1.0f, beta = 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenConvolutionForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenConvolutionForward( handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc, output_data, cudnn_workspace, workspace_size)); }; workspace_handle.RunFunc(cudnn_func, workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenConvolutionForwardBias( handle, &alpha, cudnn_bias_desc, bias_data, &beta, cudnn_output_desc, output_data)); if (activation != "identity") { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenActivationForward( handle, cudnn_act_desc, &alpha, cudnn_output_desc, output_data, &beta, cudnn_output_desc, output_data)); } if (residual) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( handle, miopenTensorOpAdd, &alpha, cudnn_output_desc, output_data, &alpha, cudnn_output_desc, residual_data, &beta, cudnn_output_desc, output_data)); @@ -240,9 +236,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { #else // PADDLE_WITH_HIP cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionGroupCount(cudnn_conv_desc, - groups)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( layout, framework::vectorize(transformed_input.dims())); @@ -273,13 +268,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto handle = dev_ctx.cudnn_handle(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(cudnn_conv_desc, - CUDNN_FMA_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_FMA_MATH)); } #endif // CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 @@ -292,20 +286,20 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { size_t tmp_size = 0; std::unique_ptr perf_results( new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); if (workspace_size_in_bytes > workspace_size_limit) workspace_size_limit = workspace_size_in_bytes; #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -319,7 +313,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { std::array fwd_perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( handle, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, @@ -355,7 +349,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { VLOG(3) << "choose algo " << algo; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, 
cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); @@ -375,13 +369,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // ------------- cudnn conv forward and bias add --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnConvolutionForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnConvolutionForward( handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnAddTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnAddTensor( handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); } else { @@ -392,7 +386,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { ScalingParamType alpha1 = 1.0f; ScalingParamType alpha2 = residual ? 1.0f : 0.0f; auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBiasActivationForward( handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, filter_data, cudnn_conv_desc, algo, diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index dc703f9a822b5b..913772fb65050b 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 1de64cf5ad947d..13fad0b7cbb3d2 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -31,19 +31,19 @@ class CudnnFusionOp { public: explicit CudnnFusionOp(cudnnFusedOps_t op_id) : plan_created_(false) { // New 'fused op' descriptor creation - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id)); + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateFusedOpsConstParamPack(&op_const_params_, op_id)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack( &op_variant_params_, op_id)); } ~CudnnFusionOp() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); } // Execute fused op @@ -53,7 +53,7 @@ class CudnnFusionOp { platform::errors::Fatal( "CudnnFusionOp exec requested without a valid 'plan', need: " ", GetWorkspaceSizeBytes(), Execute().")); - PADDLE_ENFORCE_CUDA_SUCCESS( + 
PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnFusedOpsExecute(cudnn_handle, op_, op_variant_params_)); } @@ -61,9 +61,8 @@ class CudnnFusionOp { template void SetOpConstParamDesc(cudnnFusedOpsConstParamLabel_t param_label, T *param_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetFusedOpsConstParamPackAttribute( - op_const_params_, param_label, param_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFusedOpsConstParamPackAttribute( + op_const_params_, param_label, param_ptr)); plan_created_ = false; } @@ -81,9 +80,8 @@ class CudnnFusionOp { template void SetOpConstParamAttr(cudnnFusedOpsConstParamLabel_t param_label, T param) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnSetFusedOpsConstParamPackAttribute(op_const_params_, - param_label, ¶m)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFusedOpsConstParamPackAttribute( + op_const_params_, param_label, ¶m)); plan_created_ = false; } @@ -101,7 +99,7 @@ class CudnnFusionOp { template void SetOpVariantParamAttrPtr(cudnnFusedOpsVariantParamLabel_t param_label, T *param_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnSetFusedOpsVariantParamPackAttribute( op_variant_params_, param_label, param_ptr)); } @@ -120,7 +118,7 @@ class CudnnFusionOp { size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { if (!plan_created_) { workspace_bytes_ = 0U; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnMakeFusedOpsPlan( cudnn_handle, op_, op_const_params_, &workspace_bytes_)); plan_created_ = true; } diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 9b9328a5ca6208..c8871388dd4507 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -15,8 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -320,7 +319,7 @@ class CudnnNormConvolutionGrad { ScalingParamType beta = use_addto ? 1.0f : 0.0f; ctx.cudnn_workspace_handle().RunFunc( [&](void *cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBackwardData( cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr, args_.out_desc.desc(), output_grad_ptr, @@ -370,7 +369,7 @@ class CudnnNormConvolutionGrad { size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { size_t workspace_size = 0U; auto handle = ctx.cudnn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, args_.filter_desc.desc(), args_.out_desc.desc(), args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_, diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 5166ff27234f23..d0205208acc474 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -15,8 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 9f6d6e2270673d..173ef48b83dc2d 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 9339ae8e470de8..83328caf3844fc 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -22,7 +22,7 @@ #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -107,22 +107,21 @@ class FusedBatchNormActKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); VLOG(3) << "Setting descriptors."; std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); double this_factor = 1. 
- momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; @@ -144,7 +143,7 @@ class FusedBatchNormActKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -158,7 +157,7 @@ class FusedBatchNormActKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, @@ -171,7 +170,7 @@ class FusedBatchNormActKernel reserve_space_size); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x->template data(), @@ -190,9 +189,9 @@ class FusedBatchNormActKernel reserve_space_size)); // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -271,9 +270,9 @@ class FusedBatchNormActGradKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " @@ -282,12 +281,11 @@ class FusedBatchNormActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -305,7 +303,7 @@ class FusedBatchNormActGradKernel cudnnActivationDescriptor_t activation_desc_ = scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -322,7 +320,7 @@ class FusedBatchNormActGradKernel workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -358,9 +356,9 @@ class FusedBatchNormActGradKernel /*reserveSpaceSizeInBytes=*/reserve_space_size)); // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index c92b13b5f58473..7c124a0d6b6612 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -21,7 +21,7 @@ #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -87,20 +87,19 @@ class FusedBatchNormAddActKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); double this_factor = 1. - momentum; cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; @@ -122,7 +121,7 @@ class FusedBatchNormAddActKernel "The argument ReserveSpace of batch_norm op is not found.")); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload:: cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( /*handle=*/handle, @@ -136,7 +135,7 @@ class FusedBatchNormAddActKernel /*sizeInBytes=*/&workspace_size)); // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( /*handle=*/handle, /*mode=*/mode_, @@ -149,7 +148,7 @@ class FusedBatchNormAddActKernel reserve_space_size); workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, mode_, bnOps_, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x->template data(), @@ -169,9 +168,9 @@ class FusedBatchNormAddActKernel reserve_space_size)); // clean when exit. 
- PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; @@ -231,9 +230,9 @@ class FusedBatchNormAddActGradKernel cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { LOG(ERROR) << "Provided epsilon is smaller than " @@ -242,12 +241,11 @@ class FusedBatchNormAddActGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); @@ -265,7 +263,7 @@ class FusedBatchNormAddActGradKernel cudnnActivationDescriptor_t activation_desc_ = scope_act_desc.descriptor(act_type); // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -281,7 +279,7 @@ class FusedBatchNormAddActGradKernel workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), workspace_size); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackwardEx( /*handle=*/dev_ctx.cudnn_handle(), /*mode=*/mode_, @@ -315,9 +313,9 @@ class FusedBatchNormAddActGradKernel /*reserveSpaceSizeInBytes=*/reserve_space_size)); // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); } }; diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 049c37f1ea0c44..eb651e4ea7b4fc 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -23,10 +23,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { @@ -93,7 +93,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, template inline void SetZero(const platform::CUDADeviceContext &ctx, T *ptr, const size_t size) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); } diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index dc068e02be4ecf..c5b1fd93929504 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -22,7 +22,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index f257d3efa433e6..1827e137c15f18 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -169,7 +169,7 @@ void LaunchLayernormResidualDropoutBias( auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); memory::Copy(cuda_place, dst, cuda_place, residual, rows * cols * sizeof(T), ctx.stream()); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( mask_data, 0, rows * cols * sizeof(MaskType), ctx.stream())); // call layernorm forward diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc index ea1e9512ca519f..eeeb004003c9c4 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index b3796f1df5fdf2..44312be7973985 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" DECLARE_uint64(conv_workspace_size_limit); @@ -95,15 +95,15 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { cudnnConvolutionDescriptor_t* conv_desc = new cudnnConvolutionDescriptor_t[4]; for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); } @@ -127,11 +127,11 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { for (int i = 0; i < 4; ++i) { filter_dims.push_back(framework::vectorize(filters[i]->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetFilterNdDescriptor( filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); bias_dims.push_back({1, filter_dims[i][0], 1, 1}); bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(), bias_strides[i].data())); in_dims.push_back({n, filter_dims[i][1], h, w}); @@ -140,22 +140,21 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { out_strides.push_back({oc * h * w, h * w, w, 1}); if (i < 2) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionNdDescriptor( conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(), CUDNN_CROSS_CORRELATION, compute_type)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionNdDescriptor( conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(), CUDNN_CROSS_CORRELATION, compute_type)); } - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(conv_desc[i], - CUDNN_DEFAULT_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + conv_desc[i], CUDNN_DEFAULT_MATH)); #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(conv_desc[i], CUDNN_FMA_MATH)); } @@ -165,7 +164,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { in_strides[2][0] = oc * h * w; out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. 
in_strides[3][0] = filter_dims[2][0] * h * w; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); cudnnConvolutionFwdAlgo_t algo[4]; @@ -181,9 +180,9 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { } for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( out_desc[i], cudnn_dtype, 4, out_dims[i].data(), out_strides[i].data())); @@ -192,13 +191,13 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { size_t tmp_size = 0; std::unique_ptr perf_results( new cudnnConvolutionFwdAlgoPerf_t[kNUM_CUDNN_FWD_ALGS]); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], kNUM_CUDNN_FWD_ALGS, &perf_count, perf_results.get())); algo[i] = (perf_results.get())[best_algo_idx].algo; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], algo[i], &tmp_size)); @@ -215,7 +214,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { // branch1: pool + 1x1 conv ScalingParamType alpha = 1.0f, beta = 0.0f; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, pool_out_desc, temp_data)); @@ -237,7 +236,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { for (int i = 0; i < 4; ++i) { auto func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnConvolutionBiasActivationForward( handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], static_cast(filters[i]->data()), conv_desc[i], @@ -252,34 +251,34 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t x_desc; cudnnTensorDescriptor_t y_desc; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( handle, CudnnDataType::kOne(), x_desc, static_cast(out_datas[2]), CudnnDataType::kZero(), y_desc, static_cast(output_data + (oc0 + oc1) * h * w))); for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); - 
PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); } }; diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 37a442a7815716..1fa4225934d394 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace platform { @@ -50,9 +50,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { cudnnTensorDescriptor_t in_desc; cudnnTensorDescriptor_t out_desc; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); cudnnDataType_t cudnn_dtype = CudnnDataType::type; @@ -92,12 +92,12 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { dims_y[i] = 1; } - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( in_desc, cudnn_dtype, max_dim, dims_y.data(), stride_x.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( out_desc, cudnn_dtype, max_dim, dims_y.data(), stride_y.data())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnTransformTensor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnTransformTensor( handle, CudnnDataType::kOne(), in_desc, static_cast(ins[k]->data()), CudnnDataType::kZero(), out_desc, static_cast(odata))); @@ -108,9 +108,9 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { odata += flat_shape[1]; } } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_desc)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(out_desc)); } }; diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 05af4ff150f399..700de8074ff8a6 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu index d9f56ec4dc0388..6e5e203e2d9434 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index d2002b487ca337..080dadeacaae71 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. */ // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace framework { @@ -70,7 +70,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerForward( handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, input_data, grid_data, CudnnDataType::kZero(), cudnn_output_desc, output_data)); @@ -123,13 +123,12 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { output_grad_desc.descriptor( DataLayout::kNCHW, framework::vectorize(output_grad->dims())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSpatialTfSamplerBackward( - handle, cudnn_st_dest, CudnnDataType::kOne(), cudnn_input_desc, - input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, - input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, - output_grad_data, grid_data, CudnnDataType::kZero(), - grid_grad_data)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSpatialTfSamplerBackward( + handle, cudnn_st_dest, CudnnDataType::kOne(), cudnn_input_desc, + input_data, CudnnDataType::kZero(), cudnn_input_grad_desc, + input_grad_data, CudnnDataType::kOne(), cudnn_output_grad_desc, + output_grad_data, grid_data, CudnnDataType::kZero(), + grid_grad_data)); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 0b410f07fcb576..04aa6a3e10f6e3 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -17,12 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 762d14096a5ab4..8e9f445f3b1169 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index e029c84090af19..055fd791af5a3e 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -21,8 +21,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/group_norm_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index 6a9183a8b465b7..b9419cbcc57b58 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/histogram_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 46dd91fed6cbc1..40a968b8a397d5 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -15,8 +15,8 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_sample_op.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu index 2353781daaa399..acf959896f9499 100644 --- a/paddle/fluid/operators/index_select_op.cu +++ b/paddle/fluid/operators/index_select_op.cu @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -110,22 +110,14 @@ class IndexSelectCUDAKernel : public framework::OpKernel { (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, numel, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } else { const int* index_data = index->data(); index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( in_data, out_data, index_data, numel, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } }; @@ -181,11 +173,7 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel { PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, index_data, index_nums, out_nums, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } else { const int* index_data = index->data(); index_select_grad_cuda_kernel<<< @@ -193,11 +181,7 @@ class IndexSelectGradCUDAKernel : public framework::OpKernel { PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, index_data, index_nums, out_nums, stride, size, delta); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif + platform::GpuStreamSync(stream); } } }; diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu index affd0b7e1edd70..e0401366693b1b 100644 --- 
a/paddle/fluid/operators/instance_norm_op.cu +++ b/paddle/fluid/operators/instance_norm_op.cu @@ -26,12 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/instance_norm_op.h" #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -114,17 +109,17 @@ class InstanceNormKernel miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); #endif if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { @@ -143,20 +138,19 @@ class InstanceNormKernel auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); #endif const auto *scale = ctx.Input("Scale"); @@ -202,7 +196,7 @@ class InstanceNormKernel functor(dev_ctx, saved_variance, static_cast>(0)); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenBatchNormalizationForwardTraining( handle, miopenBNSpatial, const_cast( @@ -225,12 +219,12 @@ class InstanceNormKernel saved_variance->template mutable_data>( ctx.GetPlace())))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, x_tmp.template data(), @@ -243,9 +237,9 @@ class InstanceNormKernel saved_variance->template mutable_data>( ctx.GetPlace()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif } @@ -396,17 +390,17 @@ class InstanceNormGradKernel miopenTensorDescriptor_t data_desc_; miopenTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); #endif @@ -418,20 +412,19 @@ class InstanceNormGradKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); #endif const auto *saved_mean = ctx.Input("SavedMean"); @@ -442,7 +435,7 @@ class InstanceNormGradKernel saved_var->template data>(); if (d_scale && d_bias) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenBatchNormalizationBackward( dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), @@ -456,7 +449,7 @@ class InstanceNormGradKernel ctx.GetPlace()), epsilon, saved_mean_data, saved_var_data)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), @@ -487,14 +480,14 @@ class InstanceNormGradKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif } diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 6be7dbdc110d52..3c857eb326ace4 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -12,8 +12,8 @@ #include #include #include "paddle/fluid/operators/interpolate_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index fe9228135606dc..bc1ab704aafe3a 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -13,9 +13,9 @@ #include #include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 73316d66b6cf26..2320b9e0b2fbf4 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -21,7 +21,7 @@ #include #endif -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/float16.h" 
namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 4280c86ca99ab8..3656bd1a181671 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -23,13 +23,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu index a4f0693323297c..4bf2a7cb372cb7 100644 --- a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index ec9f5dd95d4d0f..5d2a1683d381b4 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/tensor_utils.h" diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 7c47ad90502ebd..6676cde1cafcab 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -16,7 +16,7 @@ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/operators/math/functors.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 3edea025b2a044..5aa546cbcc21ae 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 493966ecda7bda..317f9eeb94f39a 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_v2_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 7c5e64d2afa46a..1deaa3ef1ee7c6 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -30,7 +30,7 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif namespace paddle { @@ -69,7 +69,7 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place, platform::DeviceContextPool::Instance().Get(place)) ->stream(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, @@ -314,7 +314,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), stream)); @@ -335,7 +335,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), stream)); @@ -368,7 +368,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( loss_ptr, loss_ptr, loss->numel(), platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(), stream)); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index ed3ead47d171ef..0cc552d34c5872 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 70c6cf9dcab036..92162e639ff860 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -17,7 +17,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -32,33 +32,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasScopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasScopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -82,7 +82,7 @@ struct CUBlas { VLOG(5) << "use_tensor_op_math: " << (dev_ctx->tensor_core_available() ? "True" : "False"); dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc)); }); @@ -94,36 +94,33 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasSmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasSgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasSgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasStrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasStrsmBatched(args...)); } }; @@ -131,33 +128,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDaxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDaxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -173,36 +170,33 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDtrsm(args...)); } template static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetrfBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetrfBatched(args...)); } template static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetriBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetriBatched(args...)); } template static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasDmatinvBatched(args...)); } template static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cublasDgetrsBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDgetrsBatched(args...)); } template static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDtrsmBatched(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasDtrsmBatched(args...)); } }; @@ -215,7 +209,7 @@ struct CUBlas { const float16 *alpha, const float16 *A, int lda, const float16 *B, int ldb, const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasHgemm(handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -235,7 +229,7 @@ struct CUBlas { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasHgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -270,7 +264,7 @@ struct CUBlas { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -289,7 +283,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, reinterpret_cast(B), ldb, @@ -301,7 +295,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -320,7 +314,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -340,7 +334,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -355,7 +349,7 @@ struct CUBlas> { const paddle::platform::complex *alpha, const paddle::platform::complex *A, int lda, paddle::platform::complex *B, int ldb) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCtrsm( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -384,7 +378,7 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -401,7 +395,7 @@ struct CUBlas> { const paddle::platform::complex **A, int lda, paddle::platform::complex **B, int ldb, int 
batch_size) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasCtrsmBatched( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -417,7 +411,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, reinterpret_cast(B), ldb, @@ -429,7 +423,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -448,7 +442,7 @@ struct CUBlas> { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemmStridedBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -468,7 +462,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -483,7 +477,7 @@ struct CUBlas> { const paddle::platform::complex *alpha, const paddle::platform::complex *A, int lda, paddle::platform::complex *B, int ldb) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZtrsm( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -497,7 +491,7 @@ struct CUBlas> { const paddle::platform::complex **A, int lda, paddle::platform::complex **B, int ldb, int batch_size) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZtrsmBatched( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasZtrsmBatched( handle, side, uplo, transa, diag, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -526,7 +520,7 @@ struct CUBlas> { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -842,7 +836,7 @@ void Blas::BatchedGEMM( auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( handle, cuTransB, cuTransA, N, M, K, &alpha, B, fp, ldb, strideB, A, fp, lda, strideA, &beta, C, fp, ldc, strideC, batchCount, fp, algo)); }); diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index f972d38adda5fb..32479189eea58d 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -15,8 +15,8 @@ #pragma once #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" -#include "paddle/fluid/platform/gpu_info.h" DECLARE_bool(enable_cublas_tensor_op_math); @@ -31,32 +31,32 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_saxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_saxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_scopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_scopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_sgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_sgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_sgemm_strided_batched(args...)); } @@ -70,7 +70,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_strsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_strsm(args...)); } template @@ -102,32 +102,32 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_daxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_daxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_dgemm_strided_batched(args...)); } @@ -139,7 +139,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_dtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_dtrsm(args...)); } template @@ -176,7 +176,7 @@ struct CUBlas { const float16 *alpha, const float16 *A, int lda, const float16 *B, int ldb, const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_hgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_hgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -195,14 +195,13 @@ struct CUBlas { const float16 *beta, float16 *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_hgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_hgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, batchCount)); } // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. @@ -217,7 +216,7 @@ struct CUBlas { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); @@ -232,7 +231,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -245,7 +244,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_caxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_caxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -263,15 +262,14 @@ struct CUBlas> { platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_cgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, + batchCount)); } static void GEMM(rocblas_handle handle, rocblas_operation transa, @@ -281,7 +279,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const 
platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_cgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -302,7 +300,7 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); @@ -317,7 +315,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -330,7 +328,7 @@ struct CUBlas> { const platform::complex *alpha, const platform::complex *X, const int incX, platform::complex *Y, const int incY) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, reinterpret_cast(Y), incY)); @@ -348,15 +346,14 @@ struct CUBlas> { platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::rocblas_zgemm_strided_batched( - handle, transa, transb, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(A), lda, strideA, - reinterpret_cast(B), ldb, strideB, - reinterpret_cast(beta), - reinterpret_cast(C), ldc, strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemm_strided_batched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), + reinterpret_cast(C), ldc, strideC, + batchCount)); } static void GEMM(rocblas_handle handle, rocblas_operation transa, @@ -366,7 +363,7 @@ struct CUBlas> { const platform::complex *B, int ldb, const platform::complex *beta, platform::complex *C, int ldc) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemm( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_zgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -387,7 +384,7 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_gemm_ex( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, C, Ctype, ldc, computeType, algo, 0, 0)); }); diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 614ae93d9fa826..32bb479e00517e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 537c7e47155fe9..56ba145da1cade 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 55662e1d0aad7a..3e80e40f3577c3 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 6da1bfb964f24f..6ff2ddaa338df9 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -23,8 +23,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h index 62c45f4dc098ba..75d4809a462cb7 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h index 24885d37020dc9..851a62dbe9a48d 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 01f05530e34e64..b24f5d40e8dca7 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -184,14 +184,12 @@ struct MatrixEighFunctor { values_stride >= 32 && values_stride <= 512); syevjInfo_t syevj_params; if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(&syevj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cusolverDnSsyevj_bufferSize( - dev_ctx.cusolver_dn_handle(), jobz, uplo, n, - reinterpret_cast(input_vector), lda, - reinterpret_cast(out_value), &lwork, - syevj_params)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( + dev_ctx.cusolver_dn_handle(), jobz, uplo, n, + reinterpret_cast(input_vector), lda, + reinterpret_cast(out_value), &lwork, syevj_params)); } else { EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, n, input_vector, lda, out_value, &lwork); @@ -203,7 +201,7 @@ struct MatrixEighFunctor { auto *value_data = out_value + i * values_stride; auto handle = dev_ctx.cusolver_dn_handle(); if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( handle, jobz, uplo, n, reinterpret_cast(input_data), lda, reinterpret_cast(value_data), reinterpret_cast(work_ptr), lwork, info_ptr, @@ -220,7 +218,7 @@ struct MatrixEighFunctor { } if (use_syevj) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(syevj_params)); } if (has_vectors) { @@ -255,7 +253,7 @@ struct MatrixEighFunctor { cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, \ int *lwork) const { \ - PADDLE_ENFORCE_CUDA_SUCCESS( \ + PADDLE_ENFORCE_GPU_SUCCESS( \ platform::dynload::cusolverDn##C##evd_bufferSize( \ handle, jobz, uplo, n, reinterpret_cast(A), lda, \ W, lwork)); \ @@ -269,7 +267,7 @@ FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); cusolverDnHandle_t handle, cusolverEigMode_t jobz, \ cublasFillMode_t uplo, int n, T *A, int lda, ValueType *W, T *work, \ int lwork, int *devInfo) const { \ - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDn##C##evd( \ + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##evd( \ handle, jobz, uplo, n, reinterpret_cast(A), lda, W, \ reinterpret_cast(work), lwork, devInfo)); \ } diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 3eadaa2677ab4f..f616e116d0aee7 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 71080bf424a012..54a37db1df71a1 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -42,7 +42,7 @@ static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, void *temp_storage = nullptr; size_t temp_storage_bytes = 0; for (size_t i = 0; i < 2; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceScan::InclusiveScan( + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::InclusiveScan( temp_storage, temp_storage_bytes, x_iter, y_iter, op, static_cast(n), // Maybe overflow? dev_ctx.stream())); diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 8b134a29d81cf6..1856fb4eb48c73 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 84a970a9a26067..076d3aa3361f0f 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -16,10 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" -#include "paddle/fluid/platform/gpu_launch_config.h" #ifdef __HIPCC__ #define POOLING_BLOCK_SIZE 256 diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index dc1e3c1c3ded10..70aae2ba59e2ca 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -16,11 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 446acc033eb7ff..f596c1bc3dcf38 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -144,13 +144,13 @@ void GPUSampleWithProb::operator()( VLOG(1) << "num_tries: " << num_tries; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(samples_data + num_true, s_data, - sizeof(int64_t) * num_samples, - hipMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + hipMemcpyHostToDevice)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, - sizeof(int64_t) * num_samples, - cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + cudaMemcpyHostToDevice)); #endif int threads = 512; diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index b49b5036ac42e2..67cf3162460073 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index f3ef537a31b44c..0e04c37ed2b12c 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index cba8dd935ef1b3..b3e1922e106574 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 5578f1f0138c42..1807c77e37ca16 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence_scale.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 9e9fe5b9c1020d..bc32e068f566d2 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -16,11 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -54,7 +50,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, Y->mutable_data(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE, @@ -64,7 +60,7 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, @@ -97,7 +93,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), CudnnDataType::kZero(), cudnn_xgrad_desc, @@ -110,7 +106,7 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), diff --git 
a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index ad23892f37903a..dbb3d64350caea 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index d83b5b0fe3afb3..d9c757544a9c6a 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index d85a262b5e910a..757c780b4ea53e 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -162,9 +162,9 @@ void MatrixRankGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); @@ -173,7 +173,7 @@ void MatrixRankGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -186,7 +186,7 @@ void MatrixRankGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -203,9 +203,9 @@ void MatrixRankGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); @@ -214,7 +214,7 @@ void MatrixRankGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? 
k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -228,7 +228,7 @@ void MatrixRankGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -247,14 +247,14 @@ void MatrixRankGPUKernel::SyevjBatched( int stride_A = lda * n; int lwork = 0; syevjInfo_t params = NULL; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); @@ -268,7 +268,7 @@ void MatrixRankGPUKernel::SyevjBatched( "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(params)); } @@ -285,15 +285,15 @@ void MatrixRankGPUKernel::SyevjBatched( int stride_A = lda * n; int lwork = 0; syevjInfo_t params = NULL; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDsyevj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, lwork, info, params)); int error_info; @@ -306,7 +306,7 @@ void MatrixRankGPUKernel::SyevjBatched( "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroySyevjInfo(params)); } diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 7098a720cc3a03..79aff52a16fa97 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/mean_iou_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 430036bc67de70..1a10b7033f69e9 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -19,7 +19,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/mean_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -65,14 +65,14 @@ class MeanCUDAKernel : public framework::OpKernel { auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), context.GetPlace()); err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); } }; diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu index 3d22fc60993c7b..6f19100fa9d37e 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ b/paddle/fluid/operators/metrics/accuracy_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu index 40609381c17aee..1cb7eba8775e81 100644 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ b/paddle/fluid/operators/metrics/auc_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h index a357e6e5af6af0..c307218baa406e 100644 --- a/paddle/fluid/operators/miopen_lstm_cache.h +++ b/paddle/fluid/operators/miopen_lstm_cache.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -66,7 +66,7 @@ class ScopedRNNBase { // ------------------- miopen dropout descriptors --------------------- size_t state_size; if (!initialized_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -75,7 +75,7 @@ class ScopedRNNBase { dropout_state, seed_, state_size); // ------------------- miopen rnn descriptors --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, miopenLSTM, @@ -83,7 +83,7 @@ class ScopedRNNBase { // ------------------- miopen weights_size --------------------- size_t weights_size_; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, miopen_type)); PADDLE_ENFORCE_EQ( weights_size_, sizeof(T) * weight_numel_, @@ -95,10 +95,10 @@ class ScopedRNNBase { std::vector dim_w = {dim_tmp, 1, 1}; weight_desc_.descriptor(layout, dim_w); // ------------------- miopen workspace, reserve size --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); diff --git a/paddle/fluid/operators/miopen_rnn_cache.h b/paddle/fluid/operators/miopen_rnn_cache.h index 97d608331ccb51..38cea39abd5ded 100644 --- a/paddle/fluid/operators/miopen_rnn_cache.h +++ b/paddle/fluid/operators/miopen_rnn_cache.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -95,16 +95,16 @@ struct CudnnRNNCache { std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&y_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( x_desc_[i], miopen_type, 3, const_cast(dims.data()), const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( y_desc_[i], miopen_type, 3, const_cast(dims_y.data()), const_cast(strides_y.data()))); } @@ -113,85 +113,85 @@ struct CudnnRNNCache { hidden_size_}; std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( hx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( cx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( hy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( cy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dhx_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dcx_desc_, miopen_type, 3, 
const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dhy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dcy_desc_, miopen_type, 3, const_cast(dims_hx.data()), const_cast(strides_hx.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateDropoutDescriptor(&dropout_desc_)); size_t state_size; if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state_->Resize({static_cast(state_size)}); uint8_t *dropout_state_data = dropout_state_->mutable_data(place); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, seed_, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } else { uint8_t *dropout_state_data = dropout_state_->data(); auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenRestoreDropoutDescriptor( dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, 0, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateRNNDescriptor(&rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor( rnn_desc_, hidden_size_, num_layers_, miopenRNNlinear, is_bidirec_ ? 
miopenRNNbidirection : miopenRNNunidirection, miopenLSTM, miopenRNNNoBias, miopenRNNdefault, miopen_type)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenCreateTensorDescriptor(&dw_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_, x_desc_[0], &weights_size_, miopen_type)); PADDLE_ENFORCE_EQ( @@ -208,14 +208,14 @@ struct CudnnRNNCache { dim_s[1] = 1; dim_s[0] = dim_w[1]; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( w_desc_, miopen_type, 3, dim_w, dim_s)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( dw_desc_, miopen_type, 3, dim_w, dim_s)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_, seq_length_, x_desc_, &workspace_size_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_, seq_length_, x_desc_, reserve_size_)); @@ -225,40 +225,40 @@ struct CudnnRNNCache { void release() { for (size_t i = 0; i < seq_length_; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(x_desc_[i])); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(y_desc_[i])); } delete[] x_desc_; delete[] y_desc_; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(hx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(cx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(hy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(cy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dhx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dcx_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dhy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dcy_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyDropoutDescriptor(dropout_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyRNNDescriptor(rnn_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(w_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDestroyTensorDescriptor(dw_desc_)); } }; diff --git a/paddle/fluid/operators/mish_op.cu b/paddle/fluid/operators/mish_op.cu index 6513e5d95e4acc..4ca07b650c80a6 100644 --- a/paddle/fluid/operators/mish_op.cu +++ b/paddle/fluid/operators/mish_op.cu @@ -11,8 +11,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mish_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu index ee638ede22b640..cec17f13243134 100644 --- a/paddle/fluid/operators/mv_op.cu +++ b/paddle/fluid/operators/mv_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mv_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index 169af47e95acdc..bcbc96ea1b6d10 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -50,7 +50,7 @@ void Communicator::InitAll(const std::vector& gpus) { for (size_t i = 0; i < gpus.size(); ++i) { (*comm_id_map)[gpus[i]] = i; } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data())); inited = true; } diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 9a4a036077f587..f319ce159f6dd0 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -74,7 +74,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { VLOG(3) << "gpu : " << " invoke allreduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, reduction_op_, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -111,7 +111,7 @@ class NCCLReduceKernel : public framework::OpKernel { } VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, reduction_op_, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -136,7 +136,7 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); @@ -145,7 +145,7 @@ class NCCLBcastKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. 
recv buffer " << framework::product(out->dims()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, comm->comms().at(idx), ctx.cuda_device_context().stream())); diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 6c7fba8d4ac789..41c1b4d7a8f815 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -23,9 +23,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" @@ -44,7 +44,7 @@ const f::DDim kDims = {20, 20}; class NCCLTester : public ::testing::Test { public: void SetUp() override { - int count = p::GetCUDADeviceCount(); + int count = p::GetGPUDeviceCount(); if (count <= 0) { LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is " << count; diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu index b6e7cd256e18d4..03af45634149df 100644 --- a/paddle/fluid/operators/nll_loss_op.cu +++ b/paddle/fluid/operators/nll_loss_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/nll_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 843736833f8156..241c634e3fc98a 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -26,11 +26,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/math/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index bffd1d5305127c..3da7a3afcc93dc 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu index 2366f1422244e3..22eb6c81845d15 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 5043468d4c5f72..8b939b7c6b3ba2 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a5d9ad271f23a4..3582e939f30ac7 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index f1516320ec5733..23e37ea27b54f7 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -445,12 +445,12 @@ class SparseMomentumOpKernel : public framework::OpKernel { for_range_index(range_functor); size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, nullptr, nullptr, nullptr, nullptr, static_cast(num_index)))); auto d_temp_storage = memory::Alloc(ctx.GetPlace(), temp_storage_bytes); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( (cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, index->data(), sorted_index_ptr, sort_value_ptr, grad_index_ptr, diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index a77d0a5650ef32..a854fa6091ab4c 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index f243a78e5578bb..1567251236550d 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 8fcd40a9a2df40..bbe31740129478 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -16,14 +16,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pool_op.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif #ifdef PADDLE_WITH_HIP #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/miopen_helper.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -204,17 +201,17 @@ class PoolCUDNNOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP char *pool_workspace; size_t pool_worksize = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenPoolingGetWorkSpaceSizeV2( cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( handle, cudnn_pool_desc, &alpha, cudnn_input_desc, tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data)); @@ -468,17 +465,17 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP char *pool_workspace; size_t pool_worksize = 0; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenPoolingGetWorkSpaceSizeV2( cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenPoolingBackward( + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnPoolingBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, &beta, cudnn_input_desc, input_grad_data)); diff 
--git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 9d8f086ce0f187..fa98e76e39338f 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,12 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" #include -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu index ce3f5969cef49d..06cc9ed7a96e53 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/prelu_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/prelu.h" #include "paddle/fluid/operators/prelu_op.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index f9e2b78d5d31a2..38f8d6542ac32c 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index f69edfc1fcfec9..5a0d1a700417cb 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index 5bde6bc2e5cbbd..26a02ea622479f 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/pull_box_extended_sparse_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu index 8bba9db5426b70..96a1b1c08b79c2 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_sparse_op.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/fluid/operators/pull_box_sparse_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 992df172ace0c7..3eb5f72b5b117e 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -167,7 +167,7 @@ void QrGPUKernel::BatchedGeqrf( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -178,7 +178,7 @@ void QrGPUKernel::BatchedGeqrf( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgeqrf( handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? @@ -201,7 +201,7 @@ void QrGPUKernel::BatchedGeqrf( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -212,7 +212,7 @@ void QrGPUKernel::BatchedGeqrf( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgeqrf( handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? @@ -235,7 +235,7 @@ void QrGPUKernel::BatchedOrgqr( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); float* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -246,7 +246,7 @@ void QrGPUKernel::BatchedOrgqr( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; // compute orggr - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSorgqr( handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? 
@@ -270,7 +270,7 @@ void QrGPUKernel::BatchedOrgqr( int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); double* workspace_ptr = reinterpret_cast(workspace->ptr()); @@ -281,7 +281,7 @@ void QrGPUKernel::BatchedOrgqr( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; // compute orggr - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDorgqr( handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, info_d)); // Do we need synchronized here? diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu index 6250d68730e138..23a0f2d0a24e38 100644 --- a/paddle/fluid/operators/range_op.cu +++ b/paddle/fluid/operators/range_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/range_op.h" #include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index aaa4eec7c1bf32..23b4475e1f7c18 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 58af6309e3d28b..6c28daa7eac729 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -161,14 +161,14 @@ void BufferedReader::ReadAsync(size_t i) { platform::SetDeviceId( BOOST_GET_CONST(platform::CUDAPlace, place_).device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #endif @@ -199,19 +199,12 @@ void BufferedReader::ReadAsync(size_t i) { memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, cuda_pinned_place, cuda_pinned_ptr, size, stream_.get()); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_.get())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); -#endif + + platform::GpuStreamSync(stream_.get()); } cuda[i].set_lod(cpu[i].lod()); } -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_.get())); 
-#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_.get())); -#endif + platform::GpuStreamSync(stream_.get()); } } #endif diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index c433cac56a4310..3d42486c6df881 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -22,8 +22,8 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_resource_pool.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_info.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 6b3b4843200188..9c348477963b4e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,7 +36,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/fast_divmod.h" // Reduce split or not, Whether to use ReduceHigherDim @@ -464,9 +465,9 @@ struct ReduceConfig { reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); } int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_mp = platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; int num_threads = block_dim->x * block_dim->y; int max_num_blocks = max_threads / num_threads; @@ -506,9 +507,9 @@ struct ReduceConfig { left_num = last_dim_num; grid_dim->z = grid_z; int device_id = platform::GetCurrentDeviceId(); - int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_mp = platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; // init int num_block = (max_threads / left_num); diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index 07329a9175e525..de4847ddc45903 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -16,12 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -97,12 +92,12 @@ class RNNDescriptors { bool is_initialized = dropout_state->IsInitialized(); if (!is_test_ && !is_initialized) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); @@ -114,19 +109,19 @@ class RNNDescriptors { // ------------------- cudnn rnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, mode_, miopenRNNwithBias, miopenRNNdefault, cudnn_type)); #elif CUDNN_VERSION >= 6000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, @@ -135,7 +130,7 @@ class RNNDescriptors { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif @@ -143,10 +138,10 @@ class RNNDescriptors { // ------------------- cudnn weights_size --------------------- size_t weights_size_; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #endif PADDLE_ENFORCE_EQ( @@ -160,18 +155,18 @@ class RNNDescriptors { weight_desc_.descriptor(layout, dim_w); // ------------------- cudnn workspace, reserve size --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); @@ -557,7 +552,7 @@ class RNNCudnnKernel : public framework::OpKernel { // for train // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -565,7 +560,7 @@ class RNNCudnnKernel : public framework::OpKernel { workspace_data_.data(), workspace_size, reserve_data, reserve_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTraining( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), w_data, rnn.y_descs(), out_data, @@ -577,15 +572,13 @@ class RNNCudnnKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for train // This interface is used when the input/output is padded. 
- PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNForwardTrainingEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, - rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, - rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, - rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, workspace_data_.data(), workspace_size, - reserve_data, reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.init_h_desc(), + init_h_data, rnn.init_c_desc(), init_c_data, rnn.weight_desc(), + w_data, rnn.y_seq_desc(), out_data, rnn.last_h_desc(), last_h_data, + rnn.last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data_.data(), + workspace_size, reserve_data, reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( "The padded input is supported by " @@ -606,14 +599,14 @@ class RNNCudnnKernel : public framework::OpKernel { // for inference // This interface is used when the input/output is unpadded. #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, workspace_data->data(), workspace_size)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInference( handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_descs(), out_data, @@ -624,7 +617,7 @@ class RNNCudnnKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 // for inference // This interface is used when the input/output is padded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), w_data, rnn->y_seq_desc(), out_data, @@ -831,7 +824,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { if (!has_seq_length) { if (in_grad) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -842,7 +835,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { const_cast(reserve_data), reserve_size)); #else // This interface is used when the input/output is unpadded. 
- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardData( handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, @@ -855,7 +848,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { } if (!weight_grad_list.empty()) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data(), rnn.weight_desc(), weight_grad_data, @@ -865,7 +858,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { tensor_to_permuted_weight(place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); #else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), @@ -878,7 +871,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { // for train // This interface is used when the input/output is padded. if (in_grad) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, rnn.last_c_desc(), @@ -891,13 +884,12 @@ class RNNGradCudnnKernel : public framework::OpKernel { } if (!weight_grad_list.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNBackwardWeightsEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), - rnn.init_h_desc(), init_h_data, rnn.y_seq_desc(), - out->data(), workspace_data_.data(), workspace_size, - rnn.weight_desc(), weight_grad_data, - const_cast(reserve_data), reserve_size)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), + rnn.init_h_desc(), init_h_data, rnn.y_seq_desc(), out->data(), + workspace_data_.data(), workspace_size, rnn.weight_desc(), + weight_grad_data, const_cast(reserve_data), + reserve_size)); } #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 111828005222bb..a08339d776ff1a 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 562ff8d576b7d6..0a4a076c6caaef 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index d70bd58887f846..57986d262820d0 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/roll_op.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index a712878854298b..586cf3239b5752 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -13,7 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index e3791351cefb3a..6c7a0a8886ef0c 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "math/math_function.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu index 379a07a26dd5c3..4e20844dc3275f 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index 5f9635c8ae111c..307bf4010f7ffe 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -72,11 +72,11 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { const IndexT* segment_ids = segment->data(); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), hipMemcpyDeviceToHost)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), cudaMemcpyDeviceToHost)); #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 6d8f60ce932abb..8092a40d19b195 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -15,7 +15,7 @@ #include #include #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index bacaaeadbf5765..bb928cf401c330 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index c8b6156881c96f..1c4265a71d4eac 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -14,7 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index d4f4051c3a4602..f63fa5be7f496c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu index f2800c60c33048..115b3f47d664ba 100644 --- a/paddle/fluid/operators/shard_index_op.cu +++ b/paddle/fluid/operators/shard_index_op.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index dbc3e1a7ebe26f..582d1ea0f26af3 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -10,8 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index 8611249a29f636..cc012230c10629 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -21,7 +21,7 @@ namespace cub = hipcub; #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/fluid/operators/softmax_cudnn_op.cu.h index 68b694a59f47d9..c538fbade8ae8f 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.h +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.h @@ -18,12 +18,8 @@ limitations under the License. */ #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { @@ -503,12 +499,12 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? 
MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_LOG, mode)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data, MIOPEN_SOFTMAX_ACCURATE, mode)); @@ -517,12 +513,12 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, x.data(), platform::CudnnDataType::kZero(), desc_, out_data)); @@ -591,12 +587,12 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_LOG, mode)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( handle, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data, MIOPEN_SOFTMAX_ACCURATE, mode)); @@ -605,12 +601,12 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); } else { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( handle, CUDNN_SOFTMAX_ACCURATE, mode, platform::CudnnDataType::kOne(), desc_, out.data(), desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 3b1753b49b11d1..cb97a0bb27cb5c 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -18,13 +18,7 @@ limitations under the License. 
*/ #include #include -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 6a9dca9fe2a6ae..520c95b6f3484d 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -20,12 +20,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif namespace paddle { namespace operators { @@ -453,14 +449,14 @@ static void SoftmaxWithCrossEntropyHardLabel( #ifdef PADDLE_WITH_HIP auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( handle, platform::CudnnDataType::kOne(), descp, logits_data, platform::CudnnDataType::kZero(), descp, softmax_data, MIOPEN_SOFTMAX_LOG, mode)); #else auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), descp, logits_data, platform::CudnnDataType::kZero(), descp, softmax_data)); diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h index 924ec7cd52d50d..39639768241d49 100644 --- a/paddle/fluid/operators/spectral_helper.h +++ b/paddle/fluid/operators/spectral_helper.h @@ -66,7 +66,7 @@ class CuFFTHandle { public: CuFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_)); } CuFFTHandle(const CuFFTHandle& other) = delete; @@ -79,7 +79,7 @@ class CuFFTHandle { const ::cufftHandle& get() const { return handle_; } ~CuFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftDestroy(handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); } }; @@ -136,12 +136,12 @@ class FFTConfig { } // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation( plan(), /* autoAllocate */ 0)); size_t ws_size_t; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany( plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, @@ -176,7 +176,7 @@ class HIPFFTHandle { public: HIPFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); } 
HIPFFTHandle(const HIPFFTHandle& other) = delete; @@ -189,7 +189,7 @@ class HIPFFTHandle { const ::hipfftHandle& get() const { return handle_; } ~HIPFFTHandle() { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_)); } }; using plan_size_type = int; @@ -248,12 +248,12 @@ class FFTConfig { }(); // disable auto allocation of workspace to use allocator from the framework - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation( plan(), /* autoAllocate */ 0)); size_t ws_size_t; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany( plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index e97af7cea7e087..4ad99724fd6224 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -96,7 +96,7 @@ static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data, void* out_data, bool forward) { auto& plan = config.plan(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtExec( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec( plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); } @@ -167,20 +167,20 @@ static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, if (value_type == framework::proto::VarType::FP32) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2C( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C( plan, static_cast(in_data), static_cast(out_data), forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecR2C( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C( plan, static_cast(in_data), static_cast(out_data))); return; } case FFTTransformType::C2R: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2R( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R( plan, static_cast(in_data), static_cast(out_data))); return; @@ -189,20 +189,20 @@ static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data, } else if (value_type == framework::proto::VarType::FP64) { switch (config.transform_type()) { case FFTTransformType::C2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2Z( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z( plan, static_cast(in_data), static_cast(out_data), forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecD2Z( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z( plan, static_cast(in_data), static_cast(out_data))); return; } case FFTTransformType::C2R: { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2D( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D( plan, static_cast(in_data), static_cast(out_data))); return; @@ -332,11 +332,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, } // prepare cufft for execution - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cufftSetStream(config->plan(), ctx.stream())); framework::Tensor workspace_tensor; workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea( config->plan(), workspace_tensor.data())); // execute transform plan exec_cufft_plan(ctx, *config, &collapsed_input, @@ -355,11 +355,11 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, config = &(plan_cache.lookup(key)); // prepare cufft for execution - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::hipfftSetStream(config->plan(), ctx.stream())); framework::Tensor workspace_tensor; workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea( config->plan(), workspace_tensor.data())); // execute transform plan exec_hipfft_plan(ctx, *config, &collapsed_input, diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 9e5e45f4d22d91..5b3f03445d3525 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -16,7 +16,7 @@ #include #include #include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace plat = paddle::platform; namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/svd_op.cu b/paddle/fluid/operators/svd_op.cu index ade7496d646221..0a7ed093ad0b84 100644 --- a/paddle/fluid/operators/svd_op.cu +++ b/paddle/fluid/operators/svd_op.cu @@ -91,9 +91,9 @@ void SvdGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); @@ -102,7 +102,7 @@ void SvdGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? 
k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -116,7 +116,7 @@ void SvdGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } @@ -134,9 +134,9 @@ void SvdGPUKernel::GesvdjBatched( int ldt = n; int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, gesvdj_params)); auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); @@ -145,7 +145,7 @@ void SvdGPUKernel::GesvdjBatched( int stride_U = ldu * (thin_UV ? k : m); int stride_V = ldt * (thin_UV ? k : n); for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgesvdj( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, info, gesvdj_params)); @@ -159,7 +159,7 @@ void SvdGPUKernel::GesvdjBatched( platform::errors::PreconditionNotMet( "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); } - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); } diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index 69617b7e208a88..201de5ac1a4285 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -21,19 +21,18 @@ limitations under the License. 
*/ #include #ifdef __NVCC__ #include "cub/cub.cuh" -#include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef __HIPCC__ #include namespace cub = hipcub; -#include "paddle/fluid/platform/miopen_helper.h" #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/norm_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace operators { @@ -192,7 +191,7 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, if (comm) { int dtype = platform::ToNCCLDataType(mean_out->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); } @@ -466,7 +465,7 @@ void SyncBatchNormGradFunctor( if (comm) { int dtype = platform::ToNCCLDataType(scale->type()); // In-place operation - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); } diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index cb1ff5335cdf04..eb5a78f9dc0ece 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -10,8 +10,8 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 07749f90ebaa29..05ae5c9188cebc 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -24,7 +24,7 @@ limitations under the License. */ #endif #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/float16.h" #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 784d97b543fbd6..6c637effee2cba 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/trunc_op.cu b/paddle/fluid/operators/trunc_op.cu index a284e0ea6e3939..68d8c608f63388 100644 --- a/paddle/fluid/operators/trunc_op.cu +++ b/paddle/fluid/operators/trunc_op.cu @@ -12,8 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/trunc_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index f38f5d9f723579..1426c799007a02 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -16,12 +16,7 @@ limitations under the License. */ #include -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#endif -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cudnn_helper.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index b1cd172923ee6d..feb8e83864e84e 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -24,7 +24,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/where_index_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu index 721c6e5390e85d..54b0d5b69086cd 100644 --- a/paddle/fluid/operators/where_op.cu +++ b/paddle/fluid/operators/where_op.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace platform = paddle::platform; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 280674f9ab1478..4f3c70f5ea0505 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -47,18 +47,11 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) IF(WITH_GPU) - nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) - nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) - nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) ELSE() cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() -IF(WITH_ROCM) - hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) -ENDIF() - cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) @@ -125,8 +118,7 @@ if(WITH_ASCEND_CL) endif() if(WITH_GPU OR WITH_ROCM) - cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) - target_link_libraries(device_context cuda_resource_pool) + target_link_libraries(device_context gpu_resource_pool) endif() if(WITH_ASCEND_CL) @@ -147,8 +139,6 @@ if(WITH_GPU) nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) - nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) endif() @@ -158,8 +148,6 @@ if(WITH_ROCM) hip_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) - hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor) hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context) endif() @@ -172,11 +160,9 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda) - nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) - hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 03359d932b5ab9..25f8f3ed9f3d8e 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include 
-#include "paddle/fluid/platform/cuda_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" namespace paddle { namespace platform { @@ -96,7 +96,7 @@ NCCLComm* NCCLCommContext::CreateComm(ncclUniqueId* nccl_id, int nranks, ncclComm_t comm = nullptr; SetDeviceId(dev_id); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); auto* comm_wrapper = AssignNCCLComm(comm, nranks, rank, dev_id, ring_id); @@ -121,7 +121,7 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, const int kDevices = dev_ids.size(); ncclComm_t comms[kDevices]; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitAll( comms, dev_ids.size(), dev_ids.data())); PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, @@ -153,18 +153,18 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( << ", rind_id: " << ring_id; ncclComm_t comms[kDevices]; { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipSetDevice(i)); + PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(i)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif platform::dynload::ncclCommInitRank(comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); VLOG(1) << "ncclCommInitRank: " << i; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, diff --git a/paddle/fluid/platform/cuda_device_guard.h b/paddle/fluid/platform/cuda_device_guard.h index a85ebf4b813663..40204c0ed83f94 100644 --- a/paddle/fluid/platform/cuda_device_guard.h +++ b/paddle/fluid/platform/cuda_device_guard.h @@ -13,7 +13,7 @@ // limitations under the License. 
#pragma once -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h index 6586146c5aefbe..fe082c850aa4d2 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.h +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -17,7 +17,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" #endif namespace paddle { diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 515453afb63bed..5b67473b77eadd 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -2,6 +2,9 @@ IF(WITH_XPU) add_subdirectory(xpu) ENDIF() +IF(WITH_GPU OR WITH_ROCM) + add_subdirectory(gpu) +ENDIF() # NPU IF(WITH_ASCEND OR WITH_ASCEND_CL) diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt new file mode 100644 index 00000000000000..5cf2258204fdab --- /dev/null +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -0,0 +1,15 @@ +IF(WITH_GPU) + add_subdirectory(cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS cuda_info gflags glog enforce monitor dynload_cuda) + + nv_test(cuda_helper_test SRCS cuda_helper_test.cu) + nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +ELSEIF(WITH_ROCM) + add_subdirectory(rocm) + hip_library(gpu_info SRCS gpu_info.cc DEPS rocm_info gflags glog enforce monitor dynload_cuda) + + hip_test(cuda_helper_test SRCS cuda_helper_test.cu) + hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) +ENDIF() + +cc_library(gpu_resource_pool SRCS gpu_resource_pool.cc DEPS gpu_info) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt new file mode 100644 index 00000000000000..5df1de1b00fac7 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -0,0 +1,5 @@ +nv_library(cuda_info SRCS cuda_info.cc DEPS gflags glog enforce monitor dynload_cuda) +nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) +nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) + +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h similarity index 67% rename from paddle/fluid/platform/cuda_device_function.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 352143302388a9..e7d807573957f5 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -22,16 +22,11 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -#ifdef PADDLE_WITH_HIP -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) -#else #define FULL_WARP_MASK 0xFFFFFFFF #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) -#endif inline static int RoundToPowerOfTwo(int dim) { -#ifdef PADDLE_WITH_CUDA if (dim > 512) { return 1024; } else if (dim > 256) { @@ -45,17 +40,6 @@ inline static int RoundToPowerOfTwo(int dim) { } else { return 32; } -#else // HIP results in error or nan if > 256 - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -#endif } #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ @@ -76,71 +60,15 @@ template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) - return __shfl_down(val, delta, width); -#else return __shfl_down_sync(mask, val, static_cast(delta), width); -#endif } template __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) - return __shfl_xor(val, width); -#else return __shfl_xor_sync(mask, val, width); -#endif -} - -#if defined(PADDLE_WITH_HIP) -template <> -__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, - float16 val, int delta, - int width) { - return float16(__shfl_down(static_cast(val), - static_cast(delta), width)); } -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( - unsigned mask, paddle::platform::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex -CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, - int delta, int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, - float16 val, int width) { - return float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} -#else template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, @@ -197,16 +125,11 @@ __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( __shfl_xor_sync(mask, static_cast(val.imag), width)); return paddle::platform::complex(real, imag); } -#endif template __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { -#if defined(PADDLE_WITH_HIP) - return __shfl(val, src_line, width); -#else return __shfl_sync(mask, val, src_line, width); -#endif } template @@ -216,17 +139,13 @@ HOSTDEVICE T Infinity() { template __device__ T reduceSum(T val, int tid, int len) { -// 
NOTE(zcd): The warp size should be taken from the -// parameters of the GPU but not specified as 32 simply. -// To make the reduceSum more efficiently, -// I use Warp-Level Parallelism and assume the Warp size -// is 32 which may be different for different GPU, -// but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. const int warpSize = 32; -#endif __shared__ T shm[warpSize]; unsigned mask = 0u; CREATE_SHFL_MASK(mask, tid < len); diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc similarity index 90% rename from paddle/fluid/platform/cuda_graph.cc rename to paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc index 6f3d452ef5c503..3970acf82d3ea3 100644 --- a/paddle/fluid/platform/cuda_graph.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" namespace paddle { namespace platform { @@ -23,11 +23,11 @@ void CUDAGraph::Reset() { if (is_reset_) return; #if CUDA_VERSION >= 10010 for (auto graph : graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph)); } graphs_.clear(); for (auto exec_graph : exec_graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphExecDestroy(exec_graph)); } exec_graphs_.clear(); #endif @@ -46,7 +46,7 @@ void CUDAGraph::Replay() { errors::PermissionDenied( "Cannot replay the CUDA Graph after reset is called.")); for (auto exec_graph : exec_graphs_) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphLaunch(exec_graph, stream_)); } #endif } @@ -58,7 +58,7 @@ void CUDAGraph::BeginSegmentCapture() { IsCapturing(), true, errors::PermissionDenied("BeginSegmentCapture should be called when CUDA " "Graph is capturing.")); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamBeginCapture( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture( capturing_graph_->stream_, capturing_graph_->capture_mode_)); PADDLE_ENFORCE_EQ(IsValidCapturing(), true, platform::errors::PermissionDenied( @@ -92,19 +92,19 @@ void CUDAGraph::EndSegmentCapture() { PADDLE_ENFORCE_EQ(IsCapturing(), true, errors::PermissionDenied("No CUDA Graph is capturing.")); cudaGraph_t graph; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamEndCapture(capturing_graph_->stream_, &graph)); auto num_nodes = static_cast(-1); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphGetNodes(graph, nullptr, &num_nodes)); if (num_nodes == 0) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaGraphDestroy(graph)); VLOG(10) << "Skip empty CUDA Graph with ID " << capturing_graph_->id_ << ", segment id " << capturing_graph_->graphs_.size(); return; } cudaGraphExec_t exec_graph; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaGraphInstantiate(&exec_graph, graph, nullptr, nullptr, 0)); VLOG(10) << "End to 
capture CUDA Graph with ID " << capturing_graph_->id_ << ", segment id " << capturing_graph_->graphs_.size(); @@ -123,7 +123,7 @@ bool CUDAGraph::IsValidCapturing() { if (!IsCapturing()) return false; cudaStreamCaptureStatus status; CUDAGraphID id; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); return status == cudaStreamCaptureStatusActive; #else @@ -154,7 +154,7 @@ void CUDAGraph::PrintToDotFiles(const std::string &dirname, ConcatPath(dirname, "segment_" + std::to_string(i) + ".dot"); VLOG(10) << "Save the " << i << "-th segment of graph " << id_ << " to " << filename; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaGraphDebugDotPrint(graphs_[i], filename.c_str(), flags)); } #else diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h similarity index 96% rename from paddle/fluid/platform/cuda_graph.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index f70a66f76242fb..0856e0fad1900a 100644 --- a/paddle/fluid/platform/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -21,7 +21,7 @@ #include #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -129,7 +129,7 @@ class CUDAGraphCaptureModeGuard { explicit CUDAGraphCaptureModeGuard( cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); // After cudaThreadExchangeStreamCaptureMode is called, // the variable "mode" would be set to the old capturing mode. 
old_mode_ = mode; @@ -138,7 +138,7 @@ class CUDAGraphCaptureModeGuard { ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { if (UNLIKELY(CUDAGraph::IsCapturing())) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaThreadExchangeStreamCaptureMode(&old_mode_)); } } diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h similarity index 78% rename from paddle/fluid/platform/cuda_helper.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 202be920c55953..3199af9c975205 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -16,12 +16,7 @@ #include // NOLINT -#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cublas.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/rocblas.h" -#endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" @@ -72,28 +67,13 @@ namespace platform { * */ -#ifdef __HIPCC__ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) -#else #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ for (index_type i = __index__; __index__ < (num); \ __index__ += blockDim.x * gridDim.x, i = __index__) -#endif - -#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) class CublasHandleHolder { public: -#ifdef PADDLE_WITH_HIP - explicit CublasHandleHolder(hipStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_)); - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream)); - } -#else CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_)); PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream)); @@ -109,20 +89,11 @@ class CublasHandleHolder { } #endif // CUDA_VERSION >= 9000 } -#endif -#ifdef PADDLE_WITH_HIP - const rocblas_handle& GetCublasHandle() const { return handle_; } -#else const cublasHandle_t& GetCublasHandle() const { return handle_; } -#endif ~CublasHandleHolder() PADDLE_MAY_THROW { -#ifdef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_)); -#else PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); -#endif } template @@ -134,11 +105,7 @@ class CublasHandleHolder { private: DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); -#ifdef PADDLE_WITH_HIP - rocblas_handle handle_; -#else cublasHandle_t handle_; -#endif mutable std::mutex mtx_; }; diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc new file mode 100644 index 00000000000000..6109ed65543189 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_info.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace paddle { +namespace platform { +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + return dynload::cudnnGetVersion(); +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + cudaError_t status = cudaDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int major, minor; + auto major_error_code = + cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); + auto minor_error_code = + cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 10 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { + int device = GetCurrentDeviceId(); + int driver_version = GetGPUComputeCapability(device); + return driver_version >= 70; +} + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( + &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDevice(&device_id)); + return device_id; +} + +dim3 GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + dim3 ret; + int size; + auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret.x = size; + + auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret.y = size; + + auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret.z = size; + return ret; +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. 
Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return cudaGetLastError(); } +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc similarity index 85% rename from paddle/fluid/platform/cuda_profiler.cc rename to paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index 998dd80dc5e7df..42351fe097a9df 100644 --- a/paddle/fluid/platform/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/platform/cuda_profiler.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" namespace paddle { namespace platform { @@ -25,13 +25,13 @@ void CudaProfilerInit(std::string output_file, std::string output_mode, "`csv`, but received `%s`.", output_mode)); cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); } -void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); } +void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } -void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); } +void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); } #ifndef _WIN32 void CudaNvtxRangePush(std::string name) { diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h similarity index 100% rename from paddle/fluid/platform/cuda_profiler.h rename to paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h similarity index 84% rename from paddle/fluid/platform/cudnn_desc.h rename to paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index 318c85ee484bef..7bff2c69381e69 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -23,7 +23,7 @@ #include #include -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -99,7 +99,7 @@ class ActivationDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyActivationDescriptor(t)); t = nullptr; } @@ -107,13 +107,13 @@ class ActivationDescriptor { }; ActivationDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(cudnnActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetActivationDescriptor( desc_.get(), mode, CUDNN_NOT_PROPAGATE_NAN, static_cast(coef))); } @@ -130,14 +130,14 @@ class TensorDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyTensorDescriptor(t)); t = nullptr; } } }; TensorDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -153,7 +153,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_.get(), ToCudnnDataType(tensor.type()), dims_with_group.size(), dims_with_group.data(), strides.data())); } @@ -166,7 +166,7 @@ class TensorDescriptor { } else { transformed_dims = dims; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( desc_.get(), format, dtype, transformed_dims.size(), transformed_dims.data())); } @@ -187,14 +187,14 @@ class FilterDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFilterDescriptor(t)); t = nullptr; } } }; 
FilterDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFilterDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -211,7 +211,7 @@ class FilterDescriptor { if (groups > 1) { transformed_dims[1] = transformed_dims[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFilterNdDescriptor( desc_.get(), dtype, format, transformed_dims.size(), transformed_dims.data())); } @@ -233,7 +233,7 @@ class ConvolutionDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyConvolutionDescriptor(t)); t = nullptr; } @@ -241,7 +241,7 @@ class ConvolutionDescriptor { }; ConvolutionDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } @@ -255,28 +255,26 @@ class ConvolutionDescriptor { cudnnDataType_t compute_type = (dtype == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; T* desc = desc_.get(); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( desc, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); #if CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionGroupCount(desc, groups)); #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( desc, CUDNN_DEFAULT_MATH)); if (dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(desc, - CUDNN_TENSOR_OP_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + desc, CUDNN_TENSOR_OP_MATH)); #if CUDA_VERSION >= 11000 #if CUDNN_VERSION_MIN(8, 1, 0) } else if (dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(desc, - CUDNN_TENSOR_OP_MATH)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + desc, CUDNN_TENSOR_OP_MATH)); #endif // CUDNN_VERSION_MIN(8,1,0) } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h similarity index 88% rename from paddle/fluid/platform/cudnn_helper.h rename to paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index 65dd69a37d37f8..2bcdbaa201889b 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -191,10 +191,10 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat( class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_)); } inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -216,20 +216,20 @@ class ScopedTensorDescriptor { if (dims.size() == 4) { if (format == CUDNN_TENSOR_NCHW) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, type, dims_with_group.size(), dims_with_group.data(), strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensor4dDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensor4dDescriptor( desc_, format, type, dims[0], dims[3], dims[1], dims[2])); } } else if (dims.size() == 5) { if (format == CUDNN_TENSOR_NCHW) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, type, dims_with_group.size(), dims_with_group.data(), strides.data())); } else { // CUDNN_TENSOR_NHWC - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptorEx( desc_, format, type, dims.size(), dims.data())); } } @@ -247,7 +247,7 @@ class ScopedTensorDescriptor { inline cudnnTensorDescriptor_t descriptor(const cudnnDataType_t cudnn_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, cudnn_type, dim.size(), dim.data(), stride.data())); return desc_; } @@ -269,11 +269,11 @@ class ScopedTensorDescriptor { class ScopedRNNTensorDescriptor { public: ScopedRNNTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); } ~ScopedRNNTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); } inline cudnnRNNDataDescriptor_t descriptor( @@ -288,7 +288,7 @@ class ScopedRNNTensorDescriptor { layout = CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetRNNDataDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetRNNDataDescriptor( desc_, cudnn_type, layout, max_seq_length, batch_size, input_size, seq_length.data(), static_cast(&padding_fill))); @@ -314,10 +314,10 @@ class ScopedRNNTensorDescriptor { class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); } inline cudnnDropoutDescriptor_t descriptor(const cudnnHandle_t& handle, @@ -327,19 +327,19 @@ class ScopedDropoutDescriptor { framework::Tensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetDropoutDescriptor( desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - 
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, seed)); } else { auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); } return desc_; @@ -354,10 +354,10 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); } inline cudnnRNNDescriptor_t desc() { return desc_; } @@ -370,10 +370,10 @@ class ScopedRNNDescriptor { class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_)); } inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -389,7 +389,7 @@ class ScopedFilterDescriptor { kernel_with_group[0] /= groups; // NOTE: input filter(C) of the filter is already asserted to be C/groups. } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetFilterNdDescriptor( desc_, type, format, kernel_with_group.size(), kernel_with_group.data())); return desc_; @@ -413,11 +413,11 @@ class ScopedFilterDescriptor { class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyConvolutionDescriptor(desc_)); } @@ -438,7 +438,7 @@ class ScopedConvolutionDescriptor { cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( desc_, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); return desc_; @@ -459,10 +459,10 @@ class ScopedConvolutionDescriptor { class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_)); } inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -480,7 +480,7 @@ class ScopedPoolingDescriptor { "The size of kernel and strides should be equal. 
But " "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetPoolingNdDescriptor( desc_, (GetPoolingMode(mode)), CUDNN_PROPAGATE_NAN, // Always propagate nans. kernel.size(), kernel.data(), pads.data(), strides.data())); @@ -495,18 +495,18 @@ class ScopedPoolingDescriptor { class ScopedSpatialTransformerDescriptor { public: ScopedSpatialTransformerDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); } ~ScopedSpatialTransformerDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); } template inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims, const int dimA[]) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor( desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType::type, nbDims, dimA)); return desc_; } @@ -519,11 +519,11 @@ class ScopedSpatialTransformerDescriptor { class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnDestroyActivationDescriptor(desc_)); } @@ -561,7 +561,7 @@ class ScopedActivationDescriptor { "Unrecognized CUDNN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetActivationDescriptor( desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling)); return desc_; } @@ -587,15 +587,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_)); } template inline cudnnCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType::type)); return desc_; } diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc similarity index 98% rename from paddle/fluid/platform/cudnn_helper_test.cc rename to paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc index 98ec2be87755cd..851d0d18c604cd 100644 --- a/paddle/fluid/platform/cudnn_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu similarity index 98% rename from paddle/fluid/platform/cuda_helper_test.cu rename to paddle/fluid/platform/device/gpu/cuda_helper_test.cu index fd46aa23934035..ab8bb2cad8c512 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -21,11 +21,11 @@ #include #define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; diff --git a/paddle/fluid/platform/cudnn_desc_test.cc b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc similarity index 90% rename from paddle/fluid/platform/cudnn_desc_test.cc rename to paddle/fluid/platform/device/gpu/cudnn_desc_test.cc index db5362f5cb1f5d..8ea30027e8aded 100644 --- a/paddle/fluid/platform/cudnn_desc_test.cc +++ b/paddle/fluid/platform/device/gpu/cudnn_desc_test.cc @@ -12,11 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_desc.h" -#else -#include "paddle/fluid/platform/cudnn_desc.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/type_defs.h b/paddle/fluid/platform/device/gpu/gpu_device_function.h similarity index 50% rename from paddle/fluid/platform/type_defs.h rename to paddle/fluid/platform/device/gpu/gpu_device_function.h index 88a2d16472fa70..a8daa5e87fdc38 100644 --- a/paddle/fluid/platform/type_defs.h +++ b/paddle/fluid/platform/device/gpu/gpu_device_function.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,28 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP -#include +#include "paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h" #else -#include +#include "paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h" #endif -namespace paddle { - -#ifdef PADDLE_WITH_HIP -#define gpuSuccess hipSuccess -using gpuStream_t = hipStream_t; -using gpuError_t = hipError_t; -using gpuEvent_t = hipEvent_t; -using gpuDeviceProp = hipDeviceProp_t; -#else -#define gpuSuccess cudaSuccess -using gpuStream_t = cudaStream_t; -using gpuError_t = cudaError_t; -using gpuEvent_t = cudaEvent_t; -using gpuDeviceProp = cudaDeviceProp; #endif - -using CUDAGraphID = unsigned long long; // NOLINT -} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_dnn.h b/paddle/fluid/platform/device/gpu/gpu_dnn.h new file mode 100644 index 00000000000000..3f9bc5e6de80b5 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_dnn.h @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/miopen_desc.h" +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" +#else // CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" +#endif + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h new file mode 100644 index 00000000000000..6077a7b625d250 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -0,0 +1,26 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" +#endif + +#define CUDA_KERNEL_LOOP(i, num) CUDA_KERNEL_LOOP_TYPE(i, num, int) + +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc new file mode 100644 index 00000000000000..e68277cc37b381 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -0,0 +1,356 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/miopen.h" +#else +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#endif +#include "paddle/fluid/memory/malloc.h" +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#endif +#endif +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_bool(enable_cublas_tensor_op_math); +DECLARE_string(selected_gpus); +DECLARE_uint64(gpu_memory_limit_mb); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +USE_GPU_MEM_STAT; +namespace paddle { +namespace platform { +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetGPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void GpuMemoryUsage(size_t *available, size_t *total) { + size_t actual_available, actual_total; + RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, + platform::GetCurrentDeviceId()); +} + +size_t GpuAvailableMemToAlloc() { + size_t total = 0; + size_t available = 0; + GpuMemoryUsage(&available, &total); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = GpuMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20) + << "M, " << (available_to_alloc >> 20) << "M available to allocate"; + return available_to_alloc; +} + +size_t GpuMaxAllocSize() { + return std::max(GpuInitAllocSize(), GpuReallocSize()); +} + +static size_t GpuAllocSize(bool realloc) { + size_t available_to_alloc = GpuAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, 0, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? 
FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + +size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); } + +size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); } + +size_t GpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t GpuMaxChunkSize() { + size_t max_chunk_size = GpuMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +static void RaiseNonOutOfMemoryError(gpuError_t *status) { + if (*status == gpuErrorOutOfMemory) { + *status = gpuSuccess; + } + PADDLE_ENFORCE_GPU_SUCCESS(*status); + + *status = platform::GpuGetLastError(); + if (*status == gpuErrorOutOfMemory) { + *status = gpuSuccess; + } + PADDLE_ENFORCE_GPU_SUCCESS(*status); +} + +class RecordedGpuMallocHelper { + private: + explicit RecordedGpuMallocHelper(int dev_id, uint64_t limit_size = 0) + : dev_id_(dev_id), limit_size_(limit_size) { + if (NeedRecord()) { + mtx_.reset(new std::mutex()); + } + } + + DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); + + public: + static RecordedGpuMallocHelper *Instance(int dev_id) { + std::call_once(once_flag_, [] { + int dev_cnt = GetGPUDeviceCount(); + instances_.reserve(dev_cnt); + for (int i = 0; i < dev_cnt; ++i) { + instances_.emplace_back( + new RecordedGpuMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20)); + } + }); + + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::OutOfRange( + "Device id must be not less than 0, but got %d.", dev_id)); + PADDLE_ENFORCE_LT( + dev_id, instances_.size(), + platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", + dev_id, instances_.size())); + return instances_[dev_id].get(); + } + + /** + * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation + * or cudaSuccess would be returned, and the cudaGetLastError() flag + * would be clear. + */ + gpuError_t Malloc(void **ptr, size_t size) { + LockGuardPtr lock(mtx_); + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { + return gpuErrorOutOfMemory; + } + + CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto result = hipMalloc(ptr, size); +#else + CUDAGraphCaptureModeGuard capture_mode_guard; + auto result = cudaMalloc(ptr, size); +#endif + if (result == gpuSuccess) { + cur_size_.fetch_add(size); + STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + return gpuSuccess; + } else { + RaiseNonOutOfMemoryError(&result); + // Non out of memory error would be raised inside + // RaiseNonOutOfMemoryError. Therefore, we can + // return cudaErrorMemoryAllocation directly here. + return gpuErrorOutOfMemory; + } + } + + /** + * Free gpu memory. Usually, free is not allowed to raise error. + * If it does raise error, the process should be crashed. + */ + void Free(void *ptr, size_t size) { + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. 
+ CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto err = hipFree(ptr); + if (err != hipErrorDeinitialized) { +#else + auto err = cudaFree(ptr); + if (err != cudaErrorCudartUnloading) { +#endif + PADDLE_ENFORCE_GPU_SUCCESS(err); + cur_size_.fetch_sub(size); + STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + } else { + platform::GpuGetLastError(); // clear the error flag when + // cudaErrorCudartUnloading / + // hipErrorDeinitialized + } + } + + bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total) { + { + CUDADeviceGuard guard(dev_id_); +#ifdef PADDLE_WITH_HIP + auto result = hipMemGetInfo(actual_avail, actual_total); +#else + auto result = cudaMemGetInfo(actual_avail, actual_total); +#endif + if (result != gpuSuccess) { + *actual_avail = 0; + } + RaiseNonOutOfMemoryError(&result); + } + + if (NeedRecord()) { + std::lock_guard guard(*mtx_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); + *total = std::min(*actual_total, limit_size_); + return *total < *actual_total; + } else { + *avail = *actual_avail; + *total = *actual_total; + return false; + } + } + + inline bool NeedRecord() const { return limit_size_ != 0; } + + uint64_t RecordedSize() const { return cur_size_.load(); } + + uint64_t LimitSize() const { return limit_size_; } + +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 + CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags) { // NOLINT + auto result = + paddle::platform::dynload::cuMemCreate(handle, size, prop, flags); + if (result == CUDA_SUCCESS) { + cur_size_.fetch_add(size); + } + return result; + } + + CUresult MemRelease(CUmemGenericAllocationHandle handle, size_t size) { + auto result = paddle::platform::dynload::cuMemRelease(handle); + if (result == CUDA_SUCCESS) { + cur_size_.fetch_sub(size); + } + return result; + } + +#endif +#endif + + private: + const int dev_id_; + const uint64_t limit_size_; + std::atomic cur_size_{0}; + + mutable std::unique_ptr mtx_; + + static std::once_flag once_flag_; + static std::vector> instances_; +}; // NOLINT + +std::once_flag RecordedGpuMallocHelper::once_flag_; +std::vector> + RecordedGpuMallocHelper::instances_; + +gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(ptr, size); +} + +void RecordedGpuFree(void *p, size_t size, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->Free(p, size); +} + +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10020 +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags, int dev_id) { // NOLINT + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, + prop, flags); +} + +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, + int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); +} +#endif +#endif + +bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( + avail, total, actual_avail, actual_total); +} + +uint64_t RecordedGpuMallocSize(int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->RecordedSize(); +} + +bool IsGpuMallocRecorded(int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->NeedRecord(); +} + +void 
EmptyCache(void) { + std::vector devices = GetSelectedDevices(); + for (auto device : devices) { + memory::Release(CUDAPlace(device)); + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h similarity index 70% rename from paddle/fluid/platform/gpu_info.h rename to paddle/fluid/platform/device/gpu/gpu_info.h index 93e787fcf36f50..18e6ac83295f89 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,49 +11,42 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_CUDA -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -// Note: this header for simplify HIP and CUDA type string + #include #include #include -#include "paddle/fluid/platform/type_defs.h" + +#include "paddle/fluid/platform/device/gpu/gpu_types.h" namespace paddle { namespace platform { -//! Get the version of cudnn -int CudnnVersion(); +//! Get the version of dnn +int DnnVersion(); //! Get the total number of GPU devices in system. -int GetCUDADeviceCount(); +int GetGPUDeviceCount(); //! Get the compute capability of the ith GPU (format: major * 10 + minor) -int GetCUDAComputeCapability(int i); +int GetGPUComputeCapability(int id); //! Get the runtime version of the ith GPU -int GetCUDARuntimeVersion(int id); +int GetGPURuntimeVersion(int id); //! Get the driver version of the ith GPU -int GetCUDADriverVersion(int id); +int GetGPUDriverVersion(int id); //! Wheter the current device support TensorCore bool TensorCoreAvailable(); //! Get the MultiProcessors of the ith GPU. -int GetCUDAMultiProcessors(int i); +int GetGPUMultiProcessors(int id); //! Get the MaxThreads of each MultiProcessor of the ith GPU. -int GetCUDAMaxThreadsPerMultiProcessor(int i); +int GetGPUMaxThreadsPerMultiProcessor(int id); //! Get the MaxThreads of each block of the ith GPU. -int GetCUDAMaxThreadsPerBlock(int i); +int GetGPUMaxThreadsPerBlock(int id); //! Get the current GPU device id in system. int GetCurrentDeviceId(); @@ -97,19 +87,11 @@ size_t GpuMaxChunkSize(); //! Copy memory from address src to dst asynchronously. void GpuMemcpyAsync(void *dst, const void *src, size_t count, -#ifdef PADDLE_WITH_HIP - enum hipMemcpyKind kind, hipStream_t stream); -#else - enum cudaMemcpyKind kind, cudaStream_t stream); -#endif + gpuMemcpyKind kind, gpuStream_t stream); //! Copy memory from address src to dst synchronously. void GpuMemcpySync(void *dst, const void *src, size_t count, -#ifdef PADDLE_WITH_HIP - enum hipMemcpyKind kind); -#else - enum cudaMemcpyKind kind); -#endif + gpuMemcpyKind kind); //! Copy memory from one device to another device asynchronously. void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, @@ -125,34 +107,40 @@ void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream); //! Blocks until stream has completed all operations. 
void GpuStreamSync(gpuStream_t stream); +void GpuDestroyStream(gpuStream_t stream); + +// ! Blocks until device has completed all operations. +void GpuDeviceync(); + //! CudaMalloc with recorded info -gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id); +gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id); //! CudaFree with recorded info -void RecordedCudaFree(void *p, size_t size, int dev_id); +void RecordedGpuFree(void *p, size_t size, int dev_id); + +gpuError_t GpuGetLastError(); #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 - //! cuMemCreate with recorded info -CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id); // NOLINT +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags, int dev_id); // NOLINT //! cuMemRelease with recorded info -CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size, - int dev_id); +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, + int dev_id); #endif #endif //! Get available and total gpu memory with considering limitation -bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id); +bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id); //! Get recorded cudaMalloc size. If record is disabled, return 0. -uint64_t RecordedCudaMallocSize(int dev_id); +uint64_t RecordedGpuMallocSize(int dev_id); -bool IsCudaMallocRecorded(int dev_id); +bool IsGpuMallocRecorded(int dev_id); //! Empty idle cached memory held by the allocator. void EmptyCache(void); diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h similarity index 98% rename from paddle/fluid/platform/gpu_launch_config.h rename to paddle/fluid/platform/device/gpu/gpu_launch_config.h index 399f1dbaa03e1f..55f4c8eb4cd55e 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -28,6 +28,7 @@ #include #include #include +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h similarity index 100% rename from paddle/fluid/platform/cuda_primitives.h rename to paddle/fluid/platform/device/gpu/gpu_primitives.h diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc similarity index 84% rename from paddle/fluid/platform/cuda_resource_pool.cc rename to paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 70d2ec55057988..2c55eb972b7657 100644 --- a/paddle/fluid/platform/cuda_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -13,24 +13,24 @@ // limitations under the License. 
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_resource_pool.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { CudaStreamResourcePool::CudaStreamResourcePool() { - int dev_cnt = platform::GetCUDADeviceCount(); + int dev_cnt = platform::GetGPUDeviceCount(); pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); gpuStream_t stream; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); #endif return stream; @@ -39,9 +39,9 @@ CudaStreamResourcePool::CudaStreamResourcePool() { auto deleter = [dev_idx](gpuStream_t stream) { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif }; @@ -69,17 +69,17 @@ std::shared_ptr CudaStreamResourcePool::New(int dev_idx) { } CudaEventResourcePool::CudaEventResourcePool() { - int dev_cnt = platform::GetCUDADeviceCount(); + int dev_cnt = platform::GetGPUDeviceCount(); pool_.reserve(dev_cnt); for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); gpuEvent_t event; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); #endif return event; @@ -88,9 +88,9 @@ CudaEventResourcePool::CudaEventResourcePool() { auto deleter = [dev_idx](gpuEvent_t event) { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif }; diff --git a/paddle/fluid/platform/cuda_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h similarity index 100% rename from paddle/fluid/platform/cuda_resource_pool.h rename to paddle/fluid/platform/device/gpu/gpu_resource_pool.h diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h new file mode 100644 index 00000000000000..d7362fe9cbd81d --- /dev/null +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -0,0 +1,94 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/dynload/rocblas.h" + +#else +#include +#include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/dynload/cudnn.h" +#endif + +namespace paddle { + +#ifdef PADDLE_WITH_HIP +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = ROCM_TYPE; +#else // CUDA + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ + using GPU_TYPE = CUDA_TYPE; +#endif + +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); + +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, + miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); + +using CUDAGraphID = unsigned long long; // NOLINT + +#undef DECLARE_TYPE_FOR_GPU + +#ifdef PADDLE_WITH_HIP +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = ROCM_CV; +#else // CUDA + +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ + constexpr auto GPU_CV = CUDA_CV; +#endif + +DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); + +#undef DECLARE_CONSTANT_FOR_GPU +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h similarity index 99% rename from paddle/fluid/platform/nccl_helper.h rename to paddle/fluid/platform/device/gpu/nccl_helper.h index e297e7203c6988..f26116749077e2 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -70,11 +70,11 @@ class NCCLGroupGuard { inline NCCLGroupGuard() { NCCLMutex().lock(); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); } inline ~NCCLGroupGuard() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; diff --git a/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt new file mode 100644 index 00000000000000..86b9ecd5f5445c --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/CMakeLists.txt @@ -0,0 +1,3 @@ +hip_library(rocm_info SRCS rocm_info.cc DEPS gflags glog enforce monitor dynload_cuda) + +hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h similarity index 88% rename from paddle/fluid/platform/miopen_desc.h rename to paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index c82e61ceb122c0..d2389ba409e5eb 100644 --- a/paddle/fluid/platform/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -23,8 +23,8 @@ #include #include +#include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/miopen_helper.h" namespace paddle { namespace framework { @@ -88,7 +88,7 @@ class ActivationDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyActivationDescriptor(t)); t = nullptr; } @@ -96,13 +96,13 @@ class ActivationDescriptor { }; ActivationDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateActivationDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } template void set(miopenActivationMode_t mode, const T& coef) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetActivationDescriptor( desc_.get(), mode, static_cast(coef), 0.0, 0.0)); } @@ -119,15 +119,14 @@ class TensorDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } }; TensorDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::miopenCreateTensorDescriptor(&raw_ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -144,7 +143,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -166,7 +165,7 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -183,15 +182,14 @@ class FilterDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(t)); t = nullptr; } } }; FilterDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::miopenCreateTensorDescriptor(&raw_ptr)); + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } T* desc() { return desc_.get(); } @@ -212,7 +210,7 @@ class FilterDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), @@ -229,7 +227,7 @@ class ConvolutionDescriptor { struct Deleter { void operator()(T* t) { if (t != nullptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyConvolutionDescriptor(t)); t = nullptr; } @@ -237,7 +235,7 @@ class ConvolutionDescriptor { }; ConvolutionDescriptor() { T* raw_ptr; - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateConvolutionDescriptor(&raw_ptr)); desc_.reset(raw_ptr); } @@ -247,12 +245,12 @@ class ConvolutionDescriptor { void set(miopenDataType_t dtype, const std::vector& pads, const std::vector& strides, const std::vector& dilations, bool allow_tf32, const int groups = 1) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( (miopenConvolutionDescriptor_t)desc_.get(), static_cast(pads.size()), const_cast(pads.data()), const_cast(strides.data()), const_cast(dilations.data()), miopenConvolution)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::miopenSetConvolutionGroupCount( (miopenConvolutionDescriptor_t)desc_.get(), groups)); } diff --git a/paddle/fluid/platform/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h similarity index 89% rename from paddle/fluid/platform/miopen_helper.h rename to paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 46c7da83970419..bd8d05f8124a19 100644 --- a/paddle/fluid/platform/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -36,13 +37,6 @@ DECLARE_bool(cudnn_deterministic); namespace paddle { namespace platform { - -// MIOPEN only support NCHW, just for compatibility with CUDNN API -typedef enum { - MIOPEN_TENSOR_NCHW = 0, - MIOPEN_TENSOR_NHWC = 1, -} miopenTensorFormat_t; - inline const char* miopenGetErrorString(miopenStatus_t status) { switch (status) { case miopenStatusSuccess: @@ -188,10 +182,10 @@ inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -216,12 +210,12 @@ class ScopedTensorDescriptor { platform::errors::InvalidArgument( "format should ONLY be NCHW in MIOPEN.")); if (dims.size() == 4) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), const_cast(dims_with_group.data()), const_cast(strides.data()))); } else if (dims.size() == 5) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, dims_with_group.size(), const_cast(dims_with_group.data()), const_cast(strides.data()))); @@ -240,7 +234,7 @@ class ScopedTensorDescriptor { inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, const std::vector& dim, const std::vector& stride) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, miopen_type, dim.size(), const_cast(dim.data()), const_cast(stride.data()))); return desc_; @@ -262,10 +256,10 @@ class ScopedTensorDescriptor { class ScopedDropoutDescriptor { public: ScopedDropoutDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateDropoutDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateDropoutDescriptor(&desc_)); } ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyDropoutDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyDropoutDescriptor(desc_)); } inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, @@ -275,20 +269,20 @@ class ScopedDropoutDescriptor { framework::Tensor* dropout_state_, int seed, size_t state_size) { if (dropout_state_ == nullptr) { // for no dropout or test - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetDropoutDescriptor( desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); return desc_; } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, 
state_size, seed, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } else { auto dropout_state_dims = dropout_state_->dims(); state_size = dropout_state_dims[0]; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenRestoreDropoutDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenRestoreDropoutDescriptor( desc_, handle, dropout_prob_, dropout_state_data, state_size, 0, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); } @@ -304,10 +298,10 @@ class ScopedDropoutDescriptor { class ScopedRNNDescriptor { public: ScopedRNNDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateRNNDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateRNNDescriptor(&desc_)); } ~ScopedRNNDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyRNNDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyRNNDescriptor(desc_)); } inline miopenRNNDescriptor_t desc() { return desc_; } @@ -320,10 +314,10 @@ class ScopedRNNDescriptor { class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); } ~ScopedFilterDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); } inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, @@ -344,7 +338,7 @@ class ScopedFilterDescriptor { for (int k = kernel_with_group.size() - 2; k >= 0; k--) { stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( desc_, type, kernel_with_group.size(), const_cast(kernel_with_group.data()), const_cast(stride_dim.data()))); @@ -369,11 +363,11 @@ class ScopedFilterDescriptor { class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyConvolutionDescriptor(desc_)); } @@ -391,7 +385,7 @@ class ScopedConvolutionDescriptor { "The size of pads and dilations should be equal. But received size " "of pads is %d, size of dilations is %d.", pads.size(), dilations.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( desc_, pads.size(), const_cast(pads.data()), const_cast(strides.data()), const_cast(dilations.data()), miopenConvolution)); @@ -413,10 +407,10 @@ class ScopedConvolutionDescriptor { class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreatePoolingDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyPoolingDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyPoolingDescriptor(desc_)); } inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -434,7 +428,7 @@ class ScopedPoolingDescriptor { "The size of kernel and strides should be equal. 
But " "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetNdPoolingDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetNdPoolingDescriptor( desc_, GetPoolingMode(mode), kernel.size(), const_cast(kernel.data()), const_cast(pads.data()), const_cast(strides.data()))); @@ -449,11 +443,11 @@ class ScopedPoolingDescriptor { class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenDestroyActivationDescriptor(desc_)); } @@ -489,7 +483,7 @@ class ScopedActivationDescriptor { "Unrecognized MIOPEN activation mode: %d.", static_cast(activation_mode))); } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetActivationDescriptor( desc_, mode, relu_ceiling, 0.0, 0.0)); return desc_; } @@ -514,15 +508,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateCTCLossDescriptor(&desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyCTCLossDescriptor(desc_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroyCTCLossDescriptor(desc_)); } template inline miopenCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetCTCLossDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetCTCLossDescriptor( desc_, CudnnDataType::type, 0, false)); return desc_; } diff --git a/paddle/fluid/platform/miopen_helper_test.cc b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc similarity index 98% rename from paddle/fluid/platform/miopen_helper_test.cc rename to paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc index e201f4893f5778..13cf52dc2c6a30 100644 --- a/paddle/fluid/platform/miopen_helper_test.cc +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #define GLOG_NO_ABBREVIATED_SEVERITIES #define GOOGLE_GLOG_DLL_DECL -#include "paddle/fluid/platform/miopen_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h new file mode 100644 index 00000000000000..2263383f8fabb0 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. 
+#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace platform { + +#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) + +inline static int RoundToPowerOfTwo(int dim) { + // HIP results in error or nan if > 256 + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, + int delta, + int width = warpSize) { + return __shfl_down(val, delta, width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, + int width = warpSize) { + return __shfl_xor(val, width); +} + +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + return float16(__shfl_down(static_cast(val), + static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, + float16 val, int width) { + return float16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template +__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, + int width = 32) { + return __shfl(val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. 
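// [Editor's illustration, not part of the original patch] How the block-wide
// reduction below behaves, assuming warpSize == 32 and blockDim.x <= 1024:
//   1) intra-warp phase: for offset = 16, 8, 4, 2, 1 each lane adds the value
//      held by the lane `offset` positions above it, so after the loop lane 0
//      of every warp holds that warp's partial sum;
//   2) inter-warp phase: each warp leader (tid % 32 == 0) stores its partial
//      sum into shm[tid / 32], and the first warp repeats the same shuffle
//      reduction over shm[], so thread 0 ends up returning the sum of all
//      `len` participating values.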
+ const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h new file mode 100644 index 00000000000000..a0f3fb0f73ba5e --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace platform { + +/* + * Summary: Grid stride looping macro in CUDA kernel + * + * [ Why is this macro needed? ] + * + * The original loop in a CUDA kernel is: + * + * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + * i += blockDim.x * gridDim.x)` + * + * This for condition is risky. The value of `blockIdx.x * blockDim.x` + * may be very large, e.g. over 10^9; the first iteration is fine, + * but once `i += blockDim.x * gridDim.x` is executed, the value of i + * can exceed INT_MAX and overflow to a negative value. At that point + * the loop condition `i < (n)` is still satisfied, so the kernel + * performs an illegal access to CUDA memory. + * + * Here is a real example from ERNIE that triggers the above error. + * The related data are: + * - blockIdx.x = 2172938 + * - blockDim.x = 512 + * - blockIdx.x * blockDim.x = 1112543864 + * - INT_MAX = 2147483647 + * + * So we polish the for condition as follows: the int64_t __index__ + * prevents overflow in the loop increment. 
+ * + * Parameters: + * - i: loop index + * - num: total element numbers + * + * Examples: + * template + * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, + * const int d, const int remain) { + * CUDA_KERNEL_LOOP(index, num) { + * int idx_n = index / d; + * int idx_remain = index % remain; + * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; + * } + * } + * +*/ + +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + +class CublasHandleHolder { + public: + explicit CublasHandleHolder(hipStream_t stream) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream)); + } + + const rocblas_handle& GetCublasHandle() const { return handle_; } + + ~CublasHandleHolder() PADDLE_MAY_THROW { + PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_)); + } + + template + inline void Call(Callback&& callback) const { + std::lock_guard guard(mtx_); + callback(handle_); + } + + private: + DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); + + rocblas_handle handle_; + mutable std::mutex mtx_; +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc b/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc new file mode 100644 index 00000000000000..06dba8ce423ef7 --- /dev/null +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_info.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + +namespace paddle { +namespace platform { +int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + size_t version_major, version_minor, version_patch; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( + &version_major, &version_minor, &version_patch)); + return version_major * 100 + version_minor * 10 + version_patch; +} + +static int GetGPUDeviceCountImpl() { + int driverVersion = 0; + hipError_t status = hipDriverGetVersion(&driverVersion); + + if (!(status == gpuSuccess && driverVersion != 0)) { + // No GPU driver + VLOG(2) << "GPU Driver Version can't be detected. 
No GPU driver!"; + return 0; + } + + const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES"); + + if (cuda_visible_devices != nullptr) { + std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } + if (std::all_of(cuda_visible_devices_str.begin(), + cuda_visible_devices_str.end(), + [](char ch) { return ch == ' '; })) { + VLOG(2) << "HIP_VISIBLE_DEVICES is set to be " + "empty. No GPU detected."; + return 0; + } + } + int count; + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDeviceCount(&count)); + return count; +} + +int GetGPUDeviceCount() { + // cache the count + static auto dev_cnt = GetGPUDeviceCountImpl(); + return dev_cnt; +} + +int GetGPUComputeCapability(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int major, minor; + auto major_error_code = hipDeviceGetAttribute( + &major, hipDeviceAttributeComputeCapabilityMajor, id); + auto minor_error_code = hipDeviceGetAttribute( + &minor, hipDeviceAttributeComputeCapabilityMinor, id); + + PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); + PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); + return major * 100 + minor; +} + +int GetGPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int runtime_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); + return runtime_version; +} + +int GetGPUDriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int driver_version = 0; + PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); + return driver_version; +} + +bool TensorCoreAvailable() { return false; } + +int GetGPUMultiProcessors(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); + return count; +} + +int GetGPUMaxThreadsPerMultiProcessor(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( + &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); + + return count; +} + +int GetGPUMaxThreadsPerBlock(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + int count; + PADDLE_ENFORCE_GPU_SUCCESS( + hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); + return count; +} + +int GetCurrentDeviceId() { + int device_id; + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDevice(&device_id)); + return device_id; +} + +dim3 GetGpuMaxGridDimSize(int id) { + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, GetGPUDeviceCount())); + dim3 ret; + int size; + auto error_code_x = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); + ret.x = size; + + auto error_code_y = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); + ret.y = size; + + auto error_code_z = + hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); + PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); + ret.z = size; + return ret; +} + +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetGPUDeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { + PADDLE_ENFORCE_GPU_SUCCESS(hipGetDeviceProperties(&g_device_props[id], id)); + }); + + return g_device_props[id]; +} + +void SetDeviceId(int id) { + // TODO(qijun): find a better way to cache the cuda device count + PADDLE_ENFORCE_LT(id, GetGPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, GetGPUDeviceCount())); + PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); +} + +void GpuMemcpyAsync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); +} + +void GpuMemcpySync(void *dst, const void *src, size_t count, + gpuMemcpyKind kind) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(dst, src, count, kind)); +} + +void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); +} + +void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count) { + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyPeer(dst, dst_device, src, src_device, count)); +} + +void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(dst, value, count, stream)); +} + +void GpuStreamSync(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); +} + +void GpuDestroyStream(gpuStream_t stream) { + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +} + +void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } + +gpuError_t GpuGetLastError() { return hipGetLastError(); } +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index f1ef8650be4c16..69cea31446680e 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -66,11 +66,11 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { // inline HCCLGroupGuard() { // HCCLMutex().lock(); -// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); +// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); // } // inline ~HCCLGroupGuard() PADDLE_MAY_THROW { -// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); +// PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); // HCCLMutex().unlock(); // } // }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c2dc60a29fe427..a0c9ff09460aff 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -370,10 +370,10 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; semaphore_ = reinterpret_cast(scratch); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); #endif } @@ -439,14 +439,14 @@ CUDAContext::~CUDAContext() { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceGuard guard(place_.device); - compute_capability_ = GetCUDAComputeCapability(place_.device); - multi_process_ = GetCUDAMultiProcessors(place_.device); - max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + compute_capability_ = GetGPUComputeCapability(place_.device); + multi_process_ = GetGPUMultiProcessors(place_.device); + max_threads_per_mp_ = GetGPUMaxThreadsPerMultiProcessor(place_.device); max_grid_dim_size_ = GetGpuMaxGridDimSize(place_.device); - max_threads_per_block_ = GetCUDAMaxThreadsPerBlock(place_.device); + 
max_threads_per_block_ = GetGPUMaxThreadsPerBlock(place_.device); - driver_version_ = GetCUDADriverVersion(place_.device); - runtime_version_ = GetCUDARuntimeVersion(place_.device); + driver_version_ = GetGPUDriverVersion(place_.device); + runtime_version_ = GetGPURuntimeVersion(place_.device); LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device << ", GPU Compute Capability: " @@ -459,7 +459,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { << (runtime_version_ % 100) / 10; #ifdef PADDLE_WITH_HIP size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( &version_major, &version_minor, &version_patch)); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", MIOpen Version: " << version_major << "." @@ -499,7 +499,7 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); } #endif } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 73232994516b61..552d8f1a8c4ffb 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cusolver.h" @@ -28,17 +28,17 @@ limitations under the License. 
*/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/dynload/nccl.h" #endif -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/cuda_helper.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/dynload/rccl.h" #endif -#include "paddle/fluid/platform/gpu_info.h" // NOLINT +#include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -371,7 +371,7 @@ class CUDAContext { if (dynload::HasCUDNN()) { #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( &miopen_major, &miopen_minor, &miopen_patch)); auto local_miopen_version = (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; @@ -388,8 +388,8 @@ class CUDAContext { << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS( dynload::miopenSetStream(cudnn_handle_, RawStream())); #else auto local_cudnn_version = dynload::cudnnGetVersion() / 100; @@ -425,9 +425,9 @@ class CUDAContext { void DestoryCuDNNContext() { if (cudnn_handle_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); #endif } cudnn_handle_ = nullptr; @@ -442,7 +442,7 @@ class CUDAContext { #ifndef PADDLE_WITH_HIP void DestoryCuSolverContext() { if (cusolver_dn_handle_) { - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnDestroy(cusolver_dn_handle_)); } } diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 2f9413c4f3ea7e..cf617a478eb719 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -23,7 +23,7 @@ TEST(Device, Init) { using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); @@ -36,7 +36,7 @@ TEST(Device, CUDADeviceContext) { using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); @@ -70,7 +70,7 @@ TEST(Device, DeviceContextPool) { ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1); std::vector gpu_places; - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); for (int i = 0; i < count; ++i) { auto 
dev_ctx = pool.Get(CUDAPlace(i)); ASSERT_NE(dev_ctx, nullptr); diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index f42eb7ece1a72b..a3f88592b7649f 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -17,12 +17,10 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" -#endif #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/device/npu/npu_info.h" #endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index f72eb6731f6276..34845f24ff50dd 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -25,6 +25,12 @@ limitations under the License. */ (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ MIOPEN_VERSION_PATCH) // NOLINT +// MIOPEN only support NCHW, just for compatibility with CUDNN API +typedef enum { + MIOPEN_TENSOR_NCHW = 0, + MIOPEN_TENSOR_NHWC = 1, +} miopenTensorFormat_t; + namespace paddle { namespace platform { namespace dynload { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 86f71fdf64fba1..530ae6ba79889f 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -96,7 +96,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" // Note: this header for simplify HIP and CUDA type string #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/type_defs.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/fluid/platform/flags.h" @@ -944,7 +944,7 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { } #endif // not(__APPLE__) and PADDLE_WITH_NCCL -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ @@ -1150,7 +1150,7 @@ DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); } // namespace details -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 6ff9e6ea903cd3..b9e42392991699 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -294,14 +294,14 @@ TEST(EOF_EXCEPTION, THROW_EOF) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_GPU_SUCCESS(value); return true; } template bool CheckCudaStatusFailure(T value, const std::string& msg) { try { - PADDLE_ENFORCE_CUDA_SUCCESS(value); + PADDLE_ENFORCE_GPU_SUCCESS(value); return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 2b11de48a1ec70..136dc2d7252083 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -148,9 +148,9 @@ class CudaEvent { void Record(const 
paddle::platform::stream::CUDAStream& stream) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream.raw_stream())); #endif } @@ -172,15 +172,15 @@ class CudaEvent { return false; } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); return false; } void Synchronize() { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventSynchronize(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventSynchronize(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif } gpuEvent_t GetRawCudaEvent() { return event_; } diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index 6e5c7f4e916609..5518dabbf92a47 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc deleted file mode 100644 index 9dc6254234a979..00000000000000 --- a/paddle/fluid/platform/gpu_info.cc +++ /dev/null @@ -1,734 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/platform/gpu_info.h" -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/platform/cuda_device_guard.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/dynload/miopen.h" -#else -#include "paddle/fluid/platform/cuda_graph.h" -#include "paddle/fluid/platform/dynload/cudnn.h" -#endif -#include "paddle/fluid/memory/malloc.h" -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 -#include "paddle/fluid/platform/dynload/cuda_driver.h" -#endif -#endif -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/monitor.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); -DECLARE_bool(enable_cublas_tensor_op_math); -DECLARE_string(selected_gpus); -DECLARE_uint64(gpu_memory_limit_mb); - -constexpr static float fraction_reserve_gpu_memory = 0.05f; - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -USE_GPU_MEM_STAT; -namespace paddle { -namespace platform { - -int CudnnVersion() { - if (!dynload::HasCUDNN()) return -1; - -#ifdef PADDLE_WITH_HIP - size_t version_major, version_minor, version_patch; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( - &version_major, &version_minor, &version_patch)); - return version_major * 100 + version_minor * 10 + version_patch; -#else - return dynload::cudnnGetVersion(); -#endif -} -static int GetCUDADeviceCountImpl() { - int driverVersion = 0; -#ifdef PADDLE_WITH_HIP - hipError_t status = hipDriverGetVersion(&driverVersion); -#else - cudaError_t status = cudaDriverGetVersion(&driverVersion); -#endif - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; - return 0; - } - -#ifdef PADDLE_WITH_HIP - const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES"); -#else - const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); -#endif - if (cuda_visible_devices != nullptr) { - std::string cuda_visible_devices_str(cuda_visible_devices); - if (!cuda_visible_devices_str.empty()) { - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\'')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\'') + 1); - cuda_visible_devices_str.erase( - 0, cuda_visible_devices_str.find_first_not_of('\"')); - cuda_visible_devices_str.erase( - cuda_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(cuda_visible_devices_str.begin(), - cuda_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES is set to be " - "empty. No GPU detected."; - return 0; - } - } - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDeviceCount(&count)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count)); -#endif - return count; -} - -int GetCUDADeviceCount() { - // cache the count - static auto dev_cnt = GetCUDADeviceCountImpl(); - return dev_cnt; -} - -/* Here is a very simple CUDA “pro tip”: cudaDeviceGetAttribute() is a much -faster way to query device properties. 
You can see details in -https://devblogs.nvidia.com/cuda-pro-tip-the-fast-way-to-query-device-properties/ -*/ -int GetCUDAComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int major, minor; - -#ifdef PADDLE_WITH_HIP - auto major_error_code = hipDeviceGetAttribute( - &major, hipDeviceAttributeComputeCapabilityMajor, id); - auto minor_error_code = hipDeviceGetAttribute( - &minor, hipDeviceAttributeComputeCapabilityMinor, id); -#else - auto major_error_code = - cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); - auto minor_error_code = - cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code); - PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code); -#ifdef PADDLE_WITH_HIP - return major * 100 + minor; -#else - return major * 10 + minor; -#endif -} - -dim3 GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - dim3 ret; - int size; -#ifdef PADDLE_WITH_HIP - auto error_code_x = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id); -#else - auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x); - ret.x = size; - -#ifdef PADDLE_WITH_HIP - auto error_code_y = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id); -#else - auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y); - ret.y = size; - -#ifdef PADDLE_WITH_HIP - auto error_code_z = - hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id); -#else - auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id); -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z); - ret.z = size; - return ret; -} - -int GetCUDARuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int runtime_version = 0; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipRuntimeGetVersion(&runtime_version)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); -#endif - return runtime_version; -} - -int GetCUDADriverVersion(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int driver_version = 0; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDriverGetVersion(&driver_version)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version)); -#endif - return driver_version; -} - -bool TensorCoreAvailable() { -#if !defined(PADDLE_WITH_HIP) && CUDA_VERSION >= 9000 - int device = GetCurrentDeviceId(); - int driver_version = GetCUDAComputeCapability(device); - return driver_version >= 70; -#else - return false; -#endif -} - -int GetCUDAMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); -#endif - return count; -} - -int GetCUDAMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceGetAttribute( - &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute( - &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); -#endif - return count; -} - -int GetCUDAMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); - int count; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); -#endif - return count; -} - -int GetCurrentDeviceId() { - int device_id; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDevice(&device_id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id)); -#endif - return device_id; -} - -//! Get a list of device ids from environment variable or use all. -std::vector GetSelectedDevices() { - // use user specified GPUs in single-node multi-process mode. - std::vector devices; - if (!FLAGS_selected_gpus.empty()) { - auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); - for (auto id : devices_str) { - devices.push_back(atoi(id.c_str())); - } - } else { - int count = GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } - } - return devices; -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = platform::GetCUDADeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = platform::GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(platform::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. 
Please input " - "appropriate device again!", - id, static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaGetDeviceProperties(&g_device_props[id], id)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipGetDeviceProperties(&g_device_props[id], id)); -#endif - }); - - return g_device_props[id]; -} - -void SetDeviceId(int id) { - // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), - platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, GetCUDADeviceCount())); -#ifdef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); -#else - PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); -#endif -} - -void GpuMemoryUsage(size_t *available, size_t *total) { - size_t actual_available, actual_total; - RecordedCudaMemGetInfo(available, total, &actual_available, &actual_total, - platform::GetCurrentDeviceId()); -} - -size_t GpuAvailableMemToAlloc() { - size_t total = 0; - size_t available = 0; - GpuMemoryUsage(&available, &total); - size_t reserving = - static_cast(fraction_reserve_gpu_memory * available); - // If available size is less than minimum chunk size, no usable memory exists - size_t available_to_alloc = available - reserving; - size_t min_chunk_size = GpuMinChunkSize(); - if (available_to_alloc < min_chunk_size) { - available_to_alloc = 0; - } - VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20) - << "M, " << (available_to_alloc >> 20) << "M available to allocate"; - return available_to_alloc; -} - -size_t GpuMaxAllocSize() { - return std::max(GpuInitAllocSize(), GpuReallocSize()); -} - -static size_t GpuAllocSize(bool realloc) { - size_t available_to_alloc = GpuAvailableMemToAlloc(); - PADDLE_ENFORCE_GT( - available_to_alloc, 0, - platform::errors::ResourceExhausted("Not enough available GPU memory.")); - // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be - // allocated by fraction - size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb - : FLAGS_initial_gpu_memory_in_mb; - size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); - PADDLE_ENFORCE_GE( - available_to_alloc, alloc_bytes, - platform::errors::ResourceExhausted("Not enough available GPU memory.")); - VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) - << " MiB, is it Re-alloc: " << realloc; - return alloc_bytes; -} - -size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); } - -size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); } - -size_t GpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 256 bytes. 
- return 1 << 8; -} - -size_t GpuMaxChunkSize() { - size_t max_chunk_size = GpuMaxAllocSize(); - VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; - return max_chunk_size; -} - -#ifdef PADDLE_WITH_HIP -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - enum hipMemcpyKind kind, hipStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream)); -} -#else -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind, cudaStream_t stream) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream)); -} -#endif - -#ifdef PADDLE_WITH_HIP -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum hipMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, count, kind)); -} -#else -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind) { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind)); -} -#endif - -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -#endif -} - -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpyPeer(dst, dst_device, src, src_device, count)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpyPeer(dst, dst_device, src, src_device, count)); -#endif -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(dst, value, count, stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream)); -#endif -} - -void GpuStreamSync(gpuStream_t stream) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -} - -static void RaiseNonOutOfMemoryError(gpuError_t *status) { -#ifdef PADDLE_WITH_HIP - if (*status == hipErrorOutOfMemory) { - *status = hipSuccess; - } -#else - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(*status); - -#ifdef PADDLE_WITH_HIP - *status = hipGetLastError(); - if (*status == hipErrorOutOfMemory) { - *status = hipSuccess; - } -#else - *status = cudaGetLastError(); - if (*status == cudaErrorMemoryAllocation) { - *status = cudaSuccess; - } -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(*status); -} - -class RecordedCudaMallocHelper { - private: - explicit RecordedCudaMallocHelper(int dev_id, uint64_t limit_size = 0) - : dev_id_(dev_id), limit_size_(limit_size) { - if (NeedRecord()) { - mtx_.reset(new std::mutex()); - } - } - - DISABLE_COPY_AND_ASSIGN(RecordedCudaMallocHelper); - - public: - static RecordedCudaMallocHelper *Instance(int dev_id) { - std::call_once(once_flag_, [] { - int dev_cnt = GetCUDADeviceCount(); - instances_.reserve(dev_cnt); - for (int i = 0; i < dev_cnt; ++i) { - instances_.emplace_back( - new RecordedCudaMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20)); - } - }); - - PADDLE_ENFORCE_GE( - dev_id, 0, - platform::errors::OutOfRange( - "Device id must be not less than 0, but got %d.", dev_id)); - PADDLE_ENFORCE_LT( - dev_id, instances_.size(), - 
platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", - dev_id, instances_.size())); - return instances_[dev_id].get(); - } - - /** - * Try to allocate `size` gpu memory. Only cudaErrorMemoryAllocation - * or cudaSuccess would be returned, and the cudaGetLastError() flag - * would be clear. - */ - gpuError_t Malloc(void **ptr, size_t size) { - LockGuardPtr lock(mtx_); - if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { -#ifdef PADDLE_WITH_HIP - return hipErrorOutOfMemory; -#else - return cudaErrorMemoryAllocation; -#endif - } - - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto result = hipMalloc(ptr, size); -#else - CUDAGraphCaptureModeGuard capture_mode_guard; - auto result = cudaMalloc(ptr, size); -#endif - if (result == gpuSuccess) { - cur_size_.fetch_add(size); - STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - return gpuSuccess; - } else { - RaiseNonOutOfMemoryError(&result); -// Non out of memory error would be raised inside -// RaiseNonOutOfMemoryError. Therefore, we can -// return cudaErrorMemoryAllocation directly here. -#ifdef PADDLE_WITH_HIP - return hipErrorOutOfMemory; -#else - return cudaErrorMemoryAllocation; -#endif - } - } - - /** - * Free gpu memory. Usually, free is not allowed to raise error. - * If it does raise error, the process should be crashed. - */ - void Free(void *ptr, size_t size) { - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the - // driver has already shutdown. This happens only if the - // process is terminating, in which case we don't care if - // cudaFree succeeds. - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto err = hipFree(ptr); - if (err != hipErrorDeinitialized) { -#else - auto err = cudaFree(ptr); - if (err != cudaErrorCudartUnloading) { -#endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); - cur_size_.fetch_sub(size); - STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - } else { -#ifdef PADDLE_WITH_HIP - hipGetLastError(); // clear the error flag when hipErrorDeinitialized -#else - cudaGetLastError(); // clear the error flag when cudaErrorCudartUnloading -#endif - } - } - - bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total) { - { - CUDADeviceGuard guard(dev_id_); -#ifdef PADDLE_WITH_HIP - auto result = hipMemGetInfo(actual_avail, actual_total); -#else - auto result = cudaMemGetInfo(actual_avail, actual_total); -#endif - if (result != gpuSuccess) { - *actual_avail = 0; - } - RaiseNonOutOfMemoryError(&result); - } - - if (NeedRecord()) { - std::lock_guard guard(*mtx_); - *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); - *total = std::min(*actual_total, limit_size_); - return *total < *actual_total; - } else { - *avail = *actual_avail; - *total = *actual_total; - return false; - } - } - - inline bool NeedRecord() const { return limit_size_ != 0; } - - uint64_t RecordedSize() const { return cur_size_.load(); } - - uint64_t LimitSize() const { return limit_size_; } - -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 - CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags) { // NOLINT - auto result = - paddle::platform::dynload::cuMemCreate(handle, size, prop, flags); - if (result == CUDA_SUCCESS) { - cur_size_.fetch_add(size); - } - return result; - } - - CUresult MemRelease(CUmemGenericAllocationHandle handle, size_t size) { - auto result = 
paddle::platform::dynload::cuMemRelease(handle); - if (result == CUDA_SUCCESS) { - cur_size_.fetch_sub(size); - } - return result; - } - -#endif -#endif - - private: - const int dev_id_; - const uint64_t limit_size_; - std::atomic cur_size_{0}; - - mutable std::unique_ptr mtx_; - - static std::once_flag once_flag_; - static std::vector> instances_; -}; // NOLINT - -std::once_flag RecordedCudaMallocHelper::once_flag_; -std::vector> - RecordedCudaMallocHelper::instances_; - -gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size); -} - -void RecordedCudaFree(void *p, size_t size, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->Free(p, size); -} - -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10020 -CUresult RecordedCuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, - const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id) { // NOLINT - return RecordedCudaMallocHelper::Instance(dev_id)->MemCreate(handle, size, - prop, flags); -} - -CUresult RecordedCuMemRelease(CUmemGenericAllocationHandle handle, size_t size, - int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->MemRelease(handle, size); -} -#endif -#endif - -bool RecordedCudaMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->GetMemInfo( - avail, total, actual_avail, actual_total); -} - -uint64_t RecordedCudaMallocSize(int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->RecordedSize(); -} - -bool IsCudaMallocRecorded(int dev_id) { - return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord(); -} - -void EmptyCache(void) { - std::vector devices = GetSelectedDevices(); - for (auto device : devices) { - memory::Release(CUDAPlace(device)); - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index 965fe7b6db45cc..dbca7d15495461 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -32,7 +32,7 @@ TEST(InitDevices, CUDA) { using paddle::platform::DeviceContextPool; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int count = paddle::platform::GetCUDADeviceCount(); + int count = paddle::platform::GetGPUDeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 2U + static_cast(count)); diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 02930627d41e34..5d1caffd45326d 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -29,7 +29,7 @@ __global__ void DummyKernel(int *a) { a[0] = 0; } static void ForEachDevice(std::function func) { auto original_device = platform::GetCurrentDeviceId(); - int count = platform::GetCUDADeviceCount(); + int count = platform::GetGPUDeviceCount(); for (int i = 0; i < count; i++) { platform::SetDeviceId(i); func(i); @@ -43,13 +43,13 @@ void DummyKernelAndEvent() { ForEachDevice([](int d) { platform::SetDeviceId(d); hipStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&ptr, sizeof(int))); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&ptr, sizeof(int))); hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr); - 
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } #else @@ -57,13 +57,13 @@ void DummyKernelAndEvent() { ForEachDevice([](int d) { platform::SetDeviceId(d); cudaStream_t stream; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&ptr, sizeof(int))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(&ptr, sizeof(int))); DummyKernel<<<1, 1, 0, stream>>>(ptr); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(ptr)); }); } #endif diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index de814faec2523e..317991160b7988 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.pb.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif namespace paddle { diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 3408971efa4115..4277f7d4dc63e2 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -121,17 +121,17 @@ std::vector> GetMemEvents() { void SynchronizeAllDevice() { #ifdef PADDLE_WITH_CUDA - int count = GetCUDADeviceCount(); + int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } #endif #ifdef PADDLE_WITH_HIP - int count = GetCUDADeviceCount(); + int count = GetGPUDeviceCount(); for (int i = 0; i < count; i++) { SetDeviceId(i); - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } #endif } diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index 212d99f6a78ed5..dafb61fe0aaf43 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -30,18 +30,18 @@ bool CUDAStream::Init(const Place& place, const Priority& priority, CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); if (priority == Priority::kHigh) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream_, static_cast(flag), -1)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream_, static_cast(flag), -1)); #endif } else if (priority == Priority::kNormal) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream_, static_cast(flag), 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreateWithPriority( + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream_, static_cast(flag), 0)); #endif } @@ -58,9 +58,9 @@ void CUDAStream::Destroy() { WaitCallback(); if (stream_) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif } stream_ = nullptr; @@ -89,7 +89,7 @@ void CUDAStream::Wait() const { #endif #endif // PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); + PADDLE_ENFORCE_GPU_SUCCESS(e_sync); } CUDAStream* get_current_stream(int deviceId) { diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index 472d6bbab0c6cb..36f31c46673b2f 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream_callback_manager.h" @@ -64,32 +64,32 @@ class CUDAStream final { #ifdef PADDLE_WITH_HIP void RecordEvent(hipEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else void RecordEvent(cudaEvent_t ev, Callback callback) const { callback(); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } #endif #ifdef PADDLE_WITH_HIP void RecordEvent(hipEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream_)); } #else void RecordEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream_)); } #endif #ifdef PADDLE_WITH_HIP void WaitEvent(hipEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream_, ev, 0)); } #else void WaitEvent(cudaEvent_t ev) const { - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); } #endif @@ -122,17 +122,11 @@ class CUDAStream final { } #endif - PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(err); return false; } - void Synchronize() const { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); -#endif - } + void Synchronize() const { platform::GpuStreamSync(stream_); } const Place& GetPlace() const { return place_; } diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 3f0c5ace900d11..28aa022fe2f132 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -13,9 +13,9 @@ // limitations under the License. 
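[Editorial note] The hunks around this point repeatedly collapse per-backend `#ifdef PADDLE_WITH_HIP` / CUDA pairs into a single helper such as `platform::GpuStreamSync`, whose declaration moves into the new `device/gpu/gpu_info.h` (not shown in this patch). As a rough sketch only, mirroring the per-backend code deleted from the old `gpu_info.cc` above but with the renamed enforce macro, the unified helper looks roughly like:

// Hypothetical reconstruction of the unified stream-sync helper that callers
// are switched to; the real definition lives in
// paddle/fluid/platform/device/gpu/gpu_info.* and may differ in detail.
void GpuStreamSync(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
#endif
}

With such a wrapper in place, call sites (as in `CUDAStream::Synchronize` and `StreamCallbackManager::Wait` in this patch) shrink to a single line instead of carrying the HIP/CUDA branches themselves.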
#include "paddle/fluid/platform/stream_callback_manager.h" -#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" -#endif +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { @@ -59,15 +59,15 @@ void StreamCallbackManager::AddCallback( }); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif #endif @@ -81,11 +81,8 @@ void StreamCallbackManager::AddCallback( template void StreamCallbackManager::Wait() const { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); -#endif -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) + platform::GpuStreamSync(stream_); #endif #ifdef PADDLE_WITH_ASCEND_CL NPUStreamSync(stream_); diff --git a/paddle/fluid/platform/test_limit_gpu_memory.cu b/paddle/fluid/platform/test_limit_gpu_memory.cu index 81b766182337fc..684cb780735514 100644 --- a/paddle/fluid/platform/test_limit_gpu_memory.cu +++ b/paddle/fluid/platform/test_limit_gpu_memory.cu @@ -15,7 +15,7 @@ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/cuda_device_guard.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" DECLARE_uint64(gpu_memory_limit_mb); @@ -30,32 +30,24 @@ TEST(test_record_malloc, test_limit_gpu_memory) { size_t limit = FLAGS_gpu_memory_limit_mb << 20; { - ASSERT_TRUE(IsCudaMallocRecorded(DEVICE_ID)); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + ASSERT_TRUE(IsGpuMallocRecorded(DEVICE_ID)); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } size_t avail, total; { size_t actual_avail, actual_total; - RecordedCudaMemGetInfo(&avail, &total, &actual_avail, &actual_total, - DEVICE_ID); + RecordedGpuMemGetInfo(&avail, &total, &actual_avail, &actual_total, + DEVICE_ID); ASSERT_EQ(total, limit); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); } { CUDADeviceGuard guard(DEVICE_ID); GpuMemoryUsage(&avail, &total); ASSERT_EQ(total, limit); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); } gpuError_t err = gpuSuccess; @@ -63,54 +55,41 @@ TEST(test_record_malloc, test_limit_gpu_memory) { void *p1 = nullptr; size_t size1 = limit / 4 * 3; { - err = platform::RecordedCudaMalloc(&p1, size1, DEVICE_ID); + err = platform::RecordedGpuMalloc(&p1, size1, DEVICE_ID); ASSERT_EQ(err, gpuSuccess); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_NE(p1, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size1); } void *p2 = nullptr; size_t size2 = limit / 2; { - err = platform::RecordedCudaMalloc(&p2, 
size2, DEVICE_ID); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(err, hipErrorOutOfMemory); - ASSERT_EQ(hipGetLastError(), gpuSuccess); -#else - ASSERT_EQ(err, cudaErrorMemoryAllocation); - ASSERT_EQ(cudaGetLastError(), gpuSuccess); -#endif + err = platform::RecordedGpuMalloc(&p2, size2, DEVICE_ID); + ASSERT_EQ(err, gpuErrorOutOfMemory); + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_EQ(p2, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size1); } { - platform::RecordedCudaFree(p1, size1, DEVICE_ID); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + platform::RecordedGpuFree(p1, size1, DEVICE_ID); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } { - err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID); + err = platform::RecordedGpuMalloc(&p2, size2, DEVICE_ID); ASSERT_EQ(err, gpuSuccess); -#ifdef PADDLE_WITH_HIP - ASSERT_EQ(hipGetLastError(), hipSuccess); -#else - ASSERT_EQ(cudaGetLastError(), cudaSuccess); -#endif + ASSERT_EQ(paddle::platform::GpuGetLastError(), gpuSuccess); ASSERT_NE(p2, nullptr); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size2); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), size2); } { - platform::RecordedCudaFree(p2, size2, DEVICE_ID); - ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), 0UL); + platform::RecordedGpuFree(p2, size2, DEVICE_ID); + ASSERT_EQ(RecordedGpuMallocSize(DEVICE_ID), 0UL); } } diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 311fb872ac1039..21571e17a2b48b 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -61,9 +61,9 @@ void BindCudaStream(py::module *m_ptr) { int curr_device_id = paddle::platform::GetCurrentDeviceId(); paddle::platform::SetDeviceId(device_id); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); #endif paddle::platform::SetDeviceId(curr_device_id); #else @@ -264,7 +264,7 @@ void BindCudaStream(py::module *m_ptr) { auto stream_flag = paddle::platform::stream::StreamFlag::kStreamNonBlocking; - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); if (device < 0) { device = platform::GetCurrentDeviceId(); } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index cad0c1e70cc03c..f03acc3808468d 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1744,7 +1744,7 @@ void BindImperative(py::module *m_ptr) { "Cannot copy this Tensor to GPU in CPU version Paddle, " "Please recompile or reinstall Paddle with CUDA support.")); #else - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); int device_id = 0; if (handle == py::none()) { if (platform::is_gpu_place(self->Place())) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5fc1f27eff36f6..9ff9377abb2624 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -114,9 +114,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif #ifndef PADDLE_WITH_HIP -#include "paddle/fluid/platform/cuda_profiler.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -507,7 +507,7 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { static int GetNCCLVersion() { #if NCCL_VERSION_CODE >= 2304 int ver; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetVersion(&ver)); return ver; #else PADDLE_THROW(platform::errors::External( @@ -556,7 +556,7 @@ PYBIND11_MODULE(core_noavx, m) { m.def("disable_signal_handler", &DisableSignalHandler); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("cudnn_version", &platform::CudnnVersion); + m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; size_t total = 0; @@ -564,6 +564,7 @@ PYBIND11_MODULE(core_noavx, m) { return available; }); #endif + #ifdef PADDLE_WITH_NCCL m.def("nccl_version", &GetNCCLVersion); #endif @@ -1646,8 +1647,8 @@ All parameter, weight, gradient are variables in Paddle. std::exit(-1); } - if (UNLIKELY(dev_id >= platform::GetCUDADeviceCount())) { - if (platform::GetCUDADeviceCount() == 0) { + if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { + if (platform::GetGPUDeviceCount() == 0) { LOG(ERROR) << "Cannot use GPU because there is no GPU " "detected on your " "machine."; @@ -1656,8 +1657,8 @@ All parameter, weight, gradient are variables in Paddle. LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " "number on your machine is %d", - dev_id, platform::GetCUDADeviceCount(), - platform::GetCUDADeviceCount()); + dev_id, platform::GetGPUDeviceCount(), + platform::GetGPUDeviceCount()); std::exit(-1); } } @@ -2239,7 +2240,7 @@ All parameter, weight, gradient are variables in Paddle. #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 - return platform::GetCUDAComputeCapability(place.device) >= 53; + return platform::GetGPUComputeCapability(place.device) >= 53; }); #endif @@ -2419,7 +2420,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("get_cuda_device_count", platform::GetCUDADeviceCount); + m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("cuda_empty_cache", [] { for (int dev_id : platform::GetSelectedDevices()) { auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( diff --git a/paddle/pten/api/lib/ext_compat_utils.cc b/paddle/pten/api/lib/ext_compat_utils.cc index b7250d15794319..791a8526f3847a 100644 --- a/paddle/pten/api/lib/ext_compat_utils.cc +++ b/paddle/pten/api/lib/ext_compat_utils.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/pten/api/lib/ext_compat_utils.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace experimental { diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc index 8b54813eadf327..e457c57d59e55c 100644 --- a/paddle/pten/core/convert_utils.cc +++ b/paddle/pten/core/convert_utils.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor_py.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace pten { diff --git a/paddle/pten/kernels/functions/cuda/cast_kernel_impl.h b/paddle/pten/kernels/functions/cuda/cast_kernel_impl.h index 435da644356f94..1bf5bb288e8320 100644 --- a/paddle/pten/kernels/functions/cuda/cast_kernel_impl.h +++ b/paddle/pten/kernels/functions/cuda/cast_kernel_impl.h @@ -13,12 +13,12 @@ // limitations under the License. #pragma once -#include "paddle/fluid/platform/cuda_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/float16.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" namespace pten { namespace detail { using CUDAContext = paddle::platform::CUDADeviceContext; diff --git a/paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h index e292a56d08a8db..21663ee0388c0b 100644 --- a/paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h +++ b/paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h @@ -36,7 +36,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" @@ -479,9 +479,9 @@ struct ReduceConfig { reduce_num_per_thread = details::AlignUp(reduce_num, block_dim->y); } int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetCUDAMultiProcessors(device_id); + int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - paddle::platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; int num_threads = block_dim->x * block_dim->y; int max_num_blocks = max_threads / num_threads; @@ -521,9 +521,9 @@ struct ReduceConfig { left_num = last_dim_num; grid_dim->z = grid_z; int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetCUDAMultiProcessors(device_id); + int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - paddle::platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; // init int num_block = (max_threads / left_num); diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e77b71e4da1c9c..482e65a726ef54 100644 --- 
a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -225,7 +225,7 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_CUDA_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" + echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" check_approval 1 6836917 47554610 22561442 fi From a3b3ec684c5f6d62b62bfb5c8a61ea2f59697135 Mon Sep 17 00:00:00 2001 From: jianghaicheng Date: Fri, 3 Dec 2021 18:07:58 +0800 Subject: [PATCH 064/124] add ipu_backend (#36322) --- CMakeLists.txt | 1 + cmake/configure.cmake | 5 + cmake/external/poplar.cmake | 61 +++ cmake/external/protobuf.cmake | 5 + cmake/flags.cmake | 7 + cmake/third_party.cmake | 5 + paddle/fluid/platform/device/CMakeLists.txt | 5 + .../fluid/platform/device/ipu/CMakeLists.txt | 8 + paddle/fluid/platform/device/ipu/common.h | 35 ++ paddle/fluid/platform/device/ipu/device.cc | 39 ++ paddle/fluid/platform/device/ipu/device.h | 44 ++ .../fluid/platform/device/ipu/ipu_backend.cc | 195 +++++++++ .../fluid/platform/device/ipu/ipu_backend.h | 103 +++++ .../fluid/platform/device/ipu/ipu_compiler.cc | 397 ++++++++++++++++++ .../fluid/platform/device/ipu/ipu_compiler.h | 93 ++++ .../fluid/platform/device/ipu/ipu_executor.cc | 209 +++++++++ .../fluid/platform/device/ipu/ipu_executor.h | 83 ++++ .../platform/device/ipu/ipu_optimizer.cc | 136 ++++++ .../fluid/platform/device/ipu/ipu_optimizer.h | 76 ++++ .../fluid/platform/device/ipu/ipu_strategy.cc | 21 + .../fluid/platform/device/ipu/ipu_strategy.h | 39 ++ paddle/fluid/platform/device/ipu/ipu_utils.cc | 155 +++++++ paddle/fluid/platform/device/ipu/ipu_utils.h | 101 +++++ .../device/ipu/supported_ops_autogen.h | 197 +++++++++ 24 files changed, 2020 insertions(+) create mode 100644 cmake/external/poplar.cmake create mode 100644 paddle/fluid/platform/device/ipu/CMakeLists.txt create mode 100644 paddle/fluid/platform/device/ipu/common.h create mode 100644 paddle/fluid/platform/device/ipu/device.cc create mode 100644 paddle/fluid/platform/device/ipu/device.h create mode 100644 paddle/fluid/platform/device/ipu/ipu_backend.cc create mode 100644 paddle/fluid/platform/device/ipu/ipu_backend.h create mode 100644 paddle/fluid/platform/device/ipu/ipu_compiler.cc create mode 100644 paddle/fluid/platform/device/ipu/ipu_compiler.h create mode 100644 paddle/fluid/platform/device/ipu/ipu_executor.cc create mode 100644 paddle/fluid/platform/device/ipu/ipu_executor.h create mode 100644 paddle/fluid/platform/device/ipu/ipu_optimizer.cc create mode 100644 paddle/fluid/platform/device/ipu/ipu_optimizer.h create mode 100644 
paddle/fluid/platform/device/ipu/ipu_strategy.cc create mode 100644 paddle/fluid/platform/device/ipu/ipu_strategy.h create mode 100644 paddle/fluid/platform/device/ipu/ipu_utils.cc create mode 100644 paddle/fluid/platform/device/ipu/ipu_utils.h create mode 100644 paddle/fluid/platform/device/ipu/supported_ops_autogen.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 334a6cfcd0ee14..55f1e4cd224b32 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index f15db6e094c17a..a77f9f72ca6ade 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -97,6 +97,11 @@ if(WITH_XPU) add_definitions(-DPADDLE_WITH_XPU) endif() +if(WITH_IPU) + message(STATUS "Compile with IPU!") + add_definitions(-DPADDLE_WITH_IPU) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) diff --git a/cmake/external/poplar.cmake b/cmake/external/poplar.cmake new file mode 100644 index 00000000000000..7947a54f8b5f11 --- /dev/null +++ b/cmake/external/poplar.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
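[Editorial note] This new module resolves Poplar and PopART from the POPLAR_SDK_DIR environment variable: the find commands below expect the extracted SDK to contain poplar*/ (or poplar/) and popart*/ directories at its top level, and configuration fails with the error messages below otherwise. An illustrative configure sequence (paths are placeholders, not taken from this patch):

    export POPLAR_SDK_DIR=/path/to/extracted/poplar_sdk
    cmake -DWITH_IPU=ON /path/to/paddle    # run from an empty build directory

The module is only pulled in when WITH_IPU is set, via the include(external/poplar) added to cmake/third_party.cmake later in this patch.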
+if(WITH_IPU) + set(POPLAR_DIR CACHE PATH "Path to a Poplar install") + set(POPART_DIR CACHE PATH "Path to a Popart install") + set(POPLAR_SDK_DIR CACHE PATH "Path to an extracted SDK archive or to a Poplar & Popart install directory (Will populate POPLAR_DIR and POPART_DIR)") + + if(DEFINED ENV{POPLAR_SDK_DIR}) + set(POPLAR_SDK_DIR $ENV{POPLAR_SDK_DIR}) + execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "popart*" + OUTPUT_VARIABLE POPART_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND find ${POPLAR_SDK_DIR}/ -maxdepth 1 -type d -name "poplar-*" -o -name "poplar" + OUTPUT_VARIABLE POPLAR_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT IS_DIRECTORY "${POPLAR_DIR}") + message(FATAL_ERROR "Couldn't find a \"poplar\" or \"poplar-*\" folder in '${POPLAR_SDK_DIR}'") + endif() + if(NOT IS_DIRECTORY "${POPART_DIR}") + message(FATAL_ERROR "Couldn't find a \"popart*\" folder in '${POPLAR_SDK_DIR}'") + endif() + else() + message(FATAL_ERROR "You must provide a path to a Poplar install using export POPLAR_SDK_DIR=/path/to/poplar_sdk") + endif() + + message("POPLAR_DIR is ${POPLAR_DIR}") + message("POPART_DIR is ${POPART_DIR}") + + if(EXISTS ${POPLAR_DIR}) + list(APPEND CMAKE_PREFIX_PATH ${POPLAR_DIR}) + set(ENABLE_POPLAR_CMD "source ${POPLAR_DIR}/enable.sh") + find_package(poplar REQUIRED) + include_directories("${POPLAR_DIR}/include") + link_directories("${POPLAR_DIR}/lib") + endif() + if(NOT poplar_FOUND) + message(FATAL_ERROR "You must provide a path to a Poplar install using -DPOPLAR_DIR=/path/to/popart/build/install") + endif() + + if(EXISTS ${POPART_DIR}) + list(APPEND CMAKE_PREFIX_PATH ${POPART_DIR}) + set(ENABLE_POPART_CMD "source ${POPART_DIR}/enable.sh") + find_package(popart REQUIRED COMPONENTS popart-only) + include_directories("${POPART_DIR}/include") + link_directories("${POPART_DIR}/lib") + endif() + if(NOT popart_FOUND) + message(FATAL_ERROR "You must provide a path to a Popart build using -DPOPART_DIR=/path/to/popart/build") + endif() + add_definitions(-DONNX_NAMESPACE=onnx) + add_custom_target(extern_poplar DEPENDS poplar popart-only) +endif() diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 86acd1a0012504..2a028b8dc7e7f0 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -204,6 +204,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) + elseif(WITH_IPU) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG d750fbf648256c7c631f51ffdbf67d7c18b0114e) else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) @@ -243,6 +246,8 @@ ENDFUNCTION() if(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) +elseif(WITH_IPU) + SET(PROTOBUF_VERSION 3.6.1) else() SET(PROTOBUF_VERSION 3.1.0) endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 7afff25664bbbb..7495ee32bab95c 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -151,6 +151,13 @@ set(COMMON_FLAGS ${fsanitize} ) +if(WITH_IPU) + set(COMMON_FLAGS ${COMMON_FLAGS} + -Wno-sign-compare # Warnings in Popart + -Wno-non-virtual-dtor # Warnings in Popart + ) +endif() + if(NOT APPLE) if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM)) set(COMMON_FLAGS diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index f2efc974073e59..7aa1e78abb9a3c 100644 --- 
a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -391,4 +391,9 @@ if (WIN32) list(APPEND third_party_deps extern_dirent) endif (WIN32) +if (WITH_IPU) + include(external/poplar) + list(APPEND third_party_deps extern_poplar) +endif() + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 5b67473b77eadd..0cd07dec20e3ed 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -10,3 +10,8 @@ ENDIF() IF(WITH_ASCEND OR WITH_ASCEND_CL) add_subdirectory(npu) ENDIF() + +# IPU +IF(WITH_IPU) + add_subdirectory(ipu) +ENDIF() diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt new file mode 100644 index 00000000000000..c4595e22d6cd24 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -0,0 +1,8 @@ +cc_library(ipu_device SRCS device.cc DEPS enforce popart) +cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) +cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) +cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce) +cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto) +cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils) +cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper) +cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper) diff --git a/paddle/fluid/platform/device/ipu/common.h b/paddle/fluid/platform/device/ipu/common.h new file mode 100644 index 00000000000000..7d62f10abd201d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/common.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { +namespace ipu { + +static constexpr const char *sIpuIndexAttr = "ipu_index"; +static constexpr const char *sIpuStageAttr = "ipu_stage"; +static constexpr const char *sOpIdentifyIdAttr = "op_identify_id"; +static constexpr const char *sDebugInfoId = "__debug_info_id"; + +static constexpr const char *sBeta1 = "beta1"; +static constexpr const char *sBeta2 = "beta2"; +static constexpr const char *sBeta1Pow = "Beta1Pow"; +static constexpr const char *sBeta2Pow = "Beta2Pow"; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.cc b/paddle/fluid/platform/device/ipu/device.cc new file mode 100644 index 00000000000000..4aa9ab56d92f80 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/device.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/ipu/device.h" + +namespace paddle { +namespace platform { +namespace ipu { + +Device::Device(const popart::DeviceInfo& device_info) + : id_(device_info.getId()), is_attached_(device_info.isAttached()) { + popart::DeviceType popart_device_type = device_info.getType(); + switch (popart_device_type) { + case popart::DeviceType::IpuModel: + device_type_ = DeviceType::IpuModel; + break; + case popart::DeviceType::Ipu: + device_type_ = DeviceType::Ipu; + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "popart::DeviceType:Unsupported type %d", popart_device_type)); + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/device.h b/paddle/fluid/platform/device/ipu/device.h new file mode 100644 index 00000000000000..24a8bdec3087cc --- /dev/null +++ b/paddle/fluid/platform/device/ipu/device.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { + +enum class DeviceType { IpuModel = 0, Cpu, Ipu, OfflineIpu, Sim }; + +class Device { + public: + Device() {} + explicit Device(const popart::DeviceInfo& device_info); + + int getId() const { return id_; } + bool isAttached() const { return is_attached_; } + DeviceType getType() const { return device_type_; } + + private: + int id_; + bool is_attached_; + DeviceType device_type_; + /* TODO:: Add more elements in the future */ +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc new file mode 100644 index 00000000000000..cd0f5ae554cb40 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/ipu/ipu_backend.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace platform { +namespace ipu { + +std::shared_ptr IpuBackend::instance_ = nullptr; + +IpuBackend::IpuBackend() { + compiler_ = std::make_shared(); + executor_ = std::make_unique(); +} + +void IpuBackend::Clear() { + executor_.reset(); + // detach device + if (device_ != nullptr && device_->isAttached()) { + device_->detach(); + device_.reset(); + device_ = nullptr; + } +} + +IpuBackend::~IpuBackend() { Clear(); } + +std::shared_ptr IpuBackend::GetInstance() { + if (!instance_) { + instance_.reset(new IpuBackend()); + } + return instance_; +} + +// This api should only call from python, always return a new object +std::shared_ptr IpuBackend::GetNewInstance() { + instance_.reset(new IpuBackend()); + return instance_; +} + +void IpuBackend::Compile(framework::ir::Graph* graph, + const std::vector& feed_list, + const std::vector& fetch_list) { + VLOG(10) << "enter IpuBackend::Compile"; + compiler_->InitInputs(graph, feed_list); + compiler_->LowerWeights(graph, scope_); + compiler_->LowerBody(graph); + compiler_->InitOutputs(fetch_list); + executor_->SetWeights(compiler_->GetWeights()); + VLOG(10) << "leave IpuBackend::Compile"; +} + +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, + const framework::ExecutionContext& ctx) { + Prepare(); + auto inputs_id = compiler_->GetInputs(); + auto outputs_id = compiler_->GetOutputs(); + executor_->Run(inputs_id, inputs, outputs_id, outputs, ctx); +} + +void IpuBackend::Prepare() { + if (is_prepared_) { + return; + } else { + is_prepared_ = true; + } + // convert Model to fp16 + if (ipu_strategy_->enable_fp16) { + compiler_->ConvertProtoToFp16(); + } + auto proto = compiler_->GetModelProto(); + auto tensors = compiler_->GetTensors(); + auto outputs = compiler_->GetOutputs(); + executor_->Prepare(proto, tensors, outputs, device_); +} + +void IpuBackend::SetScope(const framework::Scope& scope) { + scope_ = &scope; + executor_->SetScope(&scope); +} + +void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { + ipu_strategy_ = &strategy; + executor_->SetIpuStrategy(strategy); + compiler_->SetIpuStrategy(strategy); +} + +size_t IpuBackend::GetNumDevices() { + // IpuModel + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) return 1; + // Real dev + size_t num_devices = + popart::DeviceManager::createDeviceManager().enumerateDevices().size(); + PADDLE_ENFORCE_GT( + num_devices, 0, + platform::errors::Unavailable( + "Do not found any IPU devices, please make " + "sure Poplar sdk is enabled or enable ENV \"POPLAR_IPUMODEL=1\"")); + return num_devices; +} + +std::vector IpuBackend::GetDeviceIds() { + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return {0}; + } + std::vector device_ids; + auto devices = + popart::DeviceManager::createDeviceManager().enumerateDevices(); + PADDLE_ENFORCE_GT( + devices.size(), 0, + platform::errors::Unavailable("Do not found any IPU devices, please make " + "sure Poplar sdk is enabled.")); + + for (auto device : devices) { + device_ids.push_back(device->getId()); + } + + return device_ids; +} + +Device IpuBackend::GetDevice(int id) { + 
bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + std::map deviceOpts{{"numIPUs", "1 "}}; + device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( + deviceOpts); + Device device(*device_.get()); + return device; + } + size_t num_devices = GetNumDevices(); + if (id < 0 || id >= num_devices) { + PADDLE_THROW(platform::errors::InvalidArgument( + "device id %d is invalid, number devices is %d", id, num_devices)); + } + std::shared_ptr popart_device_info = + popart::DeviceManager::createDeviceManager().getDevice( + popart::SyncPattern::Full, id); + Device device(*popart_device_info.get()); + return device; +} + +void IpuBackend::AttachDevice(int id) { + // trick here + // Compiler ipu is not same as the runtime ipu. + VLOG(10) << "comile ipu id = " << id; + bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); + if (ipu_model) { + return; + } + device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( + UpperIpuNum()); + PADDLE_ENFORCE_NOT_NULL( + device_, platform::errors::Unavailable("Can't attach IPU, ipu_num = %d.", + UpperIpuNum())); +} + +bool IpuBackend::DeviceIsAttached() { return device_ != nullptr; } + +// num_ipus must be pow(2,n); +int IpuBackend::UpperIpuNum() { + PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0, + platform::errors::Unavailable( + "The ipu num get is wrong, please make sure the " + "sharding or pipline parameter is right.")); + int i = 0; + while (std::pow(2, i) < ipu_strategy_->num_ipus) { + i++; + } + return std::pow(2, i); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h new file mode 100644 index 00000000000000..769a1b5b52ab8b --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/ipu/device.h" +#include "paddle/fluid/platform/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/ipu/ipu_executor.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class IpuBackend { + // IpuBackend is the center of paddle-ipu, its function include: + // 1. Compile paddle model to popart model + // 2. Run popart model, inference or training + // 3. Request and release device + // 4. Other helper function + + public: + IpuBackend(); + ~IpuBackend(); + + void Clear(); + + // return if exsits, else create and return + static std::shared_ptr GetInstance(); + + // always return a new instance_ + static std::shared_ptr GetNewInstance(); + + // what compile does include(call compiler_): + // 1. 
map paddle-op -> poart op + // 2. construct popart onnx compute graph + void Compile(framework::ir::Graph *graph, + const std::vector &feed_list, + const std::vector &fetch_list); + + // what run does include: + // 1. construct forward onnx graph + // 2. graph-level optimization + // 3. autodiff + void Run(const std::vector &inputs, + const std::vector &outputs, + const framework::ExecutionContext &ctx); + + Executor &GetExecutor() { return *executor_; } + + void SetScope(const framework::Scope &scope); + const framework::Scope *GetScope() { return scope_; } + void SetIpuStrategy(const IpuStrategy &strategy); + const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } + + // Device + size_t GetNumDevices(); + std::vector GetDeviceIds(); + Device GetDevice(int id); + void AttachDevice(int id); + bool DeviceIsAttached(); + + private: + int UpperIpuNum(); + void Prepare(); + + private: + std::shared_ptr compiler_; + std::unique_ptr executor_; + std::shared_ptr device_; + bool is_prepared_ = false; + + // not own + const framework::Scope *scope_ = nullptr; + const IpuStrategy *ipu_strategy_ = nullptr; + + private: + static std::shared_ptr instance_; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc new file mode 100644 index 00000000000000..a1c5ed4fefbf3e --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -0,0 +1,397 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
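For orientation, the sketch below shows how the IpuBackend interface declared above is typically driven. It is an illustrative sketch, not part of this patch; the graph, scope, strategy, feed/fetch name lists and tensor vectors are assumed to be prepared by the caller, and the exact element types of the tensor vectors are not spelled out in the patch itself.

// Illustrative sketch only (not part of the patch): driving IpuBackend.
auto backend = paddle::platform::ipu::IpuBackend::GetInstance();
backend->SetScope(scope);           // assumed: a framework::Scope holding the weights
backend->SetIpuStrategy(strategy);  // assumed: an IpuStrategy (num_ipus, is_training, ...)
backend->AttachDevice(0);           // acquire an IPU; no-op when POPLAR_IPUMODEL is set
backend->Compile(graph, feed_list, fetch_list);  // lower the ir::Graph to a popart ONNX model
backend->Run(inputs, outputs, ctx);              // one execution step, feeding/fetching tensors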
+ +#include "paddle/fluid/platform/ipu/ipu_compiler.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +template +T GetAttrAllowNull(std::string attr, framework::OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); + } else { + return {}; + } +} + +template +nonstd::optional GetOptAttrAllowNull(std::string attr, + framework::OpDesc* op_desc) { + if (op_desc->HasAttr(attr)) { + return BOOST_GET_CONST(T, op_desc->GetAttr(attr)); + } else { + return {}; + } +} + +Compiler::Compiler() { + builder_ = popart::Builder::create(); + RegisterOpFunc(); +} + +Compiler::~Compiler() {} + +void Compiler::RegisterOpFunc() { + VLOG(10) << "enter Compiler::RegisterOpFunc"; +#define INT_VEC std::vector +#define FLOAT_VEC std::vector +#define FLOAT float +#define INT std::int64_t +#define BOOL bool +#define STRING std::string +#define STRING_VEC std::vector +#define NONE + +#define ARG(Type, Name) , GetAttrAllowNull(#Name, op_desc) +#define OPT_ARG(Type, Name) , GetOptAttrAllowNull(#Name, op_desc) +#define POPART_CONST_ARG(Name) , const PopartConstant& Name +#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name +#define POPART_ATTRIB_VEC_ARG(Name) +#define BODY_ARG(Name) NONE + + name_function_ = { +#define OP_DECL(FuncName, OnnxImpl, Args) \ + {#FuncName, [&](framework::OpDesc* op_desc) { \ + auto op_type = op_desc->Type(); \ + VLOG(10) << "build op:" << op_type << " args " << #Args; \ + auto inputs = GetOpInputs(op_desc); \ + auto output_names = GetOpOutputs(op_desc); \ + auto debug_context = BuildDebugContext(op_desc); \ + auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ + auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ + auto output_ids = OnnxImpl(inputs Args, debug_context); \ + SetIpuIndexStage(output_ids, op_desc); \ + InsertTensors(output_names, output_ids); \ + }}, // NOLINT +#include "paddle/fluid/platform/ipu/supported_ops_autogen.h" + }; + +#undef OP_DECL +#undef BODY_ARG +#undef POPART_ATTRIB_VEC_ARG +#undef HOST_SIDE_CONST_ARG +#undef POPART_CONST_ARG +#undef OPT_ARG +#undef ARG +#undef NONE +#undef STRING_VEC +#undef STRING +#undef BOOL +#undef INT +#undef FLOAT +#undef FLOAT_VEC +#undef INT_VEC +} + +void Compiler::LowerBody(const framework::ir::Graph* graph) { + VLOG(10) << "enter Compiler::LowerBody"; + auto nodes = framework::ir::TopologySortOperations(*graph); + for (auto* node : nodes) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + VLOG(10) << "node->type: " << op_type; + + if (op_type == "popart_constant") { + auto dims = + BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); + auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); + auto dtype = OnnxDtype2PopartType(dtype_); + popart::TensorInfo tensor_info{dtype, dims}; + auto value_attr = op_desc->GetAttr("value"); + auto const_data = std::unique_ptr{}; + switch (dtype) { + case popart::DataType::FLOAT: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + case popart::DataType::INT32: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + case popart::DataType::DOUBLE: + const_data.reset(new popart::ConstVoidData( + BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + case popart::DataType::INT64: + const_data.reset(new popart::ConstVoidData( + 
BOOST_GET_CONST(std::vector, value_attr).data(), + tensor_info)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "The popart datatype is not supported, popart::DataType is %d", + dtype)); + } + popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_batchnormalization") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto num_outputs = outputs.size(); + auto epsilon = BOOST_GET_CONST(float, op_desc->GetAttr("epsilon")); + auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum")); + auto result = builder_->aiOnnxOpset11().batchnormalization( + inputs, num_outputs, epsilon, momentum); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_nllloss") { + auto inputs = GetOpInputs(op_desc); + auto ignoreIndex = BOOST_GET_CONST(int, op_desc->GetAttr("ignoreIndex")); + auto result = builder_->aiGraphcoreOpset1().nllloss( + inputs, popart::ReductionType::NoReduction, ignoreIndex); + SetIpuIndexStage(result, op_desc); + InsertTensors(GetOpOutputs(op_desc), result); + } else if (op_type == "popart_topk") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + int64_t axis = BOOST_GET_CONST(int64_t, op_desc->GetAttr("axis")); + int sorted_INT32 = BOOST_GET_CONST(int, op_desc->GetAttr("sorted")); + int64_t sorted = int64_t{sorted_INT32}; + + auto aiOnnxOpset = builder_->aiOnnxOpset11(); + + popart::ConvInputs result; + if (inputs.size() == 2) { + VLOG(10) + << "[Compiler::LowerBody] size of inputs for is 2"; + result = aiOnnxOpset.topk(inputs, axis, sorted); + } else if (inputs.size() == 1) { + VLOG(10) + << "[Compiler::LowerBody] size of inputs for is 1"; + int64_t k = BOOST_GET_CONST(int64_t, op_desc->GetAttr("k")); + popart::TensorInfo kShape{"INT64", std::vector{1}}; + popart::ConstVoidData kData = {&k, kShape}; + auto K_t = aiOnnxOpset.constant(kData); + result = aiOnnxOpset.topk({inputs[0], K_t}, axis, sorted); + } + result[1] = aiOnnxOpset.cast({result[1]}, "INT32"); + SetIpuIndexStage(result, op_desc); + VLOG(10) << "[Compiler::LowerBody] output[1]: " << outputs[1]; + VLOG(10) << "[Compiler::LowerBody] output[1]: " + << GetOpOutputs(op_desc)[1] << " -> " << result[1]; + tensors_.emplace(GetOpOutputs(op_desc)[1], result[1]); // topk indices + VLOG(10) << "[Compiler::LowerBody] output[0]: " << outputs[0]; + VLOG(10) << "[Compiler::LowerBody] output[0]: " + << GetOpOutputs(op_desc)[0] << " -> " << result[0]; + tensors_.emplace(GetOpOutputs(op_desc)[0], result[0]); // topk values + } else { + auto itr = name_function_.find(op_type); + if (itr != name_function_.end()) { + itr->second(node->Op()); + } else { + PADDLE_THROW(platform::errors::NotFound( + "Op %s is not registered in popart canonicalization", op_type)); + } + } + } + VLOG(10) << "leave Compiler::LowerBody"; +} + +void Compiler::InitInputs(framework::ir::Graph* graph, + const std::vector& feed_list) { + for (const auto& feed_name : feed_list) { + feed_list_.push_back(feed_name); + for (const framework::ir::Node* n : graph->Nodes()) { + if (n->IsVar()) { + auto* var_desc = n->Var(); + if (feed_name == var_desc->Name()) { + VLOG(10) << "feed_name= " << var_desc->Name(); + auto data_type = VarType2PopartType(var_desc->GetDataType()); + if (ipu_strategy_->enable_fp16) { + data_type = popart::DataType::FLOAT16; + } + popart::TensorInfo 
input_info{data_type, var_desc->GetShape()}; + VLOG(10) << "popart input_info = " << input_info; + popart::TensorId tensor_id = + builder_->addInputTensor(input_info, feed_name); + VLOG(10) << "popart input tensor id = " << tensor_id; + inputs_.push_back(tensor_id); + tensors_.emplace(var_desc->Name(), tensor_id); + } + } + } + } +} + +void Compiler::InitOutputs(const std::vector& fetch_list) { + for (const auto& fetch_name : fetch_list) { + fetch_list_.push_back(fetch_name); + auto tensor = tensors_.find(fetch_name); + PADDLE_ENFORCE_NE(tensor, tensors_.end(), + platform::errors::NotFound( + "output tensor %s does not exist.", fetch_name)); + VLOG(10) << "fetch_name= " << fetch_name; + VLOG(10) << "popart output tensor id = " << tensor->second; + builder_->addOutputTensor(tensor->second); + outputs_.push_back(tensor->second); + } +} + +void Compiler::LowerWeights(const framework::ir::Graph* graph, + const framework::Scope* scope_) { + PADDLE_ENFORCE_NOT_NULL(scope_, + platform::errors::PreconditionNotMet( + "You should call set_scope before LowerWeights")); + // at this step, the graph doesn't contains optimizer related states + for (const auto* node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + if (node->Var()->Persistable() && node->inputs.empty()) { + auto var_name = node->Var()->Name(); + // workround: https://github.com/graphcore/Paddle/issues/151 + if (tensors_.count(var_name) != 0) { + continue; + } + + auto var = scope_->FindVar(var_name); + if (var) { + auto tensor = var->Get(); + auto dtype = VarType2PopartType(tensor.type()); + auto shape = std::vector(); + for (size_t i = 0; i < tensor.dims().size(); ++i) { + shape.push_back(tensor.dims().at(i)); + } + popart::TensorInfo tensor_info(dtype, shape); + popart::ConstVoidData const_data{tensor.data(), tensor_info}; + popart::TensorId result = + builder_->addInitializedInputTensor(const_data, var_name); + tensors_.emplace(var_name, result); + weights_.push_back(result); + } + } + } + } +} + +void Compiler::InsertTensors(const std::vector& output_names, + const std::vector& tensor_ids) { + PADDLE_ENFORCE_EQ(output_names.size(), tensor_ids.size(), + platform::errors::Fatal("InsertTensors size mismatch")); + for (int i = 0; i < tensor_ids.size(); i++) { + std::string tensor_id = tensor_ids[i]; + tensors_.emplace(output_names[i], tensor_ids[i]); + } +} + +void Compiler::InsertTensors(const std::vector& output_names, + const std::string& tensor_id) { + PADDLE_ENFORCE_EQ(output_names.size(), 1, + platform::errors::Fatal("InsertTensors size mismatch")); + tensors_.emplace(output_names[0], tensor_id); +} + +void Compiler::SetIpuIndexStage(const std::vector& tensor_ids, + const framework::OpDesc* op_desc) { + VLOG(10) << "enter Compiler::SetIpuIndexStage"; + auto tensor_ids_set = + std::set(tensor_ids.begin(), tensor_ids.end()); + + if (op_desc->HasAttr(sIpuIndexAttr)) { + auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr)); + builder_->virtualGraph(tensor_ids_set, ipu_index); + VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index + << " for op: " << op_desc->Type(); + if (op_desc->HasAttr(sIpuStageAttr)) { + auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr)); + builder_->pipelineStage(tensor_ids_set, ipu_stage); + VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage + << " for op: " << op_desc->Type(); + } + } + VLOG(10) << "leave Compiler::SetIpuIndexStage"; +} + +void Compiler::SetIpuIndexStage(const std::string& tensor_id, + const framework::OpDesc* 
op_desc) { + VLOG(10) << "enter Compiler::SetIpuIndexStage"; + + if (op_desc->HasAttr(sIpuIndexAttr)) { + auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr)); + builder_->virtualGraph(tensor_id, ipu_index); + VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index + << " for op: " << op_desc->Type(); + if (op_desc->HasAttr(sIpuStageAttr)) { + auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr)); + builder_->pipelineStage(tensor_id, ipu_stage); + VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage + << " for op: " << op_desc->Type(); + } + } + VLOG(10) << "leave Compiler::SetIpuIndexStage"; +} + +std::vector& Compiler::GetWeights() { return weights_; } + +// convertFloatsToHalfs +void Compiler::ConvertProtoToFp16() { + popart::GraphTransformer graph_transformer(builder_->getModelProto()); + graph_transformer.convertFloatsToHalfs(); + converted_proto_ = graph_transformer.getModelProto(); +} + +std::string Compiler::GetModelProto() { + if (converted_proto_.length()) { + return converted_proto_; + } + return builder_->getModelProto(); +} + +void Compiler::SaveModelProto(const std::string& path) { + builder_->saveModelProto(path); +} + +void Compiler::SaveModelProtoNoCheck(const std::string& path) { + auto proto = GetModelProto(); + std::ofstream onnxfile(path, std::ios_base::binary); + onnxfile.write(proto.data(), proto.size()); + onnxfile.close(); +} + +std::vector Compiler::GetOpInputs(const framework::OpDesc* op) { + auto ins = op->Input("__inputs__"); + std::vector inputs; + for (const auto& in : ins) { + if (tensors_.find(in) != tensors_.end()) { + inputs.push_back(tensors_[in]); + } else { + inputs.push_back(in); + } + } + return inputs; +} + +const std::vector& Compiler::GetOpOutputs( + const framework::OpDesc* op) { + return op->Output("__outputs__"); +} + +popart::DebugContext Compiler::BuildDebugContext(const framework::OpDesc* op) { + auto op_identify_id = + BOOST_GET_CONST(std::string, op->GetAttr(sOpIdentifyIdAttr)); + VLOG(10) << "op_identify_id of op: " << op->Type() << " is " + << op_identify_id; + return popart::DebugContext(op_identify_id); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h new file mode 100644 index 00000000000000..ecee1595bb8923 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -0,0 +1,93 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
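Before the compiler's public interface below, it may help to see roughly what one entry of the OP_DECL table in ipu_compiler.cc expands to. This is an illustrative, hand-written expansion rather than code from the patch, and it assumes the ARG macro passes its Type parameter as the template argument of GetAttrAllowNull (the angle brackets do not survive in the listing above).

// Hypothetical expansion of: OP_DECL(popart_scale_v2, aiGraphcoreOpset.scale, ARG(FLOAT,scale) )
{"popart_scale_v2", [&](framework::OpDesc* op_desc) {
   auto op_type = op_desc->Type();
   // #Args stringizes the unexpanded argument, so the log shows the ARG(...) text
   VLOG(10) << "build op:" << op_type << " args " << "ARG(FLOAT,scale) ";
   auto inputs = GetOpInputs(op_desc);
   auto output_names = GetOpOutputs(op_desc);
   auto debug_context = BuildDebugContext(op_desc);
   auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1();
   auto aiOnnxOpset = builder_->aiOnnxOpset11();
   // FLOAT is #defined to float, so the "scale" attribute is read as a float
   auto output_ids = aiGraphcoreOpset.scale(
       inputs, GetAttrAllowNull<float>("scale", op_desc), debug_context);
   SetIpuIndexStage(output_ids, op_desc);
   InsertTensors(output_names, output_ids);
 }},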
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/ipu/common.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class Compiler { + public: + Compiler(); + ~Compiler(); + void RegisterOpFunc(); + void LowerBody(const framework::ir::Graph *graph); + void InitInputs(framework::ir::Graph *graph, + const std::vector &feed_list); + void InitOutputs(const std::vector &fetch_list); + void LowerWeights(const framework::ir::Graph *graph, + const framework::Scope *scope_); + + void InsertTensors(const std::vector &output_names, + const std::vector &tensor_ids); + void InsertTensors(const std::vector &output_names, + const std::string &tensor_id); + void SetIpuIndexStage(const std::vector &tensor_ids, + const framework::OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, + const framework::OpDesc *op_desc); + + std::vector GetInputs() { return inputs_; } + std::vector GetOutputs() { return outputs_; } + std::map GetTensors() { return tensors_; } + std::vector &GetWeights(); + + std::string GetModelProto(); + void SetIpuStrategy(const IpuStrategy &strategy) { + ipu_strategy_ = &strategy; + }; + void SaveModelProto(const std::string &path); + void SaveModelProtoNoCheck(const std::string &path); + void ConvertProtoToFp16(); + + private: + std::vector GetOpInputs(const framework::OpDesc *op); + const std::vector &GetOpOutputs(const framework::OpDesc *op); + popart::DebugContext BuildDebugContext(const framework::OpDesc *op); + + private: + std::unique_ptr builder_; + + using OpFunc = std::function; + std::unordered_map name_function_; + + // stateful variable + std::map tensors_; + + // feed_list_ & fetch_list save paddle tensor id + std::vector feed_list_; + std::vector fetch_list_; + + // inputs_ & outputs_ save popart tensor id + std::vector inputs_; + std::vector outputs_; + + // weights info map + std::vector weights_; + + std::string converted_proto_ = ""; + const IpuStrategy *ipu_strategy_ = nullptr; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc new file mode 100644 index 00000000000000..a7978ba6f37b13 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/ipu/ipu_executor.h" + +namespace paddle { +namespace platform { +namespace ipu { + +Executor::Executor() {} + +Executor::~Executor() {} + +void Executor::Prepare(const std::string &proto, + const std::map &tensors, + const std::vector &outputs, + std::shared_ptr device) { + auto art = popart::AnchorReturnType("All"); + std::map anchor_ids; + for (const auto &id : outputs) { + anchor_ids.emplace(id, art); + } + + auto dataFlow = popart::DataFlow(ipu_strategy_->batches_per_step, anchor_ids); + + PADDLE_ENFORCE_NOT_NULL(device, platform::errors::Unavailable( + "IPU device isn't attached, please call " + "IpuBackend::AttachDevice(id) first.")); + + if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) { + VLOG(10) << "Creating TrainingSession from Onnx Model..."; + auto popart_optimizer = GetPopartOptimizer(opt_info); + + auto it = tensors.find(opt_info.GetLoss()); + PADDLE_ENFORCE_NE( + it, tensors.end(), + paddle::platform::errors::InvalidArgument( + "loss_id = %s doesn't exist in popart graph.", opt_info.GetLoss())); + + session_ = popart::TrainingSession::createFromOnnxModel( + proto, dataFlow, it->second, *popart_optimizer, device, + popart::InputShapeInfo(), ipu_strategy_->popart_options_, + popart::Patterns(popart::PatternsLevel::Default)); + } else { + VLOG(10) << "Creating InferenceSession from Onnx Model..."; + session_ = popart::InferenceSession::createFromOnnxModel( + proto, dataFlow, device, popart::InputShapeInfo(), + ipu_strategy_->popart_options_, + popart::Patterns(popart::PatternsLevel::Default)); + } + VLOG(10) << "Creating session from Onnx Model...done"; + + VLOG(10) << "Preparing session device..."; + session_->prepareDevice(); + VLOG(10) << "Preparing session device...done"; + + SetWeightsIO(); + + VLOG(10) << "Copy weights from paddle to popart..."; + WeightsFromPaddle(); + VLOG(10) << "Copy weights from paddle to popart...done"; + + VLOG(10) << "Copy weights from host to device..."; + session_->weightsFromHost(); + VLOG(10) << "Copy weights from host to device...done"; + + if (ipu_strategy_->save_init_onnx) { + session_->modelToHost("test_init.onnx"); + } +} + +void Executor::Run(const std::vector &inputs_id, + const std::vector &inputs, + const std::vector &outputs_id, + const std::vector &outputs, + const framework::ExecutionContext &ctx) { + // inputs + std::map popart_inputs; + std::map input_wrappers; + for (size_t i = 0; i < inputs.size(); i++) { + auto tensor_id = inputs_id[i]; + framework::Tensor *tensor = nullptr; + tensor->ShareDataWith(*inputs[i]); + input_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id)); + } + // anchors + std::map popart_anchors; + std::map anchor_wrappers; + for (size_t i = 0; i < outputs.size(); i++) { + auto tensor_id = outputs_id[i]; + framework::Tensor *tensor = nullptr; + tensor->ShareDataWith(*outputs[i]); + // get dims & dtype from session + auto fetch_info = session_->getInfo(tensor_id); + auto output_shape = fetch_info.shape(); + if (ipu_strategy_->batches_per_step > 1) { + output_shape.insert(output_shape.begin(), + ipu_strategy_->batches_per_step); + } + tensor->Resize(framework::make_ddim(output_shape)); + auto fetch_dtype = fetch_info.dataType(); + auto paddle_type = PopartType2VarType(fetch_dtype); + tensor->mutable_data(ctx.GetPlace(), paddle_type); + anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); + } + + if (ipu_strategy_ != nullptr && 
ipu_strategy_->is_training) { + VLOG(10) << "Update optimizer learning rate..."; + SetLR(GetLRFromScope()); + auto popart_optimizer = GetPopartOptimizer(opt_info); + auto &session = dynamic_cast(*session_); + session.updateOptimizerFromHost(popart_optimizer.get()); + } + + popart::StepIO stepio(popart_inputs, popart_anchors); + VLOG(10) << "Running..."; + session_->run(stepio); + VLOG(10) << "Running...done"; + + if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) { + session_->weightsToHost(); + WeightsToPaddle(); + if (ipu_strategy_->save_last_onnx) { + session_->modelToHost("test_last.onnx"); + } + } +} + +void Executor::SetOptimizerType(const std::string &type) { + opt_info.SetType(type); +} + +void Executor::SetLR(float lr_rate) { opt_info.SetLR(lr_rate); } + +void Executor::SetOptimizerAttr(const std::string &attr, float value) { + opt_info.SetAttr(attr, value); +} + +void Executor::SetLoss(const std::string &loss) { opt_info.SetLoss(loss); } + +void Executor::SetLRVarName(const std::string &name) { + opt_info.SetLRVarName(name); +} + +void Executor::SetWeights(const std::vector &weights) { + weights_ = weights; +} + +void Executor::SetWeightsIO() { + auto opt_type = opt_info.GetType(); + auto pre_post_fix = GetOptPrePostfix(opt_type); + for (const auto &weight_id : weights_) { + for (const auto &pair : pre_post_fix) { + if (!IsOptimizerSupported(opt_type)) { + continue; + } + + // pair.first : popart prefix, pair.second : paddle postfix + auto popart_var_name = pair.first + weight_id; + auto paddle_var_name = weight_id + pair.second; + + if (scope_->FindVar(paddle_var_name) == nullptr) { + continue; + } + + auto var = scope_->GetVar(paddle_var_name); + auto data_ptr = var->GetMutable()->data(); + + auto tensor_info = session_->getInfo(popart_var_name); + weights_io_.insert(popart_var_name, {data_ptr, tensor_info}); + } + } +} + +void Executor::WeightsFromPaddle() { session_->writeWeights(weights_io_); } + +void Executor::WeightsToPaddle() { session_->readWeights(weights_io_); } + +void Executor::SetIpuStrategy(const IpuStrategy &strategy) { + ipu_strategy_ = &strategy; +} + +float Executor::GetLRFromScope() { + auto lr_var = scope_->GetVar(opt_info.GetLRVarName()); + auto tensor = lr_var->Get(); + + PADDLE_ENFORCE_EQ(tensor.type(), framework::proto::VarType::FP32, + platform::errors::InvalidArgument( + "LR requiree float, but got (%s).", tensor.type())); + + return tensor.data()[0]; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h new file mode 100644 index 00000000000000..400884a2c2b0fd --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/ipu/common.h" +#include "paddle/fluid/platform/ipu/ipu_optimizer.h" +#include "paddle/fluid/platform/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +class Executor { + public: + Executor(); + ~Executor(); + + void Prepare(const std::string &proto, + const std::map &tensors, + const std::vector &outputs, + std::shared_ptr device); + + void Run(const std::vector &inputs_id, + const std::vector &inputs, + const std::vector &outputs_id, + const std::vector &outputs, + const framework::ExecutionContext &ctx); + + // Optimizer + void SetOptimizerType(const std::string &type); + void SetOptimizerAttr(const std::string &attr, float value); + void SetLoss(const std::string &loss); + void SetLR(float lr_rate); + void SetLRVarName(const std::string &name); + + void SetWeights(const std::vector &info); + + void SetWeightsIO(); + void WeightsFromPaddle(); + void WeightsToPaddle(); + + // Scope + void SetScope(const framework::Scope *scope) { scope_ = scope; } + + // Strategy + void SetIpuStrategy(const IpuStrategy &strategy); + + private: + float GetLRFromScope(); + + public: + OptmizerMetaInfo opt_info; + std::unique_ptr session_; + + private: + const framework::Scope *scope_ = nullptr; + const IpuStrategy *ipu_strategy_ = nullptr; + popart::WeightsIO weights_io_; + std::vector weights_; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_optimizer.cc b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc new file mode 100644 index 00000000000000..ea8ae8e1f026a3 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/ipu/ipu_optimizer.h" + +namespace paddle { +namespace platform { +namespace ipu { + +OptmizerMetaInfo::OptmizerMetaInfo() {} + +OptmizerMetaInfo::~OptmizerMetaInfo() {} + +void OptmizerMetaInfo::SetType(const std::string &type) { + type_ = OptTypeStr2Enum(type); +} + +float OptmizerMetaInfo::GetAttr(const std::string &attr, + float default_value) const { + if (attrs_.count(attr) == 0) { + return default_value; + } + return attrs_.at(attr); +} + +void OptmizerMetaInfo::SetAttr(const std::string &attr, float value) { + attrs_[attr] = value; +} + +OptimizerType OptTypeStr2Enum(const std::string type) { + if (type == "sgd") { + return OptimizerType::SGD; + } else if (type == "adam") { + return OptimizerType::Adam; + } else if (type == "lamb") { + return OptimizerType::Lamb; + } else { + return OptimizerType::Undefined; + } +} + +std::unique_ptr GetPopartOptimizer( + const OptmizerMetaInfo &opt_meta_info) { + auto opt_type = opt_meta_info.GetType(); + PADDLE_ENFORCE_NE( + opt_type, OptimizerType::Undefined, + platform::errors::InvalidArgument("Optimizer type have not been set.")); + + if (opt_type == OptimizerType::SGD) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(popart::SGD::getUnsetWeightDecay()), + popart::OptimizerValue(popart::SGD::getUnsetMomentum()), + popart::OptimizerValue(popart::SGD::getUnsetDampening()), + popart::OptimizerValue(popart::SGD::getUnsetVelocityScaling()), + popart::OptimizerValue(popart::SGD::getUnsetLossScaling())); + return optimizer; + } else if (opt_type == OptimizerType::Adam) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(popart::Adam::getUnsetWeightDecay()), + popart::OptimizerValue(opt_meta_info.GetAttr("beta1"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta2"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("epsilon"), false), + popart::OptimizerValue(popart::Adam::getUnsetLossScaling()), + popart::AdamMode::Adam, popart::WeightDecayMode::Decay, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::FLOAT); + return optimizer; + } else if (opt_type == OptimizerType::Lamb) { + auto optimizer = std::make_unique( + popart::OptimizerValue(opt_meta_info.GetLR(), false), + popart::OptimizerValue(opt_meta_info.GetAttr("weight_decay"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta1"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("beta2"), false), + popart::OptimizerValue(opt_meta_info.GetAttr("epsilon"), false), + popart::OptimizerValue(popart::Adam::getUnsetLossScaling()), + popart::AdamMode::Lamb, popart::WeightDecayMode::Decay, + popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::DataType::FLOAT); + return optimizer; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Optimizer %d is not implemented now.", static_cast(opt_type))); + } +} + +bool IsOptimizerSupported(OptimizerType type) { + switch (type) { + case OptimizerType::SGD: + case OptimizerType::Adam: + case OptimizerType::Lamb: + return true; + default: + return false; + } +} + +std::vector> GetOptPrePostfix( + OptimizerType opt_type) { + // format: {popart_tensor_id, paddle_tensor_id}, ... 
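+  // For SGD only the weight itself is exchanged, so a single empty
+  // prefix/postfix pair is enough. For Adam/Lamb, popart stores the optimizer
+  // state under prefixed tensor ids (Accl1___, Accl2___, Step___) while
+  // paddle keeps it in postfixed variables (_moment1_0, _moment2_0,
+  // _beta1_pow_acc_0); each pair below joins one popart state tensor to its
+  // paddle counterpart (see Executor::SetWeightsIO in ipu_executor.cc).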
+ std::vector> pre_post_fix; + + switch (opt_type) { + case OptimizerType::SGD: + pre_post_fix.push_back(std::make_pair("", "")); + break; + case OptimizerType::Adam: + case OptimizerType::Lamb: + pre_post_fix.push_back(std::make_pair("", "")); + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + break; + default: + pre_post_fix.push_back(std::make_pair("", "")); + break; + } + + return pre_post_fix; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_optimizer.h b/paddle/fluid/platform/device/ipu/ipu_optimizer.h new file mode 100644 index 00000000000000..ee16abce398fb6 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_optimizer.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { + +enum class OptimizerType { SGD = 0, Adam, Lamb, Undefined }; + +class OptmizerMetaInfo { + public: + OptmizerMetaInfo(); + ~OptmizerMetaInfo(); + + void SetType(const std::string &type); + OptimizerType GetType() const { return type_; } + + void SetAttr(const std::string &attr, float value); + float GetAttr(const std::string &attr, float default_value = 0.0f) const; + + void SetLoss(const std::string &loss) { loss_ = loss; } + std::string GetLoss() const { return loss_; } + + void SetLR(float lr_rate) { lr_rate_ = lr_rate; } + float GetLR() const { return lr_rate_; } + + void SetLRVarName(const std::string &name) { lr_var_name_ = name; } + std::string GetLRVarName() const { return lr_var_name_; } + + private: + // type: adam, sgd, ... + OptimizerType type_ = OptimizerType::Undefined; + + // loss: loss TensorId + std::string loss_; + + // attrs: beta1, beta2, ... + std::map attrs_; + + // learning rate + float lr_rate_ = 1.0; + std::string lr_var_name_; +}; + +OptimizerType OptTypeStr2Enum(const std::string type); + +std::unique_ptr GetPopartOptimizer( + const OptmizerMetaInfo &info); + +bool IsOptimizerSupported(OptimizerType type); + +std::vector> GetOptPrePostfix( + OptimizerType type); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc new file mode 100644 index 00000000000000..47e7e332c8fbaf --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/ipu/ipu_strategy.h" + +namespace paddle { +namespace platform { +namespace ipu {} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h new file mode 100644 index 00000000000000..7e07d517e10318 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { +namespace ipu { + +using VirtualGraphMode = popart::VirtualGraphMode; + +struct IpuStrategy { + int num_ipus = 1; + int batches_per_step = 1; + int batch_size = 1; + bool is_training = true; + bool save_init_onnx = false; + bool save_last_onnx = true; + popart::SessionOptions popart_options_; + bool need_avg_shard = false; + bool enable_fp16 = false; +}; + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc new file mode 100644 index 00000000000000..08ba50415dd5ff --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +void* PaddleIArray::data() { return tensor_->data(); } + +popart::DataType PaddleIArray::dataType() const { + return VarType2PopartType(tensor_->type()); +} + +std::size_t PaddleIArray::rank() const { return tensor_->dims().size(); } + +int64_t PaddleIArray::dim(size_t index) const { + return tensor_->dims().at(index); +} + +std::size_t PaddleIArray::nelms() const { + return std::accumulate(shape_.begin(), shape_.end(), static_cast(1), + std::multiplies()); +} + +const popart::Shape PaddleIArray::shape() const { return shape_; } + +popart::DataType VarType2PopartType( + const framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::UINT8: + return popart::DataType::UINT8; + case framework::proto::VarType::INT8: + return popart::DataType::INT8; + case framework::proto::VarType::INT16: + return popart::DataType::INT16; + case framework::proto::VarType::INT32: + return popart::DataType::INT32; + case framework::proto::VarType::INT64: + return popart::DataType::INT64; + case framework::proto::VarType::BOOL: + return popart::DataType::BOOL; + case framework::proto::VarType::FP64: + return popart::DataType::DOUBLE; + case framework::proto::VarType::FP32: + return popart::DataType::FLOAT; + case framework::proto::VarType::FP16: + return popart::DataType::FLOAT16; + case framework::proto::VarType::BF16: + return popart::DataType::BFLOAT16; + case framework::proto::VarType::COMPLEX64: + return popart::DataType::COMPLEX64; + case framework::proto::VarType::COMPLEX128: + return popart::DataType::COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Unsupported Paddle var type.")); + } +} + +framework::proto::VarType::Type PopartType2VarType( + const popart::DataType type) { + switch (type) { + case popart::DataType::UINT8: + return framework::proto::VarType::UINT8; + case popart::DataType::INT8: + return framework::proto::VarType::INT8; + case popart::DataType::INT16: + return framework::proto::VarType::INT16; + case popart::DataType::INT32: + return framework::proto::VarType::INT32; + case popart::DataType::INT64: + return framework::proto::VarType::INT64; + case popart::DataType::BOOL: + return framework::proto::VarType::BOOL; + case popart::DataType::DOUBLE: + return framework::proto::VarType::FP64; + case popart::DataType::FLOAT: + return framework::proto::VarType::FP32; + case popart::DataType::FLOAT16: + return framework::proto::VarType::FP16; + case popart::DataType::BFLOAT16: + return framework::proto::VarType::BF16; + case popart::DataType::COMPLEX64: + return framework::proto::VarType::COMPLEX64; + case popart::DataType::COMPLEX128: + return framework::proto::VarType::COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Unsupported Paddle var type.")); + } +} + +popart::DataType OnnxDtype2PopartType(const int type) { + auto dtype = static_cast(type); + switch (dtype) { + case ONNXDataType::BOOL: + return popart::DataType::BOOL; + case ONNXDataType::INT16: + return popart::DataType::INT16; + case ONNXDataType::INT32: + return popart::DataType::INT32; + case ONNXDataType::INT64: + return popart::DataType::INT64; + case ONNXDataType::FLOAT16: + return popart::DataType::FLOAT16; + case ONNXDataType::FLOAT: + return popart::DataType::FLOAT; + case ONNXDataType::DOUBLE: + return popart::DataType::DOUBLE; + case ONNXDataType::UINT8: + return popart::DataType::UINT8; + case ONNXDataType::INT8: + 
return popart::DataType::INT8; + case ONNXDataType::BFLOAT16: + return popart::DataType::BFLOAT16; + case ONNXDataType::COMPLEX64: + return popart::DataType::COMPLEX64; + case ONNXDataType::COMPLEX128: + return popart::DataType::COMPLEX128; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported ONNX data type: %d.", dtype)); + } +} + +// count num should > 0 +bool GetBoolEnv(std::string str) { + char* str_val = getenv(str.c_str()); + if (str_val == NULL) { + return false; + } else { + bool val = false; + if (strcmp(str_val, "1") == 0 || strcmp(str_val, "true") == 0 || + strcmp(str_val, "True") == 0 || strcmp(str_val, "TRUE") == 0) + val = true; + return val; + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h new file mode 100644 index 00000000000000..670427128b8704 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// onnx dtype +// https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 +enum ONNXDataType : int { + UNDEFINED = 0, + FLOAT = 1, + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + INT32 = 6, + INT64 = 7, + STRING = 8, + BOOL = 9, + FLOAT16 = 10, + DOUBLE = 11, + UINT32 = 12, + UINT64 = 13, + COMPLEX64 = 14, + COMPLEX128 = 15, + BFLOAT16 = 16 +}; + +class PaddleIArray final : public popart::IArray { + public: + explicit PaddleIArray(framework::Tensor *tensor) : tensor_(tensor) { + for (int i = 0; i < tensor->dims().size(); ++i) { + shape_.push_back(tensor->dims().at(i)); + } + } + + public: + void *data(); + popart::DataType dataType() const; + std::size_t rank() const; + int64_t dim(size_t index) const; + std::size_t nelms() const; + const popart::Shape shape() const; + + private: + framework::Tensor *tensor_; + std::vector shape_; +}; + +popart::DataType VarType2PopartType(const framework::proto::VarType::Type type); +framework::proto::VarType::Type PopartType2VarType(const popart::DataType type); +popart::DataType OnnxDtype2PopartType(const int type); +bool GetBoolEnv(std::string str); + +template +std::unique_ptr> Tensor2IArray( + const framework::Tensor &tensor) { + auto dtype = VarType2PopartType(tensor.type()); + auto shape = std::vector(); + for (size_t i = 0; i < tensor.dims().size(); ++i) { + shape.push_back(tensor.dims().at(i)); + } + popart::TensorInfo tensor_info(dtype, shape); + + return std::make_unique>( + reinterpret_cast(tensor.data()), tensor_info); +} + +template +std::unique_ptr> LoDTensor2IArray( + framework::LoDTensor const &lod_tensor) { + PADDLE_ENFORCE_EQ( + lod_tensor.lod().size(), 0UL, + platform::errors::InvalidArgument("LoDTensor2IArray is Unimplemented")); + 
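+  // With no LoD levels to preserve, the LoDTensor can be wrapped exactly like
+  // a plain Tensor.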
return Tensor2IArray(lod_tensor); +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h new file mode 100644 index 00000000000000..4cd7f928f6e22b --- /dev/null +++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h @@ -0,0 +1,197 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// clang-format off + +#pragma once + +// Ops from AiGraphcoreOpset1 +OP_DECL(popart_groupnormalization_v2, aiGraphcoreOpset.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon) ) // NOLINT +OP_DECL(popart_subsample_v2, aiGraphcoreOpset.subsample, ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_nop_v2, aiGraphcoreOpset.nop, NONE) // NOLINT +OP_DECL(popart_scale_v2, aiGraphcoreOpset.scale, ARG(FLOAT,scale) ) // NOLINT +OP_DECL(popart_scaledadd_v2, aiGraphcoreOpset.scaledadd, ARG(FLOAT,scale0) ARG(FLOAT,scale1) ) // NOLINT +OP_DECL(popart_gelu_v2, aiGraphcoreOpset.gelu, NONE) // NOLINT +OP_DECL(popart_detach_v2, aiGraphcoreOpset.detach, NONE) // NOLINT +OP_DECL(popart_depthtospace_v2, aiGraphcoreOpset.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_round_v2, aiGraphcoreOpset.round, NONE) // NOLINT +OP_DECL(popart_dynamicslice_v2, aiGraphcoreOpset.dynamicslice, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) ) // NOLINT +OP_DECL(popart_dynamicupdate_v2, aiGraphcoreOpset.dynamicupdate, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ARG(INT,noOverlap) ) // NOLINT +OP_DECL(popart_dynamiczero_v2, aiGraphcoreOpset.dynamiczero, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ) // NOLINT +OP_DECL(popart_dynamicadd_v2, aiGraphcoreOpset.dynamicadd, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ) // NOLINT +OP_DECL(popart_sequenceslice_v2, aiGraphcoreOpset.sequenceslice, ARG(INT,zeroUnused) ) // NOLINT +OP_DECL(popart_replicatedallreduce_v2, aiGraphcoreOpset.replicatedallreduce, OPT_ARG(INT_VEC,commGroup) ) // NOLINT +OP_DECL(popart_ctcbeamsearchdecoder_v2, aiGraphcoreOpset.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) ) // NOLINT +OP_DECL(popart_shapeddropout_v2, aiGraphcoreOpset.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) ) // NOLINT +OP_DECL(popart_atan2_v2, aiGraphcoreOpset.atan2, NONE) // NOLINT +OP_DECL(popart_expm1_v2, aiGraphcoreOpset.expm1, NONE) // NOLINT +OP_DECL(popart_log1p_v2, aiGraphcoreOpset.log1p, NONE) // NOLINT +OP_DECL(popart_fmod_v2, aiGraphcoreOpset.fmod, NONE) // NOLINT +OP_DECL(popart_remainder_v2, aiGraphcoreOpset.remainder, NONE) // NOLINT +OP_DECL(popart_reverse_v2, aiGraphcoreOpset.reverse, ARG(INT_VEC,dimensions) ) // NOLINT +OP_DECL(popart_bitwisenot_v2, aiGraphcoreOpset.bitwisenot, NONE) // NOLINT +OP_DECL(popart_bitwiseand_v2, aiGraphcoreOpset.bitwiseand, NONE) // NOLINT +OP_DECL(popart_bitwiseor_v2, aiGraphcoreOpset.bitwiseor, NONE) // NOLINT +OP_DECL(popart_bitwisexor_v2, aiGraphcoreOpset.bitwisexor, NONE) // NOLINT 
+OP_DECL(popart_bitwisexnor_v2, aiGraphcoreOpset.bitwisexnor, NONE) // NOLINT +OP_DECL(popart_reducemedian_v2, aiGraphcoreOpset.reducemedian, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +// Ops from AiOnnxOpset11 +OP_DECL(popart_argmax, aiOnnxOpset.argmax, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_argmin, aiOnnxOpset.argmin, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_averagepool, aiOnnxOpset.averagepool, ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT,count_include_pad) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_bitshift, aiOnnxOpset.bitshift, ARG(STRING,direction) ) // NOLINT +OP_DECL(popart_clip, aiOnnxOpset.clip, NONE) // NOLINT +OP_DECL(popart_compress, aiOnnxOpset.compress, OPT_ARG(INT,axis) ) // NOLINT +OP_DECL(popart_concat, aiOnnxOpset.concat, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_concatfromsequence, aiOnnxOpset.concatfromsequence, ARG(INT,axis) ARG(INT,new_axis) ) // NOLINT +OP_DECL(popart_conv, aiOnnxOpset.conv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_convtranspose, aiOnnxOpset.convtranspose, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,output_padding) ARG(INT_VEC,output_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_cumsum, aiOnnxOpset.cumsum, ARG(INT,exclusive) ARG(INT,reverse) ) // NOLINT +OP_DECL(popart_depthtospace, aiOnnxOpset.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_det, aiOnnxOpset.det, NONE) // NOLINT +OP_DECL(popart_dynamicquantizelinear, aiOnnxOpset.dynamicquantizelinear, NONE) // NOLINT +OP_DECL(popart_equal, aiOnnxOpset.equal, NONE) // NOLINT +OP_DECL(popart_flatten, aiOnnxOpset.flatten, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gather, aiOnnxOpset.gather, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gatherelements, aiOnnxOpset.gatherelements, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_gathernd, aiOnnxOpset.gathernd, NONE) // NOLINT +OP_DECL(popart_gemm, aiOnnxOpset.gemm, ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(INT,transA) ARG(INT,transB) ) // NOLINT +OP_DECL(popart_hardmax, aiOnnxOpset.hardmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_logsoftmax, aiOnnxOpset.logsoftmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_lppool, aiOnnxOpset.lppool, ARG(INT_VEC,kernel_shape) ARG(INT,p) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_maxpool, aiOnnxOpset.maxpool, ARG(INT,num_outputs) ARG(INT_VEC,kernel_shape) ARG(INT,ceil_mode) ARG(INT_VEC,dilations) ARG(INT_VEC,pads) ARG(INT,storage_order) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_maxunpool, aiOnnxOpset.maxunpool, ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_nonmaxsuppression, aiOnnxOpset.nonmaxsuppression, ARG(INT,center_point_box) ) // NOLINT +OP_DECL(popart_onehot, aiOnnxOpset.onehot, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_pad, aiOnnxOpset.pad, ARG(STRING,mode) ) // NOLINT +OP_DECL(popart_range, aiOnnxOpset.range, NONE) // NOLINT +OP_DECL(popart_reducel1, aiOnnxOpset.reducel1, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducel2, aiOnnxOpset.reducel2, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducelogsum, aiOnnxOpset.reducelogsum, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducelogsumexp, aiOnnxOpset.reducelogsumexp, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemax, aiOnnxOpset.reducemax, 
OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemean, aiOnnxOpset.reducemean, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducemin, aiOnnxOpset.reducemin, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reduceprod, aiOnnxOpset.reduceprod, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducesum, aiOnnxOpset.reducesum, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_reducesumsquare, aiOnnxOpset.reducesumsquare, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_resize, aiOnnxOpset.resize, ARG(STRING,coordinate_transformation_mode) ARG(FLOAT,cubic_coeff_a) ARG(INT,exclude_outside) ARG(FLOAT,extrapolation_value) ARG(STRING,mode) ARG(STRING,nearest_mode) ) // NOLINT +OP_DECL(popart_round, aiOnnxOpset.round, NONE) // NOLINT +OP_DECL(popart_scatter, aiOnnxOpset.scatter, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_scatterelements, aiOnnxOpset.scatterelements, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_scatternd, aiOnnxOpset.scatternd, NONE) // NOLINT +OP_DECL(popart_sequenceat, aiOnnxOpset.sequenceat, NONE) // NOLINT +OP_DECL(popart_sequenceconstruct, aiOnnxOpset.sequenceconstruct, NONE) // NOLINT +OP_DECL(popart_sequenceerase, aiOnnxOpset.sequenceerase, NONE) // NOLINT +OP_DECL(popart_sequenceinsert, aiOnnxOpset.sequenceinsert, NONE) // NOLINT +OP_DECL(popart_sequencelength, aiOnnxOpset.sequencelength, NONE) // NOLINT +OP_DECL(popart_slice, aiOnnxOpset.slice, NONE) // NOLINT +OP_DECL(popart_softmax, aiOnnxOpset.softmax, ARG(INT,axis) ) // NOLINT +OP_DECL(popart_split, aiOnnxOpset.split, ARG(INT,num_outputs) ARG(INT,axis) ARG(INT_VEC,split) ) // NOLINT +OP_DECL(popart_splittosequence, aiOnnxOpset.splittosequence, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT +OP_DECL(popart_squeeze, aiOnnxOpset.squeeze, ARG(INT_VEC,axes) ) // NOLINT +OP_DECL(popart_topk, aiOnnxOpset.topk, ARG(INT,axis) ARG(INT,largest) ARG(INT,sorted) ) // NOLINT +OP_DECL(popart_unique, aiOnnxOpset.unique, ARG(INT,num_outputs) OPT_ARG(INT,axis) ARG(INT,sorted) ) // NOLINT +OP_DECL(popart_unsqueeze, aiOnnxOpset.unsqueeze, ARG(INT_VEC,axes) ) // NOLINT +// Ops from AiOnnxOpset10 +OP_DECL(popart_convinteger, aiOnnxOpset.convinteger, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_dequantizelinear, aiOnnxOpset.dequantizelinear, NONE) // NOLINT +OP_DECL(popart_dropout, aiOnnxOpset.dropout, ARG(INT,num_outputs) ARG(FLOAT,ratio) ) // NOLINT +OP_DECL(popart_isinf, aiOnnxOpset.isinf, ARG(INT,detect_negative) ARG(INT,detect_positive) ) // NOLINT +OP_DECL(popart_matmulinteger, aiOnnxOpset.matmulinteger, NONE) // NOLINT +OP_DECL(popart_mod, aiOnnxOpset.mod, ARG(INT,fmod) ) // NOLINT +OP_DECL(popart_qlinearconv, aiOnnxOpset.qlinearconv, ARG(INT_VEC,dilations) ARG(INT,group) ARG(INT_VEC,kernel_shape) ARG(INT_VEC,pads) ARG(INT_VEC,strides) ) // NOLINT +OP_DECL(popart_qlinearmatmul, aiOnnxOpset.qlinearmatmul, NONE) // NOLINT +OP_DECL(popart_quantizelinear, aiOnnxOpset.quantizelinear, NONE) // NOLINT +OP_DECL(popart_reversesequence, aiOnnxOpset.reversesequence, ARG(INT,batch_axis) ARG(INT,time_axis) ) // NOLINT +OP_DECL(popart_roialign, aiOnnxOpset.roialign, ARG(STRING,mode) ARG(INT,output_height) ARG(INT,output_width) ARG(INT,sampling_ratio) ARG(FLOAT,spatial_scale) ) // NOLINT +OP_DECL(popart_thresholdedrelu, aiOnnxOpset.thresholdedrelu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_upsample, aiOnnxOpset.upsample, ARG(STRING,mode) ) // NOLINT 
+// Ops from AiOnnxOpset9 +OP_DECL(popart_acosh, aiOnnxOpset.acosh, NONE) // NOLINT +OP_DECL(popart_asinh, aiOnnxOpset.asinh, NONE) // NOLINT +OP_DECL(popart_atanh, aiOnnxOpset.atanh, NONE) // NOLINT +OP_DECL(popart_batchnormalization, aiOnnxOpset.batchnormalization, ARG(INT,num_outputs) ARG(FLOAT,epsilon) ARG(FLOAT,momentum) ) // NOLINT +OP_DECL(popart_cast, aiOnnxOpset.cast, ARG(STRING,to) ) // NOLINT +OP_DECL(popart_cosh, aiOnnxOpset.cosh, NONE) // NOLINT +OP_DECL(popart_erf, aiOnnxOpset.erf, NONE) // NOLINT +OP_DECL(popart_eyelike, aiOnnxOpset.eyelike, OPT_ARG(INT,dtype) ARG(INT,k) ) // NOLINT +OP_DECL(popart_greater, aiOnnxOpset.greater, NONE) // NOLINT +OP_DECL(popart_isnan, aiOnnxOpset.isnan, NONE) // NOLINT +OP_DECL(popart_less, aiOnnxOpset.less, NONE) // NOLINT +OP_DECL(popart_matmul, aiOnnxOpset.matmul, NONE) // NOLINT +OP_DECL(popart_meanvariancenormalization, aiOnnxOpset.meanvariancenormalization, ARG(INT_VEC,axes) ) // NOLINT +OP_DECL(popart_nonzero, aiOnnxOpset.nonzero, NONE) // NOLINT +OP_DECL(popart_prelu, aiOnnxOpset.prelu, NONE) // NOLINT +OP_DECL(popart_shrink, aiOnnxOpset.shrink, ARG(FLOAT,bias) ARG(FLOAT,lambd) ) // NOLINT +OP_DECL(popart_sign, aiOnnxOpset.sign, NONE) // NOLINT +OP_DECL(popart_sinh, aiOnnxOpset.sinh, NONE) // NOLINT +OP_DECL(popart_where, aiOnnxOpset.where, NONE) // NOLINT +// Ops from AiOnnxOpset8 +OP_DECL(popart_expand, aiOnnxOpset.expand, NONE) // NOLINT +OP_DECL(popart_max, aiOnnxOpset.max, NONE) // NOLINT +OP_DECL(popart_mean, aiOnnxOpset.mean, NONE) // NOLINT +OP_DECL(popart_min, aiOnnxOpset.min, NONE) // NOLINT +OP_DECL(popart_sum, aiOnnxOpset.sum, NONE) // NOLINT +// Ops from AiOnnxOpset7 +OP_DECL(popart_acos, aiOnnxOpset.acos, NONE) // NOLINT +OP_DECL(popart_add, aiOnnxOpset.add, NONE) // NOLINT +OP_DECL(popart_logical_and, aiOnnxOpset.logical_and, NONE) // NOLINT +OP_DECL(popart_asin, aiOnnxOpset.asin, NONE) // NOLINT +OP_DECL(popart_atan, aiOnnxOpset.atan, NONE) // NOLINT +OP_DECL(popart_cos, aiOnnxOpset.cos, NONE) // NOLINT +OP_DECL(popart_div, aiOnnxOpset.div, NONE) // NOLINT +OP_DECL(popart_mul, aiOnnxOpset.mul, NONE) // NOLINT +OP_DECL(popart_multinomial, aiOnnxOpset.multinomial, ARG(INT,dtype) ARG(INT,sample_size) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_logical_or, aiOnnxOpset.logical_or, NONE) // NOLINT +OP_DECL(popart_pow, aiOnnxOpset.pow, NONE) // NOLINT +OP_DECL(popart_sin, aiOnnxOpset.sin, NONE) // NOLINT +OP_DECL(popart_sub, aiOnnxOpset.sub, NONE) // NOLINT +OP_DECL(popart_tan, aiOnnxOpset.tan, NONE) // NOLINT +OP_DECL(popart_logical_xor, aiOnnxOpset.logical_xor, NONE) // NOLINT +// Ops from AiOnnxOpset6 +OP_DECL(popart_abs, aiOnnxOpset.abs, NONE) // NOLINT +OP_DECL(popart_ceil, aiOnnxOpset.ceil, NONE) // NOLINT +OP_DECL(popart_elu, aiOnnxOpset.elu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_exp, aiOnnxOpset.exp, NONE) // NOLINT +OP_DECL(popart_floor, aiOnnxOpset.floor, NONE) // NOLINT +OP_DECL(popart_globalaveragepool, aiOnnxOpset.globalaveragepool, NONE) // NOLINT +OP_DECL(popart_globallppool, aiOnnxOpset.globallppool, ARG(INT,p) ) // NOLINT +OP_DECL(popart_globalmaxpool, aiOnnxOpset.globalmaxpool, NONE) // NOLINT +OP_DECL(popart_hardsigmoid, aiOnnxOpset.hardsigmoid, ARG(FLOAT,alpha) ARG(FLOAT,beta) ) // NOLINT +OP_DECL(popart_identity, aiOnnxOpset.identity, NONE) // NOLINT +OP_DECL(popart_instancenormalization, aiOnnxOpset.instancenormalization, ARG(FLOAT,epsilon) ) // NOLINT +OP_DECL(popart_lrn, aiOnnxOpset.lrn, ARG(INT,size) ARG(FLOAT,alpha) ARG(FLOAT,beta) ARG(FLOAT,bias) ) // NOLINT +OP_DECL(popart_leakyrelu, 
aiOnnxOpset.leakyrelu, ARG(FLOAT,alpha) ) // NOLINT +OP_DECL(popart_log, aiOnnxOpset.log, NONE) // NOLINT +OP_DECL(popart_lpnormalization, aiOnnxOpset.lpnormalization, ARG(INT,axis) ARG(INT,p) ) // NOLINT +OP_DECL(popart_maxroipool, aiOnnxOpset.maxroipool, ARG(INT_VEC,pooled_shape) ARG(FLOAT,spatial_scale) ) // NOLINT +OP_DECL(popart_neg, aiOnnxOpset.neg, NONE) // NOLINT +OP_DECL(popart_logical_not, aiOnnxOpset.logical_not, NONE) // NOLINT +OP_DECL(popart_randomnormallike, aiOnnxOpset.randomnormallike, OPT_ARG(INT,dtype) ARG(FLOAT,mean) ARG(FLOAT,scale) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_randomuniformlike, aiOnnxOpset.randomuniformlike, OPT_ARG(INT,dtype) ARG(FLOAT,high) ARG(FLOAT,low) OPT_ARG(FLOAT,seed) ) // NOLINT +OP_DECL(popart_reciprocal, aiOnnxOpset.reciprocal, NONE) // NOLINT +OP_DECL(popart_relu, aiOnnxOpset.relu, NONE) // NOLINT +OP_DECL(popart_reshape, aiOnnxOpset.reshape, NONE) // NOLINT +OP_DECL(popart_selu, aiOnnxOpset.selu, ARG(FLOAT,alpha) ARG(FLOAT,gamma) ) // NOLINT +OP_DECL(popart_shape, aiOnnxOpset.shape, NONE) // NOLINT +OP_DECL(popart_sigmoid, aiOnnxOpset.sigmoid, NONE) // NOLINT +OP_DECL(popart_size, aiOnnxOpset.size, NONE) // NOLINT +OP_DECL(popart_softplus, aiOnnxOpset.softplus, NONE) // NOLINT +OP_DECL(popart_softsign, aiOnnxOpset.softsign, NONE) // NOLINT +OP_DECL(popart_spacetodepth, aiOnnxOpset.spacetodepth, ARG(INT,blocksize) ) // NOLINT +OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT +OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT +OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT +OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT From 075a02d24e0f77d37cf628c348311843a1e6698a Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Fri, 3 Dec 2021 18:57:26 +0800 Subject: [PATCH 065/124] Fix _numel func logic and add test (#37810) --- paddle/fluid/pybind/imperative.cc | 4 ---- python/paddle/fluid/tests/unittests/test_var_base.py | 8 +++++++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index f03acc3808468d..29a1f0eafcb219 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1963,10 +1963,6 @@ void BindImperative(py::module *m_ptr) { .def("_numel", [](std::shared_ptr &self) { auto *t = self->MutableVar()->GetMutable(); - PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, - platform::errors::InvalidArgument( - "Tensor %s has not been initialized!", self->Name())); return t->numel(); }) .def_property("name", &imperative::VarBase::Name, diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 0e50a20a04e651..ab6e8003833ec8 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -1279,7 +1279,7 @@ def test_varbase_init(self): class TestVarBaseNumel(unittest.TestCase): - def test_numel(self): + def test_numel_normal(self): paddle.disable_static() np_x = np.random.random((3, 8, 8)) x = paddle.to_tensor(np_x, dtype="float64") @@ -1287,6 +1287,12 @@ def test_numel(self): x_expected_numel = np.product((3, 8, 8)) self.assertEqual(x_actual_numel, x_expected_numel) + def test_numel_without_holder(self): + paddle.disable_static() + x_without_holder = core.VarBase() + x_actual_numel = x_without_holder._numel() + self.assertEqual(x_actual_numel, 0) + class TestVarBaseCopyGradientFrom(unittest.TestCase): def test_copy_gradient_from(self): From 
5d0ce171bb28d54c20a45f2e60989a71348f70bd Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Fri, 3 Dec 2021 21:59:12 +0800 Subject: [PATCH 066/124] add time wait for message bus (#37809) --- .../fleet_executor/compute_interceptor.cc | 71 +++++++++++++------ .../fleet_executor/fleet_executor.cc | 9 +++ .../distributed/fleet_executor/message_bus.cc | 33 ++++++++- 3 files changed, 88 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 084e91c11caa73..41c77c1ead045f 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -46,6 +46,8 @@ void ComputeInterceptor::PrepareDeps() { "Source ComputeInterceptor must run at least one " "times, but now max_run_times=%ld", node_->max_run_times())); + in_readys_.emplace(-1, + std::make_pair(std::numeric_limits::max(), 0)); } // If there is no downstream or every downstream is in different rank, @@ -55,14 +57,17 @@ void ComputeInterceptor::PrepareDeps() { } void ComputeInterceptor::IncreaseReady(int64_t up_id) { - // source node has no upstream, data_is_ready is send by carrier or others - if (is_source_ && up_id == -1) return; - auto it = in_readys_.find(up_id); PADDLE_ENFORCE_NE(it, in_readys_.end(), platform::errors::NotFound( "Cannot find upstream=%lld in in_readys.", up_id)); + // source node has no upstream, data_is_ready is send by carrier or others + if (is_source_ && up_id == -1) { + it->second.second = GetTaskNode()->max_run_times(); + return; + } + auto max_ready_size = it->second.first; auto ready_size = it->second.second; ready_size += 1; @@ -93,7 +98,11 @@ bool ComputeInterceptor::IsInputReady() { for (auto& ins : in_readys_) { auto ready_size = ins.second.second; // not ready, return false - if (ready_size == 0) return false; + if (ready_size == 0) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << "'s upstreams aren't all ready."; + return false; + } } return true; } @@ -103,14 +112,23 @@ bool ComputeInterceptor::CanWriteOutput() { auto max_buffer_size = outs.second.first; auto used_size = outs.second.second; // full, return false - if (used_size == max_buffer_size) return false; + if (used_size == max_buffer_size) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << "'s out buffer is full."; + return false; + } } return true; } // only source node need reset bool ComputeInterceptor::ShouldReset() { - return is_source_ && (step_ == node_->max_run_times()); + if (is_source_ && step_ == node_->max_run_times()) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << " should reset for step: " << step_ << "."; + return true; + } + return false; } void ComputeInterceptor::SendDataReadyToDownStream() { @@ -130,7 +148,8 @@ void ComputeInterceptor::SendDataReadyToDownStream() { InterceptorMessage ready_msg; ready_msg.set_message_type(DATA_IS_READY); VLOG(3) << "ComputeInterceptor " << interceptor_id_ - << " Send data_is_ready msg to " << down_id; + << " Send data_is_ready msg to " << down_id + << " for step: " << step_; Send(down_id, ready_msg); } } @@ -147,23 +166,43 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { ready_size)); ins.second.second = ready_size; + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Reply data_is_useless msg to " << up_id + << " for step: " << step_; + if (up_id == -1) return; + InterceptorMessage reply_msg; reply_msg.set_message_type(DATE_IS_USELESS); - VLOG(3) << 
"ComputeInterceptor " << interceptor_id_ - << " Reply data_is_useless msg to " << up_id; Send(up_id, reply_msg); } } void ComputeInterceptor::RunOps() { VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " - << step_ << " time."; + << step_ + 1 << " time."; for (auto op : node_->ops()) { op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_); } } void ComputeInterceptor::Run() { + // If there is no limit, source interceptor can be executed + // an unlimited number of times. + // Now source node can only run max_run_times. + if (ShouldReset()) { + for (auto& out_buff : out_buffs_) { + // buffer is using + if (out_buff.second.second != 0) { + VLOG(3) << "Interceptor " << GetInterceptorId() + << " out buffer for downstream: " << out_buff.first + << "'s counter is: " << out_buff.second.second + << ". Cannot be reset."; + return; + } + } + step_ = 0; // reset + } + while (IsInputReady() && CanWriteOutput() && !ShouldReset()) { VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running"; @@ -181,18 +220,6 @@ void ComputeInterceptor::Run() { StopCarrier(); } } - - // If there is no limit, source interceptor can be executed - // an unlimited number of times. - // Now source node can only run max_run_times. - if (ShouldReset()) { - for (auto& out_buff : out_buffs_) { - // buffer is using - if (out_buff.second.second != 0) return; - } - step_ = 0; // reset - return; - } } void ComputeInterceptor::ReceivedStop(int64_t up_id) { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 3479157de5c454..3a823674d842c5 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -109,6 +109,15 @@ void FleetExecutor::Run() { message_bus_instance.IsInit(), true, platform::errors::Unavailable("MessageBus has not been init yet.")); carrier_instance.Start(); + for (auto* micro_scop : microbatch_scopes_) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. 
+ micro_scop->DropKids(); + } } void FleetExecutor::CopyParameters(int microbatch_id, diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 688a6f3a388218..f087de69fa96b2 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "paddle/fluid/distributed/fleet_executor/carrier.h" @@ -56,11 +57,11 @@ void MessageBus::Init( bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { - VLOG(3) << "Message bus releases resource."; // NOTE: fleet_executor inits carrier before message bus, // therefore the message bus's destructor will be called first Carrier& carrier = Carrier::Instance(); carrier.Release(); + VLOG(3) << "Message bus releases resource."; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) server_.Stop(1000); @@ -90,6 +91,8 @@ bool MessageBus::Send(const InterceptorMessage& interceptor_message) { << retry_time << " times retries."; return true; } + VLOG(3) << "Message bus sends failed, retry after 1 seconds."; + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } VLOG(3) << "Message bus sends inter rank fail after 10 times retries."; return false; @@ -121,16 +124,40 @@ void MessageBus::ListenPort() { brpc::ServerOptions options; options.idle_timeout_sec = -1; int retry_times = 0; - int interval = 1000; + int interval = 100; while (server_.Start(ip_for_brpc, &options) != 0) { ++retry_times; LOG(INFO) << "Message bus is retring for starting brpc for " << retry_times << " times. And will retry after " << interval / 1000 << " seconds."; std::this_thread::sleep_for(std::chrono::milliseconds(interval)); - interval += 2000; + interval += 500; } LOG(INFO) << "Message bus's listen port thread starts successful."; + + std::set visit; + InterceptorMessage tmp_msg; + tmp_msg.set_ctrl_message(true); + for (auto pair : interceptor_id_to_rank_) { + if (rank_to_addr_.at(pair.second) == addr_) { + tmp_msg.set_src_id(pair.first); + } + } + for (auto pair : interceptor_id_to_rank_) { + int64_t rank = pair.second; + if (rank_to_addr_.at(rank) == addr_) { + continue; + } + tmp_msg.set_dst_id(pair.first); + if (visit.find(rank) == visit.end()) { + VLOG(3) << "Message bus is testing connection for rank: " << rank << "."; + visit.insert(rank); + while (!Send(tmp_msg)) { + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + VLOG(3) << "Message bus has connected to rank: " << rank << "."; + } + } #else LOG(WARNING) << "Fleet executor's ListenPort() is a fake function when Paddle is " From 21b307ca4e59173a7588281b92a037cd9cf1dcd6 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Sat, 4 Dec 2021 10:15:08 +0800 Subject: [PATCH 067/124] fix distributed.service (#37843) this pr fix distributed service --- paddle/fluid/distributed/service/service.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc index 29941e36ea0513..698ceb1578f47e 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -51,7 +51,6 @@ void PSCore::init_gflag(const std::string& gflags) { std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); - flags.push_back("-bthread_concurrency=200"); 
flags.push_back("-socket_max_unwritten_bytes=2048000000"); flags.push_back("-max_connection_pool_size=1950"); } From 1bdb857827b467739a3d8d4a4d68d070fc3e527c Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Mon, 6 Dec 2021 09:01:15 +0800 Subject: [PATCH 068/124] heter for collective (#37613) --- .../framework/distributed_strategy.proto | 1 + paddle/fluid/imperative/CMakeLists.txt | 3 + paddle/fluid/imperative/bkcl_context.cc | 17 ++ paddle/fluid/imperative/bkcl_context.h | 2 + paddle/fluid/imperative/gloo_context.cc | 7 +- paddle/fluid/imperative/gloo_context.h | 2 + paddle/fluid/imperative/hccl_context.cc | 23 ++ paddle/fluid/imperative/hccl_context.h | 2 + paddle/fluid/imperative/heter_ccl_context.cc | 203 ++++++++++++++++++ paddle/fluid/imperative/heter_ccl_context.h | 78 +++++++ paddle/fluid/imperative/nccl_context.cc | 22 ++ paddle/fluid/imperative/nccl_context.h | 2 + paddle/fluid/imperative/parallel_context.h | 2 + paddle/fluid/imperative/reducer.cc | 76 ++++++- paddle/fluid/imperative/reducer.h | 5 +- paddle/fluid/imperative/tests/CMakeLists.txt | 2 + .../tests/heter_ccl_context_test.cc | 89 ++++++++ .../imperative/tests/nccl_context_test.cc | 50 +++++ paddle/fluid/pybind/CMakeLists.txt | 11 +- paddle/fluid/pybind/imperative.cc | 10 + .../fleet/base/distributed_strategy.py | 31 +++ .../distributed/fleet/base/fleet_base.py | 45 ++-- .../fleet/base/meta_optimizer_factory.py | 3 +- python/paddle/distributed/fleet/launch.py | 56 ++--- .../paddle/distributed/fleet/launch_utils.py | 82 +++++-- .../fleet/meta_optimizers/__init__.py | 1 + .../dygraph_optimizer/__init__.py | 1 + .../heter_parallel_optimizer.py | 66 ++++++ python/paddle/distributed/parallel.py | 42 ++-- .../paddle/fluid/dygraph/parallel_helper.py | 4 +- 30 files changed, 859 insertions(+), 79 deletions(-) create mode 100644 paddle/fluid/imperative/heter_ccl_context.cc create mode 100644 paddle/fluid/imperative/heter_ccl_context.h create mode 100644 paddle/fluid/imperative/tests/heter_ccl_context_test.cc create mode 100755 python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index ae5c9504ecb6ee..739e05e1d79712 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -305,6 +305,7 @@ message DistributedStrategy { optional bool semi_auto = 35 [ default = false ]; optional bool adam_d2sum = 36 [ default = true ]; optional bool auto_search = 37 [ default = false ]; + optional bool heter_ccl_mode = 38 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 9121610d29eaa0..594b0d48a8aad8 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -30,6 +30,9 @@ if(NOT WIN32) cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) cc_library(reducer SRCS reducer.cc DEPS layer) endif() + if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) + cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) + endif() cc_library(data_loader SRCS data_loader.cc DEPS enforce) endif(NOT WIN32) if(WITH_GLOO) diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 
8c6b840f60a591..6569929d6f5d74 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -150,6 +150,23 @@ void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, } } +void BKCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::BKCLComm *comm = + platform::BKCLCommContext::Instance().Get(ring_id, place); + XPUStream stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto data_type = platform::ToBKCLDataType(src_tensor->type()); + + PADDLE_ENFORCE_EQ(bkcl_broadcast(comm->comm(), src_ptr, src_ptr, + src_tensor->numel(), data_type, 0, stream), + BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_broadcast failed")); +} + paddle::platform::DeviceContext *BKCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index 652b7689666c6c..a5a10b19389c0d 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -42,6 +42,8 @@ class BKCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index ef1bf0d158787e..1eaf0c6538043f 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -37,7 +37,7 @@ void GLOOParallelContext::Init() { gloo_wrapper->SetSize(strategy_.nranks_); gloo_wrapper->SetRank(strategy_.local_rank_); gloo_wrapper->SetPrefix(""); - gloo_wrapper->SetIface("lo"); + gloo_wrapper->SetIface(""); auto addr = paddle::string::Split(strategy_.trainer_endpoints_[0], ':'); VLOG(4) << "Server is" << strategy_.trainer_endpoints_[0]; std::string host = addr[0]; @@ -176,6 +176,11 @@ void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, } } +void GLOOParallelContext::Broadcast(framework::Variable *src, int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented inter-broadcast for CPU now.")); +} + paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext( int ring_id) { // return the CPUDeviceContext diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 305a75a881153f..e7c9ba4cfddb65 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -47,6 +47,8 @@ class GLOOParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/hccl_context.cc b/paddle/fluid/imperative/hccl_context.cc index 4f1135fa9ddd48..55c52ae6c11de8 100644 --- a/paddle/fluid/imperative/hccl_context.cc +++ b/paddle/fluid/imperative/hccl_context.cc @@ -158,6 +158,29 @@ void HCCLParallelContext::AllReduceByStream(const framework::Variable &src, } } +void HCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + 
VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + if (src->IsType()) { + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::HCCLComm *comm = + platform::HCCLCommContext::Instance().Get(ring_id, place); + aclrtStream stream = comm->stream(); + + void *src_ptr = + reinterpret_cast(const_cast(src_tensor->data())); + auto hccl_dtype = platform::ToHCCLDataType(src_tensor->type()); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + src_ptr, src_tensor->numel(), hccl_dtype, 0, comm->comm(), + reinterpret_cast(stream))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor is supported.", + platform::demangle(framework::ToTypeName(src->Type())))); + } +} + paddle::platform::DeviceContext *HCCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( diff --git a/paddle/fluid/imperative/hccl_context.h b/paddle/fluid/imperative/hccl_context.h index b7f22f3a0b0f16..e5f58dea9fb061 100644 --- a/paddle/fluid/imperative/hccl_context.h +++ b/paddle/fluid/imperative/hccl_context.h @@ -50,6 +50,8 @@ class HCCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/heter_ccl_context.cc b/paddle/fluid/imperative/heter_ccl_context.cc new file mode 100644 index 00000000000000..a62c1da7815979 --- /dev/null +++ b/paddle/fluid/imperative/heter_ccl_context.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
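+//
+// Summary (based on the implementation below): HeterParallelContext wraps a
+// per-node ParallelContext (NCCL, BKCL or HCCL, selected by compile flags)
+// together with an optional inter-node GLOOParallelContext. Its
+// AllReduceByStream runs in three steps: all-reduce inside the node, copy the
+// per-node result to CPU and all-reduce it across nodes with Gloo, then
+// broadcast the result back to the devices inside the node.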
+ +#include "paddle/fluid/imperative/heter_ccl_context.h" + +// NCCL first +#ifdef PADDLE_WITH_NCCL +#include "paddle/fluid/imperative/all_reduce.h" +#endif + +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy, + const int &device_id) +#ifdef PADDLE_WITH_NCCL + : ParallelContext(strategy, platform::CUDAPlace(device_id)) +#elif PADDLE_WITH_XPU_BKCL + : ParallelContext(strategy, platform::XPUPlace(device_id)) +#elif PADDLE_WITH_ASCEND_CL + : ParallelContext(strategy, platform::NPUPlace(device_id)) +#else + : ParallelContext(strategy, platform::CPUPlace()) +#endif +{ + // construct node_strategy_ from global strategy by selecting the + // endpoints with same ip address. + std::string node_ip = strategy_.current_endpoint_.substr( + 0, strategy_.current_endpoint_.find(':')); + int node_nranks = 0; + int inter_rank = -1; + + std::vector all_eps = strategy_.trainer_endpoints_; + std::vector inter_endpoints; + std::set nodes_ips; + for (auto ep : all_eps) { + std::string ip = ep.substr(0, ep.find(':')); + // record ip of different nodes + if (nodes_ips.find(ip) == nodes_ips.end()) { + if (ep == strategy_.current_endpoint_) { + inter_rank = nodes_ips.size(); + } + inter_endpoints.push_back(ep); + nodes_ips.emplace(ip); + } + + if (ip == node_ip) { + if (ep == strategy_.current_endpoint_) { + node_strategy_.local_rank_ = node_nranks; + } + node_nranks++; + node_strategy_.trainer_endpoints_.push_back(ep); + } + } + + VLOG(0) << "init node size " << node_nranks << " rank " + << node_strategy_.local_rank_; + + PADDLE_ENFORCE_NE(node_nranks, 0, + platform::errors::InvalidArgument( + "The number of local nranks should not be zero.")); + node_strategy_.nranks_ = node_nranks; + node_strategy_.current_endpoint_ = strategy_.current_endpoint_; + + if (inter_rank >= 0 && inter_endpoints.size() > 1) { + inter_strategy_.nranks_ = inter_endpoints.size(); + inter_strategy_.local_rank_ = inter_rank; + inter_strategy_.current_endpoint_ = strategy_.current_endpoint_; + inter_strategy_.trainer_endpoints_ = inter_endpoints; + inter_parallel_ctx_ = std::make_shared( + inter_strategy_, platform::CPUPlace()); + } + + VLOG(0) << "init inter size " << inter_endpoints.size() << " rank " + << inter_rank; + +#ifdef PADDLE_WITH_NCCL + node_place_ = platform::CUDAPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +#ifdef PADDLE_WITH_XPU_BKCL + node_place_ = platform::XPUPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + node_place_ = platform::NPUPlace(device_id); + node_parallel_ctx_ = + std::make_shared(node_strategy_, node_place_); +#endif +} + +void HeterParallelContext::Init() { + PADDLE_ENFORCE_NE( + node_parallel_ctx_, nullptr, + platform::errors::Unavailable( + "The heter parallel context has not been initialized.")); + + if (inter_parallel_ctx_ != nullptr) { + inter_parallel_ctx_->Init(); + } + + node_parallel_ctx_->Init(); + + VLOG(3) << "/// DEBUG /// heter parallel env 
init done..." << std::endl; +} + +void HeterParallelContext::InitWithRingID(int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unimplemented InitWithRingID from heter ctx.")); +} + +void HeterParallelContext::AllReduceByStream(const framework::Variable &src, + framework::Variable *dst, + int ring_id, + bool use_calc_stream) { + // step 1: call reduce within node + VLOG(3) << "/// DEBUG /// step 1: reduce in node... "; + node_parallel_ctx_->AllReduceByStream(src, dst, ring_id, false); + node_parallel_ctx_->WaitComm(ring_id); + + // step 2: call allreduce between nodes with gloo + if (inter_parallel_ctx_ != nullptr) { + // copy src to cpu + // dst is now the src + auto src_tensor = dst->Get(); + framework::Variable src_cpu; + auto src_cpu_tensor = src_cpu.GetMutable(); + framework::TensorCopySync(src_tensor, platform::CPUPlace(), src_cpu_tensor); + + // allreduce src/cpu to dst/cpu + framework::Variable dst_cpu; + inter_parallel_ctx_->AllReduceByStream(src_cpu, &dst_cpu, ring_id, false); + inter_parallel_ctx_->WaitComm(ring_id); + + // copy dst/cpu to dst + auto dst_cpu_tensor = dst_cpu.Get(); + auto dst_tensor = dst->GetMutable(); + framework::TensorCopySync(dst_cpu_tensor, dst_tensor->place(), dst_tensor); + + inter_parallel_ctx_->WaitComm(ring_id); + } + + // step 3: call broadcast within node + VLOG(3) << "/// DEBUG /// step 3: broadcast within node... "; + node_parallel_ctx_->WaitComm(ring_id); + node_parallel_ctx_->Broadcast(dst, ring_id); + node_parallel_ctx_->WaitComm(ring_id); +} + +void HeterParallelContext::Broadcast(framework::Variable *src, int ring_id) { + PADDLE_THROW(platform::errors::Unimplemented("Unimplemented function.")); +} + +paddle::platform::DeviceContext *HeterParallelContext::GetDeviceContext( + int ring_id) { + // directly call the implementation of target parallel ctx. + return node_parallel_ctx_->GetDeviceContext(ring_id); +} + +void HeterParallelContext::WaitCompute(int ring_id) { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->WaitCompute(ring_id); +} + +void HeterParallelContext::WaitComm(int ring_id) { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->WaitComm(ring_id); +} + +void HeterParallelContext::SynchronizeCompute() { + // directly call the implementation of target parallel ctx. + node_parallel_ctx_->SynchronizeCompute(); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/heter_ccl_context.h b/paddle/fluid/imperative/heter_ccl_context.h new file mode 100644 index 00000000000000..8ea5e85603ab5f --- /dev/null +++ b/paddle/fluid/imperative/heter_ccl_context.h @@ -0,0 +1,78 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
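+//
+// HeterParallelContext: ParallelContext for heterogeneous, multi-node runs.
+// It derives two sub-strategies from the global one -- node_strategy_ (the
+// ranks sharing this node's IP) and inter_strategy_ (one rank per node) --
+// and delegates GetDeviceContext / WaitCompute / WaitComm / SynchronizeCompute
+// to the per-node context.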
+#pragma once + +#include +#include +#include + +#ifdef PADDLE_WITH_NCCL +#include "paddle/fluid/imperative/nccl_context.h" +#endif + +#ifdef PADDLE_WITH_XPU_BKCL +#include "paddle/fluid/imperative/bkcl_context.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/imperative/hccl_context.h" +#endif + +#include "paddle/fluid/imperative/gloo_context.h" +#include "paddle/fluid/imperative/parallel_context.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +class HeterParallelContext : public ParallelContext { + public: + explicit HeterParallelContext(const ParallelStrategy& strategy, + const int& device_id); + + ~HeterParallelContext() override = default; + + void Init() override; + + void InitWithRingID(int ring_id) override; + + void AllReduceByStream(const framework::Variable& src, + framework::Variable* dst, int ring_id, + bool use_calc_stream) override; + + void Broadcast(framework::Variable* src, int ring_id) override; + + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; + + void WaitCompute(int ring_id) override; + + void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; + + private: + ParallelStrategy inter_strategy_; + ParallelStrategy node_strategy_; + platform::Place node_place_; + std::shared_ptr node_parallel_ctx_{nullptr}; + std::shared_ptr inter_parallel_ctx_{nullptr}; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 0eb06983f409b1..f822894b42b0b5 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -20,7 +20,15 @@ #include "paddle/fluid/platform/gen_comm_id_helper.h" #endif +#ifdef PADDLE_WITH_NCCL +#include +#include "paddle/fluid/platform/dynload/nccl.h" +#endif + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -127,6 +135,20 @@ void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, AllReduce(src, dst, strategy_, ring_id, use_calc_stream); } +void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::NCCLComm *comm = + platform::NCCLCommContext::Instance().Get(ring_id, place); + gpuStream_t stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto nccl_dtype = platform::ToNCCLDataType(src_tensor->type()); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); +} + paddle::platform::DeviceContext *NCCLParallelContext::GetDeviceContext( int ring_id) { return static_cast( diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 1938fa08312f61..bb5b8ea32df4f4 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -60,6 +60,8 @@ class NCCLParallelContext : public ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) override; + void Broadcast(framework::Variable* src, int ring_id) override; + paddle::platform::DeviceContext* 
GetDeviceContext(int ring_id) override; void WaitCompute(int ring_id) override; diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index f537a316014d60..8bdfccc1442436 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -56,6 +56,8 @@ class ParallelContext { framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; + virtual void Broadcast(framework::Variable* src, int ring_id) = 0; + virtual paddle::platform::DeviceContext* GetDeviceContext(int ring_id) = 0; // comm_stream[ring_id] wait compute_stream. diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 2f023f644fd060..068de4f0435bbe 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -27,8 +27,9 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ + defined(PADDLE_WITH_ASCEND_CL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -41,6 +42,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DivNRanks(tensor, nranks, context); #endif + } else if (platform::is_npu_place(tensor->place())) { + // TODO(kuizhiqing) + VLOG(4) << "divnrank for npu not support yet"; } else if (platform::is_cpu_place(tensor->place())) { VLOG(4) << "before div 2" << *tensor; VLOG(4) << "NDiv for cpu devices : rank = " << nranks; @@ -207,6 +211,70 @@ void SplitTensorsWithType( } #endif +// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now. +// If later the operators::StridedMemcpyWithAxis0 is supported, +// then this specific SplitTensorsForAllReduce can be removed. 
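+// Note also that these NPU specializations only cover FP32 groups; any other
+// dtype hits the PADDLE_THROW(Unimplemented) default branch in
+// ConcatTensorsWithType / SplitTensorsWithType below.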
+#ifdef PADDLE_WITH_ASCEND_CL +template <> +void SplitTensorsForAllReduce( + const platform::NPUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = p_dense_contents->GetMutable(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + operators::math::SplitFunctor + split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +template <> +void ConcatTensorsWithType( + const platform::NPUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +template <> +void SplitTensorsWithType( + const platform::NPUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -831,7 +899,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1014,7 +1082,7 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_GLOO) + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) ProcessUnusedDenseVars(); #endif // Initialize local used vars diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b5a7dd149f09fe..3c03babc52cbe1 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -48,8 +48,9 @@ class VariableWrapper; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ + defined(PADDLE_WITH_ASCEND_CL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index adb560df77c78f..01a24872fbd7c2 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -3,6 +3,8 @@ if(WIN32) else() if (WITH_NCCL OR WITH_RCCL) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) + 
cc_test(heter_ccl_context_test SRCS heter_ccl_context_test.cc DEPS heter_ccl_context nccl_context imperative_gloo_context gloo_context gloo_wrapper gloo fs shell) + #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST") endif() if (WITH_XPU_BKCL) cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context) diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc new file mode 100644 index 00000000000000..c40a5fc52ceb86 --- /dev/null +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include // NOLINT + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/heter_ccl_context.h" + +#include "gtest/gtest.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:37580", "127.0.0.1:37581"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = eps.size(); + strategy.local_rank_ = local_rank; + return strategy; +} + +#ifdef PADDLE_WITH_NCCL +void AllReduceByStream(int local_rank, int device_id) { + int data_size = 32; + const auto& place = platform::CUDAPlace(device_id); + platform::CUDADeviceContext ctx(place); + + // heter_parallel_ctx + imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); + + // init + hpc.Init(); + + // input and output data + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + std::vector src_vec; + for (int i = 0; i < data_size; i++) { + src_vec.push_back(1.0 + local_rank); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + ctx.Wait(); + + framework::Variable* dst_dev_var(new framework::Variable()); + auto* dst_dev_tensor = dst_dev_var->GetMutable(); + dst_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + // call allreduce + hpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + EXPECT_EQ(dst_vec.size(), src_vec.size()); + for (int i = 0; i < data_size; i++) { + EXPECT_EQ(dst_vec[i], 3.0); + } +} + +TEST(AllReduceByStream, Run) { + if (platform::GetCUDADeviceCount() >= 2) { + std::thread t0(AllReduceByStream, 0, 0); + std::thread t1(AllReduceByStream, 1, 1); + t0.join(); + t1.join(); + } +} +#endif diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc 
b/paddle/fluid/imperative/tests/nccl_context_test.cc index 2d8a08217b0b83..b56444104f2779 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -14,6 +14,8 @@ #include // NOLINT +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -21,6 +23,7 @@ namespace imperative = paddle::imperative; namespace platform = paddle::platform; +namespace framework = paddle::framework; int nrings = 2; imperative::ParallelStrategy GetStrategy(int local_rank) { @@ -68,4 +71,51 @@ TEST(BcastNCCLId, Run) { NCCL_UNIQUE_ID_BYTES)); } } + +void Broadcast(int local_rank, int device_id) { + int data_size = 4; + float test_data = 7; + const auto& place = platform::CUDAPlace(device_id); + platform::CUDADeviceContext ctx(place); + + imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); + + // init + npc.Init(); + + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(framework::make_ddim({data_size}), place); + + // fill data for rank 0 only + std::vector src_vec; + if (local_rank == 0) { + for (int i = 0; i < data_size; i++) { + src_vec.push_back(test_data); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + } + ctx.Wait(); + + npc.Broadcast(src_dev_var, 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + for (int i = 0; i < data_size; i++) { + EXPECT_EQ(dst_vec[i], test_data); + } +} + +TEST(Broadcast, Run) { + if (platform::GetCUDADeviceCount() >= 2) { + std::thread t0(Broadcast, 0, 0); + std::thread t1(Broadcast, 1, 1); + t0.join(); + t1.join(); + } +} #endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 41708ef8611e42..521ca032a50ddb 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -25,6 +25,13 @@ endif() if (WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) +endif() + +if (WITH_ASCEND_CL) + set(PYBIND_DEPS ${PYBIND_DEPS} reducer) + set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() if(NOT WIN32) @@ -32,9 +39,7 @@ if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) - endif() - if (WITH_ASCEND_CL) - set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context) + set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() endif(NOT WIN32) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 29a1f0eafcb219..2c850f0ca84d5f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -37,6 +37,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/gloo_context.h" #include "paddle/fluid/imperative/hccl_context.h" +#include "paddle/fluid/imperative/heter_ccl_context.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/nccl_context.h" @@ -2332,6 +2333,15 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) + py::class_>( + m, "HeterParallelContext") + .def(py::init()) + .def("init", [](imperative::HeterParallelContext &self) { self.Init(); }); +#endif + m.def("pylayer_apply", [](const platform::CPUPlace &place, const py::object &cls, const py::args args, const py::kwargs kwargs) { diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index cc0a5de233c382..e58b6c312fa1fe 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -1758,6 +1758,37 @@ def auto_search(self, flag): else: print("WARNING: auto-search should have value of bool type") + @property + def heter_ccl_mode(self): + """ + Indicating whether we are using heter_ccl_mode for model training. + This feature is currently an experimental feature. Currently, + heter_ccl_mode can be used only for dataparallel with dygraph mode. + Default Value: False + + Examples: + + .. code-block:: python + + import paddle + import paddle.distributed.fleet as fleet + + strategy = fleet.DistributedStrategy() + strategy.heter_ccl_mode = True + + # for initialize parallel env, only need to call + paddle.distributed.init_parallel_env() + # then the heterogenous context will be created. + """ + return self.strategy.heter_ccl_mode + + @heter_ccl_mode.setter + def heter_ccl_mode(self, flag): + if isinstance(flag, bool): + self.strategy.heter_ccl_mode = flag + else: + print("WARNING: heter_ccl_mode should have value of bool type") + @property def cudnn_exhaustive_search(self): """ diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index a1e5ef2ba799fc..0d54a0ea5d3b16 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -33,7 +33,7 @@ from .topology import ParallelMode from ..meta_parallel import TensorParallel, model_parallel_random_seed from ..meta_parallel import PipelineParallel, ShardingParallel -from ..meta_optimizers import HybridParallelOptimizer +from ..meta_optimizers import HybridParallelOptimizer, HeterParallelOptimizer from paddle import _C_ops from paddle.fluid import core from paddle.fluid.dygraph import to_variable @@ -277,13 +277,15 @@ def init(self, role_maker=None, is_collective=False, strategy=None): self._user_defined_strategy.nccl_comm_num) paddle.distributed.init_parallel_env() - # init hybrid parallel environment in dygraph - if tp._HYBRID_PARALLEL_GROUP is None: - self._init_hybrid_parallel_env() - else: - warnings.warn( - "The dygraph hybrid parallel environment has been initialized." 
- ) + # hybrid parallel not support for npu/xpu + if self._user_defined_strategy.heter_ccl_mode == False: + # init hybrid parallel environment in dygraph + if tp._HYBRID_PARALLEL_GROUP is None: + self._init_hybrid_parallel_env() + else: + warnings.warn( + "The dygraph hybrid parallel environment has been initialized." + ) elif self._is_collective: use_sharding = self._user_defined_strategy.sharding @@ -872,8 +874,12 @@ def distributed_optimizer(self, optimizer, strategy=None): if paddle.fluid.framework.in_dygraph_mode(): if self.worker_num() > 1: - return HybridParallelOptimizer(optimizer, self._hcg, - self._user_defined_strategy) + if self._user_defined_strategy.heter_ccl_mode == False: + return HybridParallelOptimizer(optimizer, self._hcg, + self._user_defined_strategy) + else: + return HeterParallelOptimizer(optimizer, + self._user_defined_strategy) else: return optimizer return self @@ -938,6 +944,17 @@ def forward(self, x): if self.worker_num() <= 1: return model + if self._user_defined_strategy.heter_ccl_mode == True: + distributed_model = paddle.DataParallel( + model, + comm_buffer_size=self._user_defined_strategy. + fuse_grad_size_in_MB, + last_comm_buffer_size=self._user_defined_strategy. + last_comm_group_size_MB, + find_unused_parameters=self._user_defined_strategy. + find_unused_parameters) + return distributed_model + if self._hcg.get_parallel_mode() == ParallelMode.SHARDING_PARALLEL: distributed_model = ShardingParallel( model, self._hcg, strategy=self._user_defined_strategy) @@ -1569,13 +1586,13 @@ def unscale_method(self, optimizer): ] param_grads_fp16 = [ param._grad_ivar() for param in optimizer._parameter_list - if (param._grad_ivar() is not None) and (param._grad_ivar( - ).dtype == core.VarDesc.VarType.FP16) + if (param._grad_ivar() is not None) and + (param._grad_ivar().dtype == core.VarDesc.VarType.FP16) ] param_grads_fp32 = [ param._grad_ivar() for param in optimizer._parameter_list - if (param._grad_ivar() is not None) and (param._grad_ivar( - ).dtype == core.VarDesc.VarType.FP32) + if (param._grad_ivar() is not None) and + (param._grad_ivar().dtype == core.VarDesc.VarType.FP32) ] temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) diff --git a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py index 52eeebd0c126c2..322989099c856d 100755 --- a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py +++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py @@ -19,9 +19,10 @@ meta_optimizer_names = list( filter(lambda name: name.endswith("Optimizer"), dir())) -# Because HybridParallelOptimizer is dygraph optimizer, it +# Because HybridParallelOptimizer is dygraph optimizer, it # should be removed meta_optimizer_names.remove("HybridParallelOptimizer") +meta_optimizer_names.remove("HeterParallelOptimizer") class MetaOptimizerFactory(object): diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 0aae3331793ca7..708ba2816077e1 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -108,9 +108,9 @@ def _parse_args(): base_group.add_argument( "--backend", type=str, - default="auto", - help="Specifize the backend, can be gloo|nccl|bkcl|auto. Default value is auto which perfers nccl or bkcl." 
- ) + default=os.environ.get('PADDLE_DISTRI_BACKEND', 'auto'), + help="Specifize the backend, can be gloo|nccl|bkcl|auto|hccl|heter. " + "Default value is auto which perfers nccl or bkcl.") base_group.add_argument( "--nproc_per_node", type=int, @@ -146,6 +146,16 @@ def _parse_args(): ) base_group.add_argument("--selected_xpus", dest="xpus") + if fluid.core.is_compiled_with_npu(): + base_group.add_argument( + "--npus", + type=str, + default=None, + help="It's for xpu training. For example: " + "--npus=\"0,1,2,3\" will launch four training processes each bound to one npu." + ) + base_group.add_argument("--selected_npus", dest="npus") + base_group.add_argument( "training_script", type=str, @@ -301,25 +311,23 @@ def get_cluster_info(args): # lazy launch for auto-parallel if args.enable_auto_mapping == True: cluster, pod = get_mapped_cluster_from_args(args, device_mode) - else: + elif cloud_utils.use_paddlecloud() and trainers_num != 1: + cluster, pod = cloud_utils.get_cloud_cluster( + args.ips, device_mode, devices_per_proc, start_port) + logger.debug("get cluster from cloud:{}".format(cluster)) + elif device_mode == DeviceMode.ASCEND_NPU: # for ascend - if device_mode == DeviceMode.ASCEND_NPU: - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=os.getenv("RANK_TABLE_FILE", None), - device_mode=device_mode, - start_port=start_port) - elif cloud_utils.use_paddlecloud() and trainers_num != 1: - cluster, pod = cloud_utils.get_cloud_cluster( - args.ips, device_mode, devices_per_proc, start_port) - logger.debug("get cluster from cloud:{}".format(cluster)) - else: - # trainers_num = 1 or not use paddlecloud ips="a,b" - cluster, pod = get_cluster_from_args(args, device_mode, - devices_per_proc) - logger.debug("get cluster from args:{}".format(cluster)) + cluster, pod = ascend_utils.get_cloud_cluster( + rank_table_file=os.getenv("RANK_TABLE_FILE", None), + device_mode=device_mode, + start_port=start_port) + else: + # trainers_num = 1 or not use paddlecloud ips="a,b" + cluster, pod = get_cluster_from_args(args, device_mode, + devices_per_proc) + logger.debug("get cluster from args:{}".format(cluster)) return cluster, pod - def get_global_envs(args, tmp_dir): global_envs = copy.copy(os.environ.copy()) # add gloo env @@ -456,15 +464,15 @@ def which_distributed_mode(args): ) and not fluid.core.is_compiled_with_xpu(): if args.servers: logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu. \ -But found args.servers not empty, default use ps mode") + "Not found distinct arguments and not compiled with cuda or xpu or npu. " + "But found args.servers not empty, default use ps mode") return DistributeMode.PS else: return DistributeMode.COLLECTIVE else: logger.warning( - "Not found distinct arguments and compiled with cuda or xpu. Default use collective mode" - ) + "Not found distinct arguments and compiled with cuda or xpu or npu. 
" + "Default use collective mode") return DistributeMode.COLLECTIVE @@ -651,7 +659,7 @@ def launch(): check_backend(args.backend) distribute_mode = DistributeMode.COLLECTIVE - assert args.backend in ['gloo', 'nccl', 'bkcl', 'unknown'] + #assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown'] if args.backend == 'gloo': logger.warning("launch start with CPUONLY mode") diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index d87bdb47932ef1..569f64c18bf52f 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -690,9 +690,51 @@ def get_xpus(xpus): return res_xpus +def get_npus(npus): + if npus is None: + npus_num = fluid.core.get_npu_device_count() + res_npus = [str(x) for x in range(0, npus_num)] + else: + npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + if npu_visible_devices is None or npu_visible_devices == "": + res_npus = [x.strip() for x in npus.split(',')] + else: + # change npus into relative values + # e.g. ASCEND_VISIBLE_DEVICES=4,5,6,7; args.npus=4,5,6,7; + # therefore npus=0,1,2,3 + npu_visible_devices_list = npu_visible_devices.split(',') + for x in npus.split(','): + assert x in npu_visible_devices_list, "Can't find "\ + "your npus %s in ASCEND_VISIBLE_DEVICES[%s]."\ + % (x, npu_visible_devices) + res_npus = [ + npu_visible_devices_list.index(x.strip()) + for x in npus.split(',') + ] + logger.info("Change selected_npus into reletive values. --ips:{} " + "will change into relative_ips:{} according to your " + "ASCEND_VISIBLE_DEVICES:{}".format( + npus, res_npus, npu_visible_devices_list)) + + return res_npus + + def get_device_mode(backend): - if fluid.core.is_compiled_with_npu() and \ + if backend == 'heter': + if fluid.core.is_compiled_with_cuda() and \ + fluid.core.get_cuda_device_count() > 0: + print("launch train in heter mode with GPU device.") + return DeviceMode.GPU + if fluid.core.is_compiled_with_xpu() and \ + fluid.core.get_xpu_device_count() > 0: + print("launch train in heter mode with XPU device.") + return DeviceMode.XPU + if fluid.core.is_compiled_with_npu() and \ fluid.core.get_npu_device_count() > 0: + print("launch train in heter mode with NPU device.") + return DeviceMode.ASCEND_NPU + + if backend == 'hccl' and fluid.core.get_npu_device_count() > 0: print("launch train in ascend npu mode!") return DeviceMode.ASCEND_NPU @@ -731,7 +773,17 @@ def get_device_proc_info(args): else: devices_per_proc = gpus elif device_mode == DeviceMode.ASCEND_NPU: - devices_per_proc = None + npus = get_npus(args.npus) + if args.nproc_per_node is not None: + assert (len(npus) % int(args.nproc_per_node)) ==0, \ + "npus' number:{} mod args.nproc_per_node:{} must == 0".format(len(npus), args.nproc_per_node) + + n = int(len(npus) / int(args.nproc_per_node)) + devices_per_proc = [ + npus[i:i + n] for i in six.moves.range(0, len(npus), n) + ] + else: + devices_per_proc = npus elif device_mode == DeviceMode.XPU: xpus = get_xpus(args.xpus) if args.nproc_per_node is not None: @@ -902,11 +954,8 @@ def get_mapped_cluster_from_args(args, device_mode): node_rank = node_ips.index(ip) if os.environ.get('FLAGS_START_PORT') is not None: start_port = int(os.environ.get('FLAGS_START_PORT')) - free_ports = [ - x - for x in range(start_port, start_port + len(node_ranks_mapping[ - node_rank])) - ] + end_port = start_port + len(node_ranks_mapping[node_rank]) + free_ports = [x for x in range(start_port, end_port)] else: free_ports = 
find_free_ports(len(node_ranks_mapping[node_rank])) trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) @@ -1527,11 +1576,11 @@ def start_pod_heter_worker(self, args, pod): def check_backend(backend): - if backend not in ['nccl', 'gloo', 'bkcl', 'auto']: - raise ValueError( - "paddle.distributed initialize error, " - "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s" - % backend) + if backend not in ['nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter']: + raise ValueError("paddle.distributed initialize error, " + "backend argument can only be one of " + "'nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter' " + "but got %s" % backend) if backend == 'nccl' and not fluid.core.is_compiled_with_cuda(): raise ValueError( @@ -1545,6 +1594,12 @@ def check_backend(backend): "your paddle is not compiled with xpu but you assign 'bkcl' as backend." ) + if backend == 'hccl' and not fluid.core.is_compiled_with_npu(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with npu but you assign 'hccl' as backend." + ) + def block_windows_and_macos(backend): if backend != 'gloo': return @@ -1565,4 +1620,7 @@ def get_backend_by_compile_flag(): if fluid.core.is_compiled_with_xpu(): return 'bkcl' + if fluid.core.is_compiled_with_npu(): + return 'hccl' + return 'gloo' diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 739de0de57725f..13496ad8ee5d96 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -28,6 +28,7 @@ from .fp16_allreduce_optimizer import FP16AllReduceOptimizer from .sharding_optimizer import ShardingOptimizer from .dygraph_optimizer import HybridParallelOptimizer +from .dygraph_optimizer import HeterParallelOptimizer from .dygraph_optimizer import HybridParallelGradScaler from .tensor_parallel_optimizer import TensorParallelOptimizer from .raw_program_optimizer import RawProgramOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py index 28260d7aa18635..3beb8635ba41a7 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py @@ -13,5 +13,6 @@ from .hybrid_parallel_optimizer import HybridParallelOptimizer from .hybrid_parallel_gradscaler import HybridParallelGradScaler from .dygraph_sharding_optimizer import DygraphShardingOptimizer +from .heter_parallel_optimizer import HeterParallelOptimizer __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py new file mode 100755 index 00000000000000..9218024be17203 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.fluid.dygraph import base as imperative_base +from paddle.fluid import framework + +__all__ = [] + + +def _obtain_optimizer_parameters_list(optimizer): + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + parameters_list = [] + for group in optimizer._param_groups: + for param in group['params']: + parameters_list.append(param) + else: + parameters_list = [param for param in optimizer._parameter_list] + + return parameters_list + + +class HeterParallelOptimizer: + # adapter wrapper for optimizer + def __init__(self, optimizer, strategy): + self._inner_opt = optimizer + self._strategy = strategy + + # NOTE(liubo48): In pure DataParallel mode, + # the gradient synchronization is achieved through reducer. + + @imperative_base.no_grad + @framework.dygraph_only + def step(self): + parameters_list = _obtain_optimizer_parameters_list(self._inner_opt) + self._inner_opt.step() + + @imperative_base.no_grad + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + + # minimize does not support parameters in the form of param_group, + # so no need use _obtain_optimizer_parameters_list + parameter_list = parameters if parameters \ + else self._inner_opt._parameter_list + + return self._inner_opt.minimize(loss, startup_program, parameter_list, + no_grad_set) + + def __getattr__(self, item): + return getattr(self._inner_opt, item) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 7ea479f0fbb14d..177e19194a5227 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -58,7 +58,7 @@ def _start_kv_server(port, http_server_d, size): def _is_cpuonly(backend): check_backend(backend) - if backend in ['auto', 'nccl', 'bkcl', 'hccl'] and ( + if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter'] and ( core.is_compiled_with_cuda() or core.is_compiled_with_xpu() or core.is_compiled_with_npu()): @@ -68,6 +68,14 @@ def _is_cpuonly(backend): return True +def _check_var_exists(var_name): + var = os.environ.get(var_name, None) + if var is None: + raise ValueError("paddle.distributed initialize error, " + "environment variable %s is needed, but not set." % + var_name) + + def init_parallel_env(): """ Initialize parallel training environment in dynamic graph mode. @@ -148,27 +156,22 @@ def train(): raise NotImplementedError( "If you want to use CPU-only version, please use 'gloo' as backend") - # 2. check env - def _check_var_exists(var_name): - var = os.environ.get(var_name, None) - if var is None: - raise ValueError("paddle.distributed initialize error, " - "environment variable %s is needed, but not set." 
% - var_name) - if not is_cpu_only and core.is_compiled_with_cuda(): _check_var_exists("FLAGS_selected_gpus") elif not is_cpu_only and core.is_compiled_with_xpu(): _check_var_exists('FLAGS_selected_xpus') + elif not is_cpu_only and core.is_compiled_with_npu(): + _check_var_exists('FLAGS_selected_npus') _check_var_exists("PADDLE_TRAINER_ID") _check_var_exists("PADDLE_CURRENT_ENDPOINT") _check_var_exists("PADDLE_TRAINERS_NUM") _check_var_exists("PADDLE_TRAINER_ENDPOINTS") + node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints]) # 3: init gloo context (step 1: httpsever start) init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0")) - if is_cpu_only or init_gloo: + if is_cpu_only or init_gloo or backend == "heter": ep_rank_0 = parallel_env.trainer_endpoints[0].split(":") manager = Manager() # glboal dict to store status @@ -177,6 +180,8 @@ def _check_var_exists(var_name): if parallel_env.rank == 0: # The scope for worker used by http server is '_worker' size = {'_worker': parallel_env.world_size} + if backend == "heter": + size = {'_worker': len(node_num)} http_server = Process( target=_start_kv_server, args=(int(ep_rank_0[1]), http_server_d, size)) @@ -210,10 +215,13 @@ def _check_var_exists(var_name): place = core.NPUPlace(parallel_env.device_id) _set_expected_place(place) - # init nccl or bkcl context + # init nccl or hccl or bkcl or heter context if is_cpu_only: parallel_helper._set_parallel_ctx( core.GLOOParallelContext(strategy, place)) + elif (backend == "heter"): + parallel_helper._set_parallel_ctx( + core.HeterParallelContext(strategy, parallel_env.device_id)) elif core.is_compiled_with_cuda(): parallel_helper._set_parallel_ctx( core.NCCLParallelContext(strategy, place)) @@ -224,17 +232,19 @@ def _check_var_exists(var_name): parallel_helper._set_parallel_ctx( core.HCCLParallelContext(strategy, place)) - other_endpoints = strategy.trainer_endpoints[:] - other_endpoints.remove(strategy.current_endpoint) - if not is_cpu_only and strategy.local_rank == 0: - wait_server_ready(other_endpoints) + if backend != "heter": + other_endpoints = strategy.trainer_endpoints[:] + other_endpoints.remove(strategy.current_endpoint) + if not is_cpu_only and strategy.local_rank == 0: + wait_server_ready(other_endpoints) parallel_helper._init_parallel_ctx() + # 5: init gloo context (step 2: gloo init) # dividing init_gloo into two part beacause nccl and gloo # are separately looking for free ports which sometimes # leads to port-conflict. - if is_cpu_only and parallel_env.rank == 0: + if (is_cpu_only or backend == "heter") and parallel_env.rank == 0: # compare to init_gloo, we don't need to # init gloo, because we do this in _init_parallel_ctx; http_server_d["running"] = False diff --git a/python/paddle/fluid/dygraph/parallel_helper.py b/python/paddle/fluid/dygraph/parallel_helper.py index 40d5d18c9a40fa..5fe4d4162e6e32 100644 --- a/python/paddle/fluid/dygraph/parallel_helper.py +++ b/python/paddle/fluid/dygraph/parallel_helper.py @@ -28,11 +28,11 @@ def _is_parallel_ctx_initialized(): return __parallel_ctx__clz__ is not None -def _set_parallel_ctx(nccl_parallel_context): +def _set_parallel_ctx(ccl_parallel_context): global __parallel_ctx__clz__ assert __parallel_ctx__clz__ is None, \ "ParallelContext can only be initialized once." 
- __parallel_ctx__clz__ = nccl_parallel_context + __parallel_ctx__clz__ = ccl_parallel_context def _init_parallel_ctx(): From ead812305326f4b5ce003646b013aab66cfe7a32 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 5 Dec 2021 20:44:40 -0600 Subject: [PATCH 069/124] [PTen] Fix reshape move storage using error (#37765) * fix reshape move storage error * remove needless set type * alloc tensor by shared storage --- paddle/fluid/operators/reshape_op.cc | 11 ++++++----- paddle/pten/api/lib/utils/tensor_utils.cc | 24 +++++++++++++++++++++++ paddle/pten/api/lib/utils/tensor_utils.h | 5 +++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 5148e3b0940c92..c12db1293856bc 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -383,13 +383,13 @@ class ReshapeKernel { // 3. out tensor is view of input // We can't MakePtenDenseTensor for case 2, so we solve this case by // creating a temporary tensor here: - const auto alloc = std::make_shared( - ctx.GetPlace()); pten::DenseTensorMeta meta{pten::TransToPtenDataType(in->type()), in->dims(), pten::TransToPtenDataLayout(in->layout())}; - auto pt_out_tmp = - std::make_shared(alloc, std::move(meta)); + auto pt_out_tmp = std::make_shared( + pten::make_intrusive( + ctx.GetPlace()), + std::move(meta)); pten::DenseTensor *pt_out = nullptr; if (in == out) { pt_out = pt_x.get(); @@ -484,7 +484,8 @@ class ReshapeKernel { // non-inplace need move all result from pt_out to out, inplace need set // result dims. if (in != out) { - paddle::experimental::MovesStorage(pt_out, static_cast(out)); + paddle::experimental::MovesSharedStorage(pt_out, + static_cast(out)); } else { out->Resize(pt_out->dims()); } diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 0983abfa92137b..f2b6e4841aab24 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/utils/tensor_utils.h" +#include #include #include "paddle/pten/core/compat_utils.h" @@ -342,6 +343,29 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { MovesStorage(src, static_cast(dst)); } +void MovesSharedStorage(pten::DenseTensor* src, + paddle::framework::Tensor* dst) { + PADDLE_ENFORCE_NOT_NULL( + src, + platform::errors::InvalidArgument( + "The source DenseTensor is nullptr when move allocation.")); + PADDLE_ENFORCE_NOT_NULL( + dst, + platform::errors::InvalidArgument( + "The destination Tensor is nullptr when move allocation.")); + dst->Resize(src->dims()); + auto* storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); + dst->ResetHolderWithType(storage->GetAllocation(), + pten::TransToProtoVarType(src->dtype())); +} + +void MovesSharedStorage(pten::DenseTensor* src, + paddle::framework::LoDTensor* dst) { + MovesSharedStorage(src, static_cast(dst)); + SetLoD(dst->mutable_lod(), src->lod()); +} + void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst) { diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h index 04f0f6c1ff0c8e..6397ca369ce755 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.h +++ b/paddle/pten/api/lib/utils/tensor_utils.h @@ -58,6 +58,11 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); +void MovesSharedStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); + +void MovesSharedStorage(pten::DenseTensor* src, + paddle::framework::LoDTensor* dst); + /** * In order to improve the compatibility state performance, some tricky tool * functions are added. From c042d8f2540d889d988b9cc3099aef3c249b8854 Mon Sep 17 00:00:00 2001 From: wenbin Date: Mon, 6 Dec 2021 11:07:08 +0800 Subject: [PATCH 070/124] fix ernie (#37839) * fix * Update stack_op_plugin.cu comments --- .../fluid/inference/tensorrt/plugin/stack_op_plugin.cu | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index c3b4a6ff4af1cb..74a6c3cdf3e4e7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -107,8 +107,13 @@ bool StackPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc& in = in_out[pos]; if (pos == 0) { if (with_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && + return ( +// It's workaround for ernie fix len model. +// Enabling float, half on the same time will cause trt hang. 
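// In other words, with the version guard below, TensorRT 8.0+ builds accept only
// kHALF inputs on this fp16 path, while older TensorRT versions also accept kFLOAT.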
+#if IS_TRT_VERSION_LT(8000) + in.type == nvinfer1::DataType::kFLOAT || +#endif + in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); } else { return (in.type == nvinfer1::DataType::kFLOAT) && From dc0ec6670b3ec186770efbe1709994506f4078c2 Mon Sep 17 00:00:00 2001 From: wenbin Date: Mon, 6 Dec 2021 11:08:42 +0800 Subject: [PATCH 071/124] UT for TestAdaptivePool2dConvertGlobalPass (#37764) * adaptive_pool2d * timeout * merge develop * rename ut * new test * remove old file --- .../unittests/ir/inference/CMakeLists.txt | 1 + ...ive_pool2d_convert_global_pass_autoscan.py | 118 ++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 2b45a8c5db33d0..67d300fe186a8b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -79,6 +79,7 @@ if (WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) set_tests_properties(test_emb_eltwise_layernorm_fuse_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_fc_fuse_pass PROPERTIES TIMEOUT 240) set_tests_properties(test_simplify_with_basic_ops_pass_autoscan PROPERTIES TIMEOUT 60) + set_tests_properties(test_adaptive_pool2d_convert_global_pass_autoscan PROPERTIES TIMEOUT 60) endif() if (WITH_MKLDNN) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py new file mode 100644 index 00000000000000..8cb6af1dcf0441 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
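For context, the pass exercised by the new test below (adaptive_pool2d_convert_global_pass) converts pool2d ops configured as adaptive pooling with a 1x1 output into global pooling. A minimal dygraph sketch of the equivalence the pass relies on, shown for the average-pooling case (illustrative only, not part of the test file):

import paddle
import paddle.nn.functional as F

x = paddle.rand([1, 2, 8, 8])                          # NCHW input, the layout the test uses
y_adaptive = F.adaptive_avg_pool2d(x, output_size=1)   # adaptive pooling to a 1x1 output
y_global = paddle.mean(x, axis=[2, 3], keepdim=True)   # global average pooling
print(paddle.allclose(y_adaptive, y_global))           # expected: True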
+ +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestAdaptivePool2dConvertGlobalPass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=4, max_size=4)) + pooling_type = draw(st.sampled_from(["max", "avg"])) + + data_format = "NCHW" #trt support this format only + strides = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=2)) + + paddings = [0, 0] # only 0 0 is right + ceil_mode = draw(st.booleans()) + exclusive = draw(st.booleans()) + global_pooling = False #only false is right + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VAILD"])) + + pool_op = OpConfig( + "pool2d", + inputs={"X": ["input_data"]}, + outputs={"Out": ["pool_output"]}, + ksize=[1, 1], + adaptive=True, + pooling_type=pooling_type, + data_format=data_format, + strides=strides, + paddings=paddings, + ceil_mode=ceil_mode, + global_pooling=global_pooling, + padding_algorithm=padding_algorithm, + exclusive=exclusive) + ops = [pool_op] + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={"input_data": TensorConfig(shape=x_shape), }, + outputs=["pool_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=4, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + yield config, ['pool2d'], (1e-5, 1e-5) + + def add_ignore_pass_case(self): + # Here we put some skip rules to avoid known bugs + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs["pooling_type"] == "max": + x_shape = list(program_config.inputs["input_data"].shape) + if x_shape[-1] != 1 or x_shape[-2] != 1: + return True + return False + + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs["padding_algorithm"] == "SAME": + return True + return False + + self.add_ignore_check_case( + teller1, + IgnoreReasons.PASS_ACCURACY_ERROR, + "max pooling has diff if H or W is not equals to 1", ) + self.add_ignore_check_case( + teller2, + IgnoreReasons.PASS_ACCURACY_ERROR, + "output has wrong result if padding_algorithm equals to SAME", ) + + def test(self): + self.run_and_statis( + quant=False, + max_examples=100, + passes=["adaptive_pool2d_convert_global_pass"], + min_success_num=40) + + +if __name__ == "__main__": + unittest.main() From 364376e5fd2a629a4e9194a433af3cbcd7e6045d Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 6 Dec 2021 11:13:50 +0800 Subject: [PATCH 072/124] [new-exec] enable sequential run for debug (#37835) * enable sequential_run for standalone_executor * add ut * fix ut --- .../new_executor/interpretercore_util.cc | 17 +++++++++++++++++ .../new_executor/interpretercore_util.h | 4 +--- .../interpreter/test_standalone_executor.py | 10 ++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc 
b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 0501522a7a810c..3817a11b9afe4e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -20,9 +20,26 @@ #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +PADDLE_DEFINE_EXPORTED_bool( + new_executor_sequential_run, false, + "Enable sequential execution for standalone executor, used for debug"); namespace paddle { namespace framework { namespace interpreter { + +void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, + std::function fn) { + // NOTE(zhiqiu): use thhe second queue of size of, so only one thread is used. + if (FLAGS_new_executor_sequential_run) { + VLOG(4) << "FLAGS_new_executor_sequential_run:" + << FLAGS_new_executor_sequential_run; + queue_group_->AddTask(static_cast(OpFuncType::kQueueAsync), + std::move(fn)); + } else { + queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); + } +} + using VariableIdMap = std::map>; AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps( diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index c92cea6c97c863..8f27c7e1811fb8 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -77,9 +77,7 @@ class AsyncWorkQueue { // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } - void AddTask(const OpFuncType& op_func_type, std::function fn) { - queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); - } + void AddTask(const OpFuncType& op_func_type, std::function fn); void Cancel() { queue_group_->Cancel(); } diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 23dee7338ae18b..01b2cccfc48b25 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -130,6 +130,10 @@ def test_result(self): for gt, out in zip(ground_truths, res): self.assertEqual(gt[0], out[0]) + res_sequential = self.run_new_executor_sequential() + for gt, out in zip(ground_truths, res_sequential): + self.assertEqual(gt[0], out[0]) + def run_raw_executor(self): paddle.seed(2020) main_program, startup_program, fetch_list = build_program() @@ -158,6 +162,12 @@ def run_new_executor(self): np.array(inter_core.run({}, fetch_list)._move_to_list()[0])) return outs + def run_new_executor_sequential(self): + os.environ['FLAGS_new_executor_sequential_run'] = '1' + res = self.run_new_executor() + del os.environ['FLAGS_new_executor_sequential_run'] + return res + class SwitchExecutorInterfaceTestCase(MultiStreamModelTestCase): def run_new_executor(self): From 5b1ad1400353943d5cb2605ce8e1f8041c3f2ff3 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Mon, 6 Dec 2021 13:07:11 +0800 Subject: [PATCH 073/124] edit pten_util vlog print level;test=develop (#37828) --- paddle/fluid/framework/pten_utils.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index b423d0e05e1744..51a2d641bb00a5 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -136,17 
+136,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { auto& in = op_proto_->inputs()[i]; auto& in_name = in.name(); if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { - VLOG(1) << "Parse PtenKernel input: skip extra & quant input - " + VLOG(3) << "Parse PtenKernel input: skip extra & quant input - " << in_name; continue; } // If contains dispensable input, we should override the // GetExpectedPtenKernelArgs method self if (in.has_dispensable() && in.dispensable()) { - VLOG(1) << "Parse PtenKernel input: skip dispensable input - " << in_name; + VLOG(3) << "Parse PtenKernel input: skip dispensable input - " << in_name; continue; } - VLOG(1) << "Parse PtenKernel input: " << in_name; + VLOG(3) << "Parse PtenKernel input: " << in_name; input_names_.emplace_back(in_name); } return input_names_; @@ -158,7 +158,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); // TODO(chenweihang): outputs also need skip some cases - VLOG(1) << "Parse PtenKernel output: " << out_name; + VLOG(3) << "Parse PtenKernel output: " << out_name; output_names_.emplace_back(out_name); } return output_names_; @@ -172,17 +172,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { if (attr_name == "use_mkldnn" || attr_name == "op_role" || attr_name == "op_role_var" || attr_name == "op_namescope" || attr_name == "op_callstack" || attr_name == "op_device") { - VLOG(1) << "Parse PtenKernel attribute: skip needless attr - " + VLOG(3) << "Parse PtenKernel attribute: skip needless attr - " << attr_name; continue; } if ((attr.has_extra() && attr.extra()) || (attr.has_quant() && attr.quant())) { - VLOG(1) << "Parse PtenKernel attribute: skip extra & quant attr - " + VLOG(3) << "Parse PtenKernel attribute: skip extra & quant attr - " << attr_name; continue; } - VLOG(1) << "Parse PtenKernel attribute: " << attr_name; + VLOG(3) << "Parse PtenKernel attribute: " << attr_name; attr_names_.emplace_back(attr_name); } From 6a1e4de23881e762d35287358906c679f4b34852 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 6 Dec 2021 14:01:14 +0800 Subject: [PATCH 074/124] Fix default behavior if block=None in static mode (#37827) --- python/paddle/fluid/initializer.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 930995cee6d08b..a7631848cd38cb 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -53,11 +53,7 @@ def __call__(self, param, block=None): def _check_block(self, block): if block is None: - if in_dygraph_mode(): - block = default_main_program().global_block() - else: - raise ValueError( - "The parameter 'block' is needed in static graph mode.") + block = default_main_program().global_block() return block From 1432e3d2571156be61ce0255d196275a1f72cd79 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 6 Dec 2021 14:36:22 +0800 Subject: [PATCH 075/124] Fix bug (#37868) --- paddle/fluid/imperative/nccl_context.cc | 4 ++-- paddle/fluid/imperative/tests/CMakeLists.txt | 2 +- paddle/fluid/imperative/tests/heter_ccl_context_test.cc | 2 +- paddle/fluid/imperative/tests/nccl_context_test.cc | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index f822894b42b0b5..15146f6c1204e6 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -27,8 +27,8 
@@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -145,7 +145,7 @@ void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { void *src_ptr = src_tensor->data(); auto nccl_dtype = platform::ToNCCLDataType(src_tensor->type()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); } diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 01a24872fbd7c2..32e982f1f15caa 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -1,7 +1,7 @@ if(WIN32) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context) else() - if (WITH_NCCL OR WITH_RCCL) + if (WITH_GLOO AND (WITH_NCCL OR WITH_RCCL)) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) cc_test(heter_ccl_context_test SRCS heter_ccl_context_test.cc DEPS heter_ccl_context nccl_context imperative_gloo_context gloo_context gloo_wrapper gloo fs shell) #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc index c40a5fc52ceb86..d36743510e5ba3 100644 --- a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -79,7 +79,7 @@ void AllReduceByStream(int local_rank, int device_id) { } TEST(AllReduceByStream, Run) { - if (platform::GetCUDADeviceCount() >= 2) { + if (platform::GetGPUDeviceCount() >= 2) { std::thread t0(AllReduceByStream, 0, 0); std::thread t1(AllReduceByStream, 1, 1); t0.join(); diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index b56444104f2779..401e4e324eb892 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -111,7 +111,7 @@ void Broadcast(int local_rank, int device_id) { } TEST(Broadcast, Run) { - if (platform::GetCUDADeviceCount() >= 2) { + if (platform::GetGPUDeviceCount() >= 2) { std::thread t0(Broadcast, 0, 0); std::thread t1(Broadcast, 1, 1); t0.join(); From 7b1bb874041aa1c1470ddac2c4c91585b69dc343 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 6 Dec 2021 15:17:52 +0800 Subject: [PATCH 076/124] test inference_api_test when run in windows-inference ci (#37710) * test inference_api_test when run in windows-inference ci * test if test failed the code run correctly * put the failed test back --- .../ir/inference/test_trt_pool_op.py | 6 ++-- tools/windows/run_unittests.sh | 34 ++++++++++++++++--- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py index 3d317446f00f3e..26ad45db7a18d6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool_op.py @@ -27,9 +27,9 @@ class TensorRTPoolTest(InferencePassTest): def setUp(self): self.bs = 1 - 
self.channel = 3 - self.height = 8 - self.width = 8 + self.channel = 2 + self.height = 2 + self.width = 2 self.pool_size = 2 self.pool_type = 'max' self.pool_stride = 1 diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index a6aac30a374499..4d62a9a88e1f5c 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -53,10 +53,6 @@ if [ -f "$PADDLE_ROOT/added_ut" ];then echo "========================================" exit 8; fi - if nvcc --version | grep 11.2; then - echo "Only test added_ut temporarily when running in CI-Windows-inference of CUDA 11.2." - exit 0; - fi fi set -e @@ -107,7 +103,6 @@ disable_win_trt_test="^test_trt_convert_conv2d$|\ ^test_trt_convert_emb_eltwise_layernorm$|\ ^test_trt_convert_pool2d$|\ ^test_trt_conv3d_op$|\ -^test_trt_matmul_quant_dequant$|\ ^test_trt_subgraph_pass$|\ ^test_trt_convert_dropout$|\ ^test_trt_convert_hard_sigmoid$|\ @@ -121,6 +116,16 @@ disable_win_trt_test="^test_trt_convert_conv2d$|\ ^test_trt_convert_matmul$|\ ^test_trt_convert_scale$" +# /*==================Fixed Disabled Windows GPU inference_api_test unittests==============================*/ +disable_win_inference_api_test="^test_analyzer_capi_exp_pd_config$|\ +^trt_quant_int8_yolov3_r50_test$|\ +^test_trt_dynamic_shape_ernie$|\ +^test_trt_dynamic_shape_ernie_fp16_ser_deser$|\ +^lite_resnet50_test$|\ +^test_trt_dynamic_shape_transformer_prune$|\ +^lite_mul_model_test$|\ +^paddle_infer_api_copy_tensor_tester$" + # /*============================================================================*/ # /*==================Fixed Disabled Windows CPU OPENBLAS unittests==============================*/ @@ -180,6 +185,7 @@ long_time_test="^test_gru_op$|\ ^test_transformer$|\ ^test_imperative_auto_mixed_precision$|\ ^test_imperative_optimizer_v2$|\ +^test_trt_matmul_quant_dequant$|\ ^test_strided_slice_op$" if [ ${WITH_GPU:-OFF} == "ON" ];then @@ -331,6 +337,24 @@ function show_ut_retry_result() { set +e export FLAGS_call_stack_level=2 + +if nvcc --version | grep 11.2; then + echo "Only test added_ut and inference_api_test temporarily when running in CI-Windows-inference of CUDA 11.2." 
+ export CUDA_VISIBLE_DEVICES=0 + tmpfile=$tmp_dir/$RANDOM + inference_api_test=^$(ls "paddle/fluid/inference/tests/api" | sed -n 's/\.exe$//pg' | awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' | sed 's/|\^$//g') + (ctest -R "$inference_api_test" -E "$disable_win_inference_api_test" --output-on-failure -C Release -j 2 | tee $tmpfile ) & + wait; + collect_failed_tests + set -e + rm -f $tmp_dir/* + if [[ "$failed_test_lists" != "" ]]; then + unittests_retry + show_ut_retry_result + fi + exit 0; +fi + if [ "${WITH_GPU:-OFF}" == "ON" ];then run_unittest_gpu $cpu_parallel_job 10 run_unittest_gpu $tetrad_parallel_job 4 From f5609de4d72b3d18ebc93de74a8c83a93629a6e9 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Mon, 6 Dec 2021 17:02:13 +0800 Subject: [PATCH 077/124] [fleet_executor] remove should reset (#37862) --- .../fleet_executor/compute_interceptor.cc | 31 ++----------------- .../fleet_executor/compute_interceptor.h | 1 - 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 41c77c1ead045f..6a4fadd1304363 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -64,7 +64,7 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) { // source node has no upstream, data_is_ready is send by carrier or others if (is_source_ && up_id == -1) { - it->second.second = GetTaskNode()->max_run_times(); + it->second.second += GetTaskNode()->max_run_times(); return; } @@ -121,16 +121,6 @@ bool ComputeInterceptor::CanWriteOutput() { return true; } -// only source node need reset -bool ComputeInterceptor::ShouldReset() { - if (is_source_ && step_ == node_->max_run_times()) { - VLOG(3) << "Interceptor " << GetInterceptorId() - << " should reset for step: " << step_ << "."; - return true; - } - return false; -} - void ComputeInterceptor::SendDataReadyToDownStream() { for (auto& outs : out_buffs_) { auto down_id = outs.first; @@ -186,24 +176,7 @@ void ComputeInterceptor::RunOps() { } void ComputeInterceptor::Run() { - // If there is no limit, source interceptor can be executed - // an unlimited number of times. - // Now source node can only run max_run_times. - if (ShouldReset()) { - for (auto& out_buff : out_buffs_) { - // buffer is using - if (out_buff.second.second != 0) { - VLOG(3) << "Interceptor " << GetInterceptorId() - << " out buffer for downstream: " << out_buff.first - << "'s counter is: " << out_buff.second.second - << ". 
Cannot be reset."; - return; - } - } - step_ = 0; // reset - } - - while (IsInputReady() && CanWriteOutput() && !ShouldReset()) { + while (IsInputReady() && CanWriteOutput()) { VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running"; RunOps(); diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index ae253f844aab4e..fb82ce76c7bdb8 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -39,7 +39,6 @@ class ComputeInterceptor : public Interceptor { void DecreaseBuff(int64_t down_id); bool IsInputReady(); bool CanWriteOutput(); - bool ShouldReset(); void Run(); void Compute(const InterceptorMessage& msg); From 6ff19d6678f30bd1fc5307fb24a58ff554d38e42 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Mon, 6 Dec 2021 17:27:42 +0800 Subject: [PATCH 078/124] [New API]add rot90 api (#37634) * update * update. test=develop * fix. test=develop * fix ut. test=develop * fix ut. test=develop * fix ut. test=develop * update. test=develop * fix ut. test=develop * fix ut. test=develop * fix sample code. test=develop * fix ut. test=develop * fix ut. test=develop * fix ut. test=develop * fix ut. test=develop * fix paddle.rot90 doc. test=develop * update ut. test=develop * fix. test=develop * fix .test=develop * fix .test=develop * fix doc. test=develop --- .../framework/heter_pipeline_trainer_test.cc | 2 - python/paddle/__init__.py | 2 + .../fluid/tests/unittests/test_rot90_op.py | 262 ++++++++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/manipulation.py | 86 ++++++ 5 files changed, 352 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_rot90_op.py diff --git a/paddle/fluid/framework/heter_pipeline_trainer_test.cc b/paddle/fluid/framework/heter_pipeline_trainer_test.cc index af8eca32ee2f4e..417c7685bcbeb4 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer_test.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer_test.cc @@ -115,8 +115,6 @@ TEST(HeterPipelineTrainerTest, GPU) { t3.add_trainers(1); t3.add_trainers(1); t3.add_trainers(1); - t3.add_dump_fields("hello"); - t3.add_dump_param("fc_0"); auto* heter_section_param3 = t3.mutable_heter_section_param(); heter_section_param3->set_num_pipeline_stages(3); heter_section_param3->set_pipeline_stage(2); diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index c37c331bae4a6e..661cd495b53e8c 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -150,6 +150,7 @@ from .tensor.manipulation import unsqueeze_ # noqa: F401 from .tensor.manipulation import unstack # noqa: F401 from .tensor.manipulation import flip # noqa: F401 +from .tensor.manipulation import rot90 # noqa: F401 from .tensor.manipulation import unbind # noqa: F401 from .tensor.manipulation import roll # noqa: F401 from .tensor.manipulation import chunk # noqa: F401 @@ -408,6 +409,7 @@ 'bitwise_not', 'mm', 'flip', + 'rot90', 'bincount', 'histogram', 'multiplex', diff --git a/python/paddle/fluid/tests/unittests/test_rot90_op.py b/python/paddle/fluid/tests/unittests/test_rot90_op.py new file mode 100644 index 00000000000000..4ab7c4f14f96ba --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rot90_op.py @@ -0,0 +1,262 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard + + +class TestRot90_API(unittest.TestCase): + """Test rot90 api.""" + + def test_static_graph(self): + paddle.enable_static() + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=1, axes=[0, 1]) + output = paddle.rot90(output, k=1, axes=[0, 1]) + output = output.rot90(k=1, axes=[0, 1]) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + out_np = np.array(res[0]) + out_ref = np.array([[4, 1], [5, 2], [6, 3]]).astype(np.float32) + + self.assertTrue( + (out_np == out_ref).all(), + msg='rot90 output is wrong, out =' + str(out_np)) + + def test_static_k_0(self): + paddle.enable_static() + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=0, axes=[0, 1]) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + out_np = np.array(res[0]) + out_ref = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + + self.assertTrue( + (out_np == out_ref).all(), + msg='rot90 output is wrong, out =' + str(out_np)) + + def test_static_k_2(self): + paddle.enable_static() + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=2, axes=[0, 1]) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + out_np = np.array(res[0]) + out_ref = np.array([[6, 5, 4], [3, 2, 1]]).astype(np.float32) + + self.assertTrue( + (out_np == out_ref).all(), + msg='rot90 output is wrong, out =' + str(out_np)) + + def test_static_k_3(self): + paddle.enable_static() + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + 
startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=3, axes=[0, 1]) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + out_np = np.array(res[0]) + out_ref = np.array([[4, 1], [5, 2], [6, 3]]).astype(np.float32) + + self.assertTrue( + (out_np == out_ref).all(), + msg='rot90 output is wrong, out =' + str(out_np)) + + def test_static_neg_k_1(self): + paddle.enable_static() + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=-1, axes=[0, 1]) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + out_np = np.array(res[0]) + out_ref = np.array([[4, 1], [5, 2], [6, 3]]).astype(np.float32) + + self.assertTrue( + (out_np == out_ref).all(), + msg='rot90 output is wrong, out =' + str(out_np)) + + def test_static_neg_k_2(self): + paddle.enable_static() + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=-2, axes=[0, 1]) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + out_np = np.array(res[0]) + out_ref = np.array([[6, 5, 4], [3, 2, 1]]).astype(np.float32) + + self.assertTrue( + (out_np == out_ref).all(), + msg='rot90 output is wrong, out =' + str(out_np)) + + def test_static_neg_k_3(self): + paddle.enable_static() + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=-3, axes=[0, 1]) + place = fluid.CPUPlace() + if fluid.core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + res = exe.run(train_program, + feed={'input': img}, + fetch_list=[output]) + + out_np = np.array(res[0]) + out_ref = np.array([[3, 6], [2, 5], [1, 4]]).astype(np.float32) + + self.assertTrue( + (out_np == out_ref).all(), + msg='rot90 output is wrong, out =' + str(out_np)) + + def test_error_api(self): + paddle.enable_static() + + ## dims error + def run1(): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=1, axes=[0]) + + self.assertRaises(ValueError, 
run1) + + ## input dims error + def run2(): + input = fluid.data(name='input', dtype='float32', shape=[2]) + output = paddle.rot90(input, k=1, axes=[0, 1]) + + self.assertRaises(ValueError, run2) + + def run3(): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=1, axes=[0, 0]) + + self.assertRaises(ValueError, run3) + + def run4(): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=1, axes=[3, 1]) + + self.assertRaises(ValueError, run4) + + def run5(): + input = fluid.data(name='input', dtype='float32', shape=[2, 3]) + output = paddle.rot90(input, k=1, axes=[0, 3]) + + self.assertRaises(ValueError, run5) + + def test_dygraph(self): + img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + with fluid.dygraph.guard(): + inputs = fluid.dygraph.to_variable(img) + + ret = paddle.rot90(inputs, k=1, axes=[0, 1]) + ret = ret.rot90(1, axes=[0, 1]) + ret = paddle.rot90(ret, k=1, axes=[0, 1]) + out_ref = np.array([[4, 1], [5, 2], [6, 3]]).astype(np.float32) + + self.assertTrue( + (ret.numpy() == out_ref).all(), + msg='rot90 output is wrong, out =' + str(ret.numpy())) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 7cc2c7623a9ff6..793fdb89d06776 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -106,6 +106,7 @@ from .manipulation import unsqueeze_ # noqa: F401 from .manipulation import unstack # noqa: F401 from .manipulation import flip # noqa: F401 +from .manipulation import rot90 # noqa: F401 from .manipulation import unbind # noqa: F401 from .manipulation import roll # noqa: F401 from .manipulation import chunk # noqa: F401 @@ -370,6 +371,7 @@ 'unsqueeze_', 'unstack', 'flip', + 'rot90', 'unbind', 'roll', 'tile', diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 9b9b2d9431eeb4..f48e5a3a764cd3 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -495,6 +495,92 @@ def flip(x, axis, name=None): return out +def rot90(x, k=1, axes=[0, 1], name=None): + """ + Rotate a n-D tensor by 90 degrees in the plane specified by dims axis. Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0. + + Args: + x (Tensor): The input Tensor(or LoDTensor). The data type of the input Tensor x + should be float32, float64, int32, int64, bool. + k (int): Number of times to rotate + axes (list|tuple): Axis to rotate + name (str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor: Tensor or LoDTensor calculated by rot90 layer. The data type is same with input x. + + Raises: + TypeError: If the data type of ``x`` is not Variable + TypeError: If the dtype of ``x`` is not float16, float32, float64, int32, int64, bool + TypeError: If the data type of ``dims`` is not list, tuple + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + data = paddle.arange(4) + data = paddle.reshape(data, (2, 2)) + print(data) ## [[0, 1],[2, 3]] + y = paddle.rot90(data, 1, [0, 1]) + print(y) #[[1, 3],[0, 2]] + y= paddle.rot90(data, -1, [0, 1]) + print(y) #[[2, 0],[3, 1]] + data2 = paddle.arange(8) + data2 = paddle.reshape(data2, (2,2,2)) + print(data2) ###[[[0, 1],[2, 3]],[[4, 5],[6, 7]]] + y = paddle.rot90(data2, 1, [1, 2]) + print(y) ### [[[1, 3],[0, 2]],[[5, 7],[4, 6]]] + """ + + helper = LayerHelper("rot90", **locals()) + check_type(x, 'X', (Variable), 'rot90') + dtype = helper.input_dtype('x') + check_dtype(dtype, 'X', + ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], + 'rot90') + check_type(axes, 'axes', (list, tuple), 'rot90') + + input_total_dims = len(x.shape) + total_rot_dims = len(axes) + if total_rot_dims != 2: + raise ValueError("expected total rotation axes == 2, but got axes = {}". + format(total_rot_dims)) + if input_total_dims < 2: + raise ValueError("expected total dims >= 2, but got total dims = {}". + format(input_total_dims)) + + if not (axes[0] != axes[1] and abs(axes[0] - axes[1]) != input_total_dims): + raise ValueError( + "expected rotation axes to be different, but got axis0 = {}, and axis1 = {}". + format(axes[0], axes[1])) + + if not (axes[0] < input_total_dims and axes[0] >= -input_total_dims): + raise ValueError("Rotation axis0 out of range, axis0 = {}".format(axes[ + 0])) + if not (axes[1] < input_total_dims and axes[1] >= -input_total_dims): + raise ValueError("Rotation axis1 out of range, axis1 = {}".format(axes[ + 1])) + + ## k % 4 + k = k % 4 if k >= 0 else 4 - (-k % 4) + if k == 0: + return x + if k == 2: + return flip(flip(x, axes[0]), axes[1]) + + axes_list = list(range(0, input_total_dims)) + (axes_list[axes[0]], axes_list[axes[1]]) = (axes_list[axes[1]], + axes_list[axes[0]]) + if k == 1: + return transpose(flip(x, axes[1]), axes_list) + else: + # k == 3 + return flip(transpose(x, axes_list), axes[1]) + + def flatten(x, start_axis=0, stop_axis=-1, name=None): r""" **Flatten op** From a73064f2e237b9f05e4e932f0ae5fc04da9877e1 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Mon, 6 Dec 2021 18:01:10 +0800 Subject: [PATCH 079/124] pylayer support tuple/list type args (#37727) --- paddle/fluid/imperative/py_layer_fwd.h | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index 1baf73ab3b95da..79251d7bf7ad6b 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -101,6 +101,28 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, "`%s` type argument can not be cast into `Tensor`.", ptr->ptr()->ob_type->tp_name)); } + } else if (py::isinstance(*ptr) || + py::isinstance(*ptr)) { + try { + auto tuple_arg = ptr->cast(); + for (auto iter = tuple_arg.begin(); iter != tuple_arg.end(); ++iter) { + try { + auto t = iter->cast>(); + input_vars.push_back(t); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, " + "the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); + } + } + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); + } } } } @@ -119,6 
+141,28 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, "`%s` type argument can not be cast into `Tensor`.", ptr->second.ptr()->ob_type->tp_name)); } + } else if (py::isinstance(*ptr->second) || + py::isinstance(*ptr->second)) { + try { + auto tuple_arg = ptr->second.cast(); + for (auto iter = tuple_arg.begin(); iter != tuple_arg.end(); ++iter) { + try { + auto t = iter->cast>(); + input_vars.push_back(t); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, " + "the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); + } + } + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); + } } } } From 66cd0bbb9d224cfba9fa2970755a1c7f0addb846 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 6 Dec 2021 05:36:30 -0600 Subject: [PATCH 080/124] rename pd_dll_decl to paddle_api (#37876) --- paddle/pten/api/ext/dll_decl.h | 10 +-- paddle/pten/api/ext/op_meta_info.h | 8 +- paddle/pten/api/include/tensor.h | 2 +- paddle/pten/api/include/utils.h | 2 +- paddle/pten/api/lib/api_registry.h | 6 +- paddle/pten/api/lib/tensor.cc | 114 ++++++++++++------------ paddle/pten/api/lib/utils.cc | 2 +- python/paddle/utils/code_gen/api_gen.py | 6 +- 8 files changed, 75 insertions(+), 75 deletions(-) diff --git a/paddle/pten/api/ext/dll_decl.h b/paddle/pten/api/ext/dll_decl.h index 3dbea5e6dffc27..37c637c102c3b8 100644 --- a/paddle/pten/api/ext/dll_decl.h +++ b/paddle/pten/api/ext/dll_decl.h @@ -15,13 +15,13 @@ #pragma once #if defined(_WIN32) -#ifndef PD_DLL_DECL +#ifndef PADDLE_API #ifdef PADDLE_DLL_EXPORT -#define PD_DLL_DECL __declspec(dllexport) +#define PADDLE_API __declspec(dllexport) #else -#define PD_DLL_DECL __declspec(dllimport) +#define PADDLE_API __declspec(dllimport) #endif // PADDLE_DLL_EXPORT -#endif // PD_DLL_DECL +#endif // PADDLE_API #else -#define PD_DLL_DECL +#define PADDLE_API #endif // _WIN32 diff --git a/paddle/pten/api/ext/op_meta_info.h b/paddle/pten/api/ext/op_meta_info.h index 140874f93aed72..351e88b57bd8b8 100644 --- a/paddle/pten/api/ext/op_meta_info.h +++ b/paddle/pten/api/ext/op_meta_info.h @@ -33,7 +33,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class PD_DLL_DECL OpMetaInfoHelper; +class PADDLE_API OpMetaInfoHelper; } // namespace framework using Tensor = paddle::Tensor; @@ -425,7 +425,7 @@ struct InferDtypeFuncImpl { ////////////////////// Op Meta Info ////////////////////// -class PD_DLL_DECL OpMetaInfo { +class PADDLE_API OpMetaInfo { public: explicit OpMetaInfo(const std::string& op_name) : name_(op_name) {} @@ -464,7 +464,7 @@ class PD_DLL_DECL OpMetaInfo { //////////////// Op Meta Info Map ///////////////// -class PD_DLL_DECL OpMetaInfoMap { +class PADDLE_API OpMetaInfoMap { public: // this function's impl should keep in header file. 
// if move to cc file, meta info can not be added @@ -488,7 +488,7 @@ class PD_DLL_DECL OpMetaInfoMap { //////////////// Op Meta Info Builder ///////////////// -class PD_DLL_DECL OpMetaInfoBuilder { +class PADDLE_API OpMetaInfoBuilder { public: explicit OpMetaInfoBuilder(std::string&& name, size_t index); OpMetaInfoBuilder& Inputs(std::vector&& inputs); diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index 3eb1f89225414e..6693dbf78f4d63 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -84,7 +84,7 @@ class AbstractAutogradMeta { * another simple Tensor design may be required for inference. */ -class PD_DLL_DECL Tensor final { +class PADDLE_API Tensor final { public: /* Part 1: Construction and destruction methods */ diff --git a/paddle/pten/api/include/utils.h b/paddle/pten/api/include/utils.h index c038e503d47bb7..b8b955090b99a8 100644 --- a/paddle/pten/api/include/utils.h +++ b/paddle/pten/api/include/utils.h @@ -21,7 +21,7 @@ namespace paddle { namespace experimental { // TODO(chenweihang): Replace backend by place when place is ready -PD_DLL_DECL Tensor copy_to(const Tensor& x, Backend backend, bool blocking); +PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking); } // namespace experimental } // namespace paddle diff --git a/paddle/pten/api/lib/api_registry.h b/paddle/pten/api/lib/api_registry.h index abb31451d522eb..d75774a1a12be9 100644 --- a/paddle/pten/api/lib/api_registry.h +++ b/paddle/pten/api/lib/api_registry.h @@ -37,10 +37,10 @@ namespace experimental { // use to declare symbol #define PT_REGISTER_API(name) \ - PD_DLL_DECL int RegisterSymbolsFor##name() { return 0; } + PADDLE_API int RegisterSymbolsFor##name() { return 0; } -#define PT_DECLARE_API(name) \ - extern PD_DLL_DECL int RegisterSymbolsFor##name(); \ +#define PT_DECLARE_API(name) \ + extern PADDLE_API int RegisterSymbolsFor##name(); \ UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name() } // namespace experimental diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 6b4a3b1950a98c..f6cccf0b357ce1 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -159,19 +159,19 @@ T *Tensor::mutable_data() { return nullptr; } -template PD_DLL_DECL float *Tensor::mutable_data(); -template PD_DLL_DECL double *Tensor::mutable_data(); -template PD_DLL_DECL int64_t *Tensor::mutable_data(); -template PD_DLL_DECL int32_t *Tensor::mutable_data(); -template PD_DLL_DECL uint8_t *Tensor::mutable_data(); -template PD_DLL_DECL int8_t *Tensor::mutable_data(); -template PD_DLL_DECL int16_t *Tensor::mutable_data(); -template PD_DLL_DECL bool *Tensor::mutable_data(); -template PD_DLL_DECL paddle::platform::complex +template PADDLE_API float *Tensor::mutable_data(); +template PADDLE_API double *Tensor::mutable_data(); +template PADDLE_API int64_t *Tensor::mutable_data(); +template PADDLE_API int32_t *Tensor::mutable_data(); +template PADDLE_API uint8_t *Tensor::mutable_data(); +template PADDLE_API int8_t *Tensor::mutable_data(); +template PADDLE_API int16_t *Tensor::mutable_data(); +template PADDLE_API bool *Tensor::mutable_data(); +template PADDLE_API paddle::platform::complex *Tensor::mutable_data>(); -template PD_DLL_DECL paddle::platform::complex +template PADDLE_API paddle::platform::complex *Tensor::mutable_data>(); -template PD_DLL_DECL paddle::platform::float16 * +template PADDLE_API paddle::platform::float16 * Tensor::mutable_data(); template @@ -185,25 +185,25 @@ T 
*Tensor::mutable_data(const PlaceType &place) { return mutable_data(); } -template PD_DLL_DECL float *Tensor::mutable_data(const PlaceType &place); -template PD_DLL_DECL double *Tensor::mutable_data( +template PADDLE_API float *Tensor::mutable_data(const PlaceType &place); +template PADDLE_API double *Tensor::mutable_data( const PlaceType &place); -template PD_DLL_DECL int64_t *Tensor::mutable_data( +template PADDLE_API int64_t *Tensor::mutable_data( const PlaceType &place); -template PD_DLL_DECL int32_t *Tensor::mutable_data( +template PADDLE_API int32_t *Tensor::mutable_data( const PlaceType &place); -template PD_DLL_DECL uint8_t *Tensor::mutable_data( +template PADDLE_API uint8_t *Tensor::mutable_data( const PlaceType &place); -template PD_DLL_DECL int8_t *Tensor::mutable_data( +template PADDLE_API int8_t *Tensor::mutable_data( const PlaceType &place); -template PD_DLL_DECL int16_t *Tensor::mutable_data( +template PADDLE_API int16_t *Tensor::mutable_data( const PlaceType &place); -template PD_DLL_DECL bool *Tensor::mutable_data(const PlaceType &place); -template PD_DLL_DECL paddle::platform::complex * +template PADDLE_API bool *Tensor::mutable_data(const PlaceType &place); +template PADDLE_API paddle::platform::complex * Tensor::mutable_data>(const PlaceType &place); -template PD_DLL_DECL paddle::platform::complex * +template PADDLE_API paddle::platform::complex * Tensor::mutable_data>(const PlaceType &place); -template PD_DLL_DECL paddle::platform::float16 * +template PADDLE_API paddle::platform::float16 * Tensor::mutable_data(const PlaceType &place); template @@ -214,22 +214,22 @@ const T *Tensor::data() const { return nullptr; } -template PD_DLL_DECL const float *Tensor::data() const; -template PD_DLL_DECL const double *Tensor::data() const; -template PD_DLL_DECL const int64_t *Tensor::data() const; -template PD_DLL_DECL const int32_t *Tensor::data() const; -template PD_DLL_DECL const uint8_t *Tensor::data() const; -template PD_DLL_DECL const int8_t *Tensor::data() const; -template PD_DLL_DECL const int16_t *Tensor::data() const; -template PD_DLL_DECL const uint16_t *Tensor::data() const; -template PD_DLL_DECL const bool *Tensor::data() const; -template PD_DLL_DECL const paddle::platform::complex +template PADDLE_API const float *Tensor::data() const; +template PADDLE_API const double *Tensor::data() const; +template PADDLE_API const int64_t *Tensor::data() const; +template PADDLE_API const int32_t *Tensor::data() const; +template PADDLE_API const uint8_t *Tensor::data() const; +template PADDLE_API const int8_t *Tensor::data() const; +template PADDLE_API const int16_t *Tensor::data() const; +template PADDLE_API const uint16_t *Tensor::data() const; +template PADDLE_API const bool *Tensor::data() const; +template PADDLE_API const paddle::platform::complex *Tensor::data>() const; -template PD_DLL_DECL const paddle::platform::complex +template PADDLE_API const paddle::platform::complex *Tensor::data>() const; -template PD_DLL_DECL const paddle::platform::float16 * +template PADDLE_API const paddle::platform::float16 * Tensor::data() const; -template PD_DLL_DECL const paddle::platform::bfloat16 * +template PADDLE_API const paddle::platform::bfloat16 * Tensor::data() const; template @@ -241,19 +241,19 @@ T *Tensor::data() { return nullptr; } -template PD_DLL_DECL float *Tensor::data(); -template PD_DLL_DECL double *Tensor::data(); -template PD_DLL_DECL int64_t *Tensor::data(); -template PD_DLL_DECL int32_t *Tensor::data(); -template PD_DLL_DECL uint8_t *Tensor::data(); -template 
PD_DLL_DECL int8_t *Tensor::data(); -template PD_DLL_DECL int16_t *Tensor::data(); -template PD_DLL_DECL bool *Tensor::data(); -template PD_DLL_DECL paddle::platform::complex +template PADDLE_API float *Tensor::data(); +template PADDLE_API double *Tensor::data(); +template PADDLE_API int64_t *Tensor::data(); +template PADDLE_API int32_t *Tensor::data(); +template PADDLE_API uint8_t *Tensor::data(); +template PADDLE_API int8_t *Tensor::data(); +template PADDLE_API int16_t *Tensor::data(); +template PADDLE_API bool *Tensor::data(); +template PADDLE_API paddle::platform::complex *Tensor::data>(); -template PD_DLL_DECL paddle::platform::complex +template PADDLE_API paddle::platform::complex *Tensor::data>(); -template PD_DLL_DECL paddle::platform::float16 * +template PADDLE_API paddle::platform::float16 * Tensor::data(); // TODO(chenweihang): replace slice impl by API @@ -294,27 +294,27 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const { return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); } -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor Tensor::copy_to>( +template PADDLE_API Tensor Tensor::copy_to>( const PlaceType &target_place) const; -template PD_DLL_DECL Tensor Tensor::copy_to>( +template PADDLE_API Tensor Tensor::copy_to>( const PlaceType &target_place) const; -template PD_DLL_DECL Tensor +template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; Tensor Tensor::copy_to(Backend backend, bool blocking) const { diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index 0948ba5d698a67..e17b19d9f689e1 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -34,7 +34,7 @@ PT_DECLARE_MODULE(UtilsCUDA); namespace paddle { namespace experimental { -PD_DLL_DECL Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { +PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { // 1. 
Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index cd81d001b8f6b0..5506ee95bd7c9e 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -111,7 +111,7 @@ def parse_args(self, args_str): def gene_api_declaration(self): return f""" -PD_DLL_DECL {self.output} {self.api}({self.args['args_declare']}); +PADDLE_API {self.output} {self.api}({self.args['args_declare']}); """ def gene_kernel_select(self, input_names, attrs, kernel): @@ -312,7 +312,7 @@ def gene_kernel_context(self, input_names, attrs, infer_meta, kernel_param): def gene_api_code(self): if self.is_base_api: return f""" -PD_DLL_DECL {self.output} {self.api}({self.args["args_define"]}) {{ +PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{ {self.gene_kernel_select(self.args['inputs']['names'], self.args['attrs'], self.kernel)} {self.gene_kernel_context(self.args['inputs']['names'], self.args['attrs'], self.infer_meta, self.kernel['param'])} @@ -323,7 +323,7 @@ def gene_api_code(self): else: return f""" -PD_DLL_DECL {self.output} {self.api}({self.args["args_define"]}) {{ +PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{ return {self.invoke}; }} """ From 224014268e4da3c3dcfcb8f6ce03b0f9be386a4a Mon Sep 17 00:00:00 2001 From: heliqi Date: Mon, 6 Dec 2021 20:36:01 +0800 Subject: [PATCH 081/124] add test_unsqueeze2_eltwise_fuse_pass (#37647) * add test_unsqueeze2_eltwise_fuse_pass * fix name conflictn * rebuild CI --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../test_unsqueeze2_eltwise_fuse_pass.py | 103 ++++++++++++++++++ tools/parallel_UT_rule.py | 6 +- 3 files changed, 107 insertions(+), 4 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e384cb46337947..b98a2288682660 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -161,7 +161,7 @@ cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DE cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) cc_test(test_adaptive_pool2d_convert_global_pass SRCS adaptive_pool2d_convert_global_pass_tester.cc DEPS adaptive_pool2d_convert_global_pass) -cc_test(test_unsqueeze2_eltwise_fuse_pass SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass) +cc_test(test_unsqueeze2_eltwise_fuse_pass_cc SRCS unsqueeze2_eltwise_fuse_pass_tester.cc DEPS unsqueeze2_eltwise_fuse_pass) cc_test(test_layer_norm_fuse_pass_cc SRCS layer_norm_fuse_pass_tester.cc DEPS layer_norm_fuse_pass pass_test_util naive_executor) cc_test(test_generate_pass_cc SRCS generate_pass_tester.cc DEPS generate_pass pass_desc_proto) if(WITH_GPU OR WITH_ROCM) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py new file mode 100644 index 00000000000000..81acd9856cf242 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_unsqueeze2_eltwise_fuse_pass.py @@ -0,0 +1,103 @@ +# Copyright (c) 
2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestUnsqueezeEltwiseFusePass(PassAutoScanTest): + """ + y_var + | + unsqueeze2 + \ + unsqueeze2_out_var x_var + \ / + elementwise_mul + """ + + def sample_predictor_configs(self, program_config): + # TRT + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=10, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + yield config, ['elementwise_mul', ], (1e-5, 1e-5) + + def sample_program_config(self, draw): + # 1. Generate shape and attr of mul + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=10), min_size=4, max_size=4)) + axis = -1 + + # 2. Generate legal shape and attr of input:Y of unsqueeze2 + y_shape = x_shape[:2] + unsqueeze2_axes = [2, 3] + + unsqueeze2_op = OpConfig( + "unsqueeze2", + inputs={ + "X": ["unsqueeze2_x"], + "AxesTensor": [], + "AxesTensorList": [] + }, + axes=unsqueeze2_axes, + outputs={"Out": ["unsqueeze2_out"], + "XShape": ["xshape"]}, ) + mul_op = OpConfig( + "elementwise_mul", + inputs={"Y": ["unsqueeze2_out"], + "X": ["mul_x"]}, + axis=axis, + outputs={"Out": ["mul_out"]}, ) + + ops = [ + unsqueeze2_op, + mul_op, + ] + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "mul_x": TensorConfig(shape=x_shape), + "unsqueeze2_x": TensorConfig(shape=y_shape), + }, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["unsqueeze2_eltwise_fuse_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index b38a305c1b8230..0d106102aa2675 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -136,7 +136,7 @@ 'test_conv_concat_relu_mkldnn_fuse_pass', 'test_bf16_utils', 'test_sum_bf16_mkldnn_op', - 'test_unsqueeze2_eltwise_fuse_pass', + 'test_unsqueeze2_eltwise_fuse_pass_cc', 'dense_table_test', 'test_collective_optimizer', 'test_origin_info', @@ -1715,7 +1715,7 @@ 'test_recv_save_op', 'heter_listen_and_server_test', 'test_analyzer_capi_ner', - 'test_unsqueeze2_eltwise_fuse_pass', + 'test_unsqueeze2_eltwise_fuse_pass_cc', 'test_dgc_optimizer', 'test_fleet_cc', 'test_repeated_fc_relu_fuse_pass_cc', @@ -1775,7 +1775,7 @@ 'test_fc_gru_fuse_pass_cc', 'test_conv_bn_fuse_pass_cc', 'test_adaptive_pool2d_convert_global_pass', - 'test_unsqueeze2_eltwise_fuse_pass', + 'test_unsqueeze2_eltwise_fuse_pass_cc', 
'test_layer_norm_fuse_pass_cc', 'test_fc_act_mkldnn_fuse_pass', 'test_fleet_cc', From 3e33ef5a63a858935341a2686ba210fd425accd1 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 6 Dec 2021 20:45:45 +0800 Subject: [PATCH 082/124] Update CINN tag (#37870) 1. Modify git tag for CINN 2. Support compile option "-DWITH_CINN=ON, -DWITH_TESTING=OFF" --- cmake/external/cinn.cmake | 3 +-- .../framework/paddle2cinn/CMakeLists.txt | 26 ++++++++++--------- paddle/fluid/operators/CMakeLists.txt | 6 +++-- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 581a5f93768d03..41b90345c8c5f3 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,8 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -# TODO(zhhsplendid): Modify git tag after we have release tag -set(CINN_GIT_TAG develop) +set(CINN_GIT_TAG release/v0.1) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index b80a265f8a41b3..b13166cff60aa3 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -4,20 +4,22 @@ cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn) cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn) -cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) -set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") +if (WITH_TESTING) + cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) + set_tests_properties(cinn_lib_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) -set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) + set_tests_properties(cinn_cache_key_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) -set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) + set_tests_properties(build_cinn_pass_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) -set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) + set_tests_properties(transform_desc_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) -set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") + cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) + set_tests_properties(cinn_graph_symbolization_test PROPERTIES LABELS "RUN_TYPE=CINN") -cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) -set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") + 
cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) + set_tests_properties(cinn_compiler_test PROPERTIES LABELS "RUN_TYPE=CINN") +endif() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 0c3572ab655381..f0621af9bbda5c 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -169,8 +169,10 @@ endif() if (WITH_CINN) op_library(cinn_launch_op SRCS cinn_launch_op.cc cinn_launch_op.cu.cc DEPS transform_desc cinn_compiler cinn ${OP_HEADER_DEPS}) - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) - set_tests_properties(cinn_launch_op_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + if (WITH_TESTING) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + set_tests_properties(cinn_launch_op_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + endif() endif() # FIXME(typhoonzero): operator deps may not needed. From 828f87aecd8a47d19f19f0a83155f8dd340eeaa9 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Mon, 6 Dec 2021 20:59:55 +0800 Subject: [PATCH 083/124] sharding_stage2_pfp16 (#37836) --- .../sharding_optimizer_stage2.py | 14 +++++++++ .../meta_parallel/sharding/sharding_stage2.py | 11 +++++-- .../unittests/dygraph_sharding_stage2.py | 31 +++++++++---------- 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index ba1b5222394e2d..ffd24add50a4d0 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -83,8 +83,14 @@ def __init__(self, # Default information self._optim_defaults = kw self._optim = optim + assert hasattr(self._optim, "_master_weights" + ), "Must use optimizer with _master_weights attribute" self._local_params = params self._default_device = device + self._pfp16 = len( + list( + filter(lambda x: x.trainable and x.dtype == Type.fp16.value, + self._local_params))) > 0 assert group is not None, "Distributed communication group is must be gived" self.group = group @@ -98,6 +104,12 @@ def __init__(self, # Update optimizer parameters and adjust parameter storage and use according to rank. self.update_opt_status() + def _generate_master_params(self, trainable_params): + for param in trainable_params: + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.name] = paddle.cast( + param, Type.fp32.value) + def update_opt_status(self): """Update optimizer status and parameter storage information, and special functions to be developed. 
""" @@ -207,6 +219,8 @@ def _integration_params(self): # Merge all the trainable params in a single InternalStorage trainable_params = list( filter(lambda x: x.trainable, params)) + if self._pfp16 and dst_rank == self.rank: + self._generate_master_params(trainable_params) if trainable_params: param_storage = ParamStorage( size=self.rank_buffer_size[dtype][dst_rank], diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 8ac4a7e99c7d71..329dc9eaa4e575 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -30,6 +30,7 @@ import paddle.distributed as dist from ...utils.internal_storage import GradStorage +from ...meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from .sharding_utils import Taskflow, Type @@ -70,6 +71,11 @@ def __init__( self._layer = layer self._sharding_optimizers = [sharding_optimizer] if not isinstance( sharding_optimizer, list) else sharding_optimizer + assert all( + list( + map(lambda opt: isinstance(opt, ShardingOptimizerStage2), + self._sharding_optimizers)) + ), "Please use ShardingOptimizerStage2 optimizer" self._sync_buffers = sync_buffers self._auto_refresh_trainable = auto_refresh_trainable @@ -88,8 +94,7 @@ def __init__( # Global statistical parameters self._all_params = list( - chain( - * [optim.local_params for optim in self._sharding_optimizers])) + chain(*[optim.local_params for optim in self._sharding_optimizers])) self._trainable_params = [] self._grad_reduced = [] self._trainable_param2rank = {} @@ -436,7 +441,7 @@ def _setup_use_grad_storage(self): ._fill)) self._grad_storage_list = list( - chain(* [ + chain(*[ self._grad_storages[dtype].values() for dtype in self._grad_storages.keys() ])) diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index bc62d18c860226..05008a3bc12f7e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -24,7 +24,6 @@ from paddle.distributed import fleet from paddle.fluid.dygraph import nn -from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import DygraphShardingOptimizer from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 @@ -70,7 +69,7 @@ def __reader__(): return __reader__ -def optimizer_setting(model, use_pure_fp16, stage=1): +def optimizer_setting(model, use_pure_fp16): clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), @@ -87,20 +86,16 @@ def train_mlp(model, use_pure_fp16=False, all_test=False, accumulate_grad=False): - if sharding_stage == 1: + if sharding_stage == "dp": hcg = fleet.get_hybrid_communicate_group() group = hcg.get_check_parallel_group() else: group = paddle.distributed.new_group([0, 1]) - optimizer = optimizer_setting( - model=model, use_pure_fp16=use_pure_fp16, stage=sharding_stage) + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) if use_pure_fp16: - model, optimizer = paddle.amp.decorate( - models=model, - optimizers=optimizer, - level='O2', - save_dtype='float32') + 
model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') if sharding_stage == 2: optimizer = ShardingOptimizerStage2( @@ -164,7 +159,7 @@ def train_mlp(model, return model.parameters() -def test_stage1_stage2(): +def test_dp_stage2(): mlp = MLP() state_dict = mlp.state_dict() mlp1 = MLP() @@ -175,11 +170,13 @@ def test_stage1_stage2(): mlp2.set_state_dict(state_dict) mlp3.set_state_dict(state_dict) mlp4.set_state_dict(state_dict) - stage1_params = train_mlp(mlp, sharding_stage=1, use_pure_fp16=False) - stage2_params = train_mlp(mlp, sharding_stage=2, use_pure_fp16=False) - for i in range(len(stage1_params)): - np.testing.assert_allclose( - stage1_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6) + dp_params = train_mlp(mlp1, sharding_stage="dp", use_pure_fp16=False) + stage2_params = train_mlp(mlp2, sharding_stage=2, use_pure_fp16=False) + for i in range(len(dp_params)): + for j in range(len(stage2_params)): + if dp_params[i].name == stage2_params[j].name: + np.testing.assert_allclose( + dp_params[i].numpy(), stage2_params[j].numpy(), rtol=1e-6) stage2_params = train_mlp( mlp3, sharding_stage=2, use_pure_fp16=True, all_test=True) @@ -201,4 +198,4 @@ def test_stage1_stage2(): if __name__ == '__main__': - test_stage1_stage2() + test_dp_stage2() From c7cb7eecdd6bd4514901f6e92df10b6dee1db896 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 7 Dec 2021 08:45:53 +0800 Subject: [PATCH 084/124] [fleet_executor] fix python gil problem (#37882) --- paddle/fluid/pybind/bind_fleet_executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 40d325fae458ff..6fc9b2a494f619 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -33,7 +33,8 @@ void BindFleetExecutor(py::module* m) { py::class_(*m, "FleetExecutor") .def(py::init()) .def("init", &FleetExecutor::Init) - .def("run", &FleetExecutor::Run); + .def("run", &FleetExecutor::Run, + py::call_guard()); py::class_(*m, "TaskNode") .def(py::init()) From 2b479e17745d37ebc1a2d519c9f0f720602da23a Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 7 Dec 2021 09:58:35 +0800 Subject: [PATCH 085/124] fix import error of GlooParallelContext (#37892) --- paddle/fluid/imperative/heter_ccl_context.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/imperative/heter_ccl_context.cc b/paddle/fluid/imperative/heter_ccl_context.cc index a62c1da7815979..896f29fdd0c256 100644 --- a/paddle/fluid/imperative/heter_ccl_context.cc +++ b/paddle/fluid/imperative/heter_ccl_context.cc @@ -92,8 +92,10 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy, inter_strategy_.local_rank_ = inter_rank; inter_strategy_.current_endpoint_ = strategy_.current_endpoint_; inter_strategy_.trainer_endpoints_ = inter_endpoints; +#ifdef PADDLE_WITH_GLOO inter_parallel_ctx_ = std::make_shared( inter_strategy_, platform::CPUPlace()); +#endif } VLOG(0) << "init inter size " << inter_endpoints.size() << " rank " From 27d1f811d5bbed188ef40b049965b90ad2129f2f Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 7 Dec 2021 10:03:09 +0800 Subject: [PATCH 086/124] Fix cflags D_GLIBCXX_USE_CXX11_ABI takes no effect problem in customized op (#37878) --- python/paddle/utils/cpp_extension/cpp_extension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 
5370de9ed42aa5..3a7804d9012dd0 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -440,7 +440,7 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, # so we add this flag to ensure the symbol names from user compiled # shared library have same ABI suffix with core_(no)avx.so. # See https://stackoverflow.com/questions/34571583/understanding-gcc-5s-glibcxx-use-cxx11-abi-or-the-new-abi - add_compile_flag(['-D_GLIBCXX_USE_CXX11_ABI=1'], cflags) + add_compile_flag(cflags, ['-D_GLIBCXX_USE_CXX11_ABI=1']) # Append this macor only when jointly compiling .cc with .cu if not is_cuda_file(src) and self.contain_cuda_file: if core.is_compiled_with_rocm(): From de874cdd5c35ac278dccfb11d365239500875112 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 7 Dec 2021 10:20:19 +0800 Subject: [PATCH 087/124] Enabled generation for special operators, the GradNode/Inputs/Outputs of which are empty (#37837) --- .../auto_code_generator/eager_generator.cc | 343 ++++++++++-------- .../eager/auto_code_generator/op_list.txt | 1 - 2 files changed, 200 insertions(+), 144 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 0d66d8d96a9b49..b3657a9894f82b 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include #include @@ -27,69 +26,21 @@ #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" -DEFINE_bool(generate_all, false, - "Generate all operators currently registered in Paddle"); +namespace paddle { +namespace framework { static std::unordered_map operators_with_attrs = {}; static std::unordered_set operators_to_skip = { - "pull_sparse", "pull_box_extended_sparse", "pull_sparse_v2", - "pull_box_sparse", "fused_attention", "diag_v2", - "c_split"}; + "chunk_eval", // Stupid tensor name + "minus", "pull_sparse", "pull_box_extended_sparse", + "pull_sparse_v2", "pull_box_sparse", "fused_attention", + "diag_v2", "c_split"}; static std::unordered_set operators_to_codegen = {}; static std::unordered_set skipped_operators = {}; -static void PrepareAttrMapForOps() { - // Handle "fused_elemwise_add_activation" - std::vector functor_list = {"a", "b"}; - operators_with_attrs["fused_elemwise_add_activation"] = {}; - operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = - functor_list; - - // Handle "fused_elemwise_activation" - operators_with_attrs["fused_elemwise_activation"] = {}; - operators_with_attrs["fused_elemwise_activation"]["functor_list"] = - functor_list; - - // Handle "reverse" - std::vector axis = {0}; - operators_with_attrs["reverse"] = {}; - operators_with_attrs["reverse"]["axis"] = axis; - - // Handle "flip" - operators_with_attrs["flip"] = {}; - operators_with_attrs["flip"]["axis"] = axis; - - // Handle "cast" - operators_with_attrs["cast"] = {}; - operators_with_attrs["cast"]["out_dtype"] = 5; - operators_with_attrs["cast"]["in_dtype"] = 5; - - // Handle "transfer_dtype" - operators_with_attrs["transfer_dtype"] = {}; - operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; - operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; -} - -static void CollectOperatorsToCodeGen(const std::string& op_list_path) { - std::string line; - std::ifstream 
op_list_file(op_list_path); - if (op_list_file.is_open()) { - while (getline(op_list_file, line)) { - operators_to_codegen.insert(line); - } - op_list_file.close(); - } else { - PADDLE_THROW( - paddle::platform::errors::Fatal("Unable to open op_list.txt file")); - } -} - -namespace paddle { -namespace framework { - static std::string AttrTypeToString(const proto::AttrType& type) { std::string ret; switch (type) { @@ -392,10 +343,7 @@ static bool CheckOpProto(proto::OpProto* op_proto) { // Only handle matmul_v2 for now VLOG(1) << "------ Analyzing Op ------: " << op_type; - if (!FLAGS_generate_all) { - if (!operators_to_codegen.count(op_type)) return false; - } - + if (!operators_to_codegen.count(op_type)) return false; if (operators_to_skip.count(op_type)) return false; return true; @@ -404,21 +352,12 @@ static bool CheckOpProto(proto::OpProto* op_proto) { /* --------------------------------------- */ /* --------- Preprocess Ins/Outs --------- */ /* --------------------------------------- */ -static void PurifyOpProto( +static void PurifyForwardOpProto( const proto::OpProto& op_proto, std::unordered_map* fwd_inputs_name_pos_map, std::unordered_map* fwd_outputs_name_pos_map, - std::map* grad_outs_slotname_map, - std::map* grad_ins_fwd_slotname_map, - std::map* grad_ins_grad_slotname_map, std::vector* in_vars, - std::vector* out_vars, - std::map>>* - grad_ins, - std::map>>* - grad_outs) { + std::vector* out_vars) { // Op Name const std::string op_name = op_proto.type(); @@ -440,6 +379,72 @@ static void PurifyOpProto( } } in_vars->erase(iter); + } + } + } + + for (const proto::OpProto::Var& output : op_proto.outputs()) { + std::string output_name = output.name(); + + // Delete dispensable tensor unless specified in op_outs_map + if (output.dispensable()) { + if (!op_outs_map.count(op_name) || + !op_outs_map[op_name].count(output_name)) { + VLOG(6) << "Removing Dispensable Output: " << output_name; + + // out_vars + auto iter = out_vars->begin(); + for (iter = out_vars->begin(); iter != out_vars->end(); iter++) { + if (iter->name() == output_name) { + break; + } + } + out_vars->erase(iter); + } + } + } + + /* ------ Maping forward slot name to fwd position ------ */ + size_t in_pos = 0; + for (const auto& var : *in_vars) { + VLOG(6) << "Mapping input tensor: " << var.name() + << " To position: " << in_pos; + (*fwd_inputs_name_pos_map)[var.name()] = in_pos; + in_pos++; + } + + size_t out_pos = 0; + for (const auto& var : *out_vars) { + VLOG(6) << "Mapping output tensor: " << var.name() + << " To position: " << out_pos; + (*fwd_outputs_name_pos_map)[var.name()] = out_pos; + out_pos++; + } +} + +static void PurifyGradOpProto( + const proto::OpProto& op_proto, + std::map* grad_outs_slotname_map, + std::map* grad_ins_fwd_slotname_map, + std::map* grad_ins_grad_slotname_map, + std::map>>* + grad_ins, + std::map>>* + grad_outs) { + // Op Name + const std::string op_name = op_proto.type(); + + // Handle dispensable inputs + for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::string input_name = input.name(); + + // Delete dispensable tensor unless specified in op_ins_map + if (input.dispensable()) { + if (!op_ins_map.count(op_name) || + !op_ins_map[op_name].count(input_name)) { + VLOG(6) << "Removing Dispensable Input: " << input_name; // grad_outs_slotname_map auto grad_outs_slotname_map_purified = *grad_outs_slotname_map; @@ -478,15 +483,6 @@ static void PurifyOpProto( !op_outs_map[op_name].count(output_name)) { VLOG(6) << "Removing Dispensable Output: " << output_name; - // out_vars 
- auto iter = out_vars->begin(); - for (iter = out_vars->begin(); iter != out_vars->end(); iter++) { - if (iter->name() == output_name) { - break; - } - } - out_vars->erase(iter); - // grad_ins_grad_slotname_map auto grad_ins_grad_slotname_map_purified = *grad_ins_grad_slotname_map; for (const auto& iter : *grad_ins_grad_slotname_map) { @@ -514,52 +510,40 @@ static void PurifyOpProto( } } } - - /* ------ Maping forward slot name to fwd position ------ */ - size_t in_pos = 0; - for (const auto& var : *in_vars) { - VLOG(6) << "Mapping input tensor: " << var.name() - << " To position: " << in_pos; - (*fwd_inputs_name_pos_map)[var.name()] = in_pos; - in_pos++; - } - - size_t out_pos = 0; - for (const auto& var : *out_vars) { - VLOG(6) << "Mapping output tensor: " << var.name() - << " To position: " << out_pos; - (*fwd_outputs_name_pos_map)[var.name()] = out_pos; - out_pos++; - } } /* -------------------------------- */ /* --------- Collect Info --------- */ /* -------------------------------- */ -static bool CollectInformationFromOpInfo( +static void CollectForwardInformationFromOpInfo( const paddle::framework::OpInfo& op_info, - std::vector* grad_op_types, - std::map* grad_outs_slotname_map, - std::map* grad_ins_fwd_slotname_map, - std::map* grad_ins_grad_slotname_map, std::vector* in_vars, - std::vector* out_vars, - std::map>>* - grad_ins, - std::map>>* - grad_outs) { + std::vector* out_vars) { const proto::OpProto& op_proto = *op_info.proto_; - const std::string& op_type = op_proto.type(); - std::vector dims = {1, 1, 1, 1}; - for (const proto::OpProto::Var& input : op_proto.inputs()) { in_vars->push_back(input); } for (const proto::OpProto::Var& output : op_proto.outputs()) { out_vars->push_back(output); } +} + +static bool CollectGradInformationFromOpInfo( + const paddle::framework::OpInfo& op_info, bool* generate_forward_only, + std::vector* grad_op_types, // grad + std::map* grad_outs_slotname_map, // grad + std::map* grad_ins_fwd_slotname_map, // grad + std::map* grad_ins_grad_slotname_map, // grad + std::map>>* + grad_ins, // grad + std::map>>* + grad_outs // grad + ) { + const proto::OpProto& op_proto = *op_info.proto_; + const std::string& op_type = op_proto.type(); + std::vector dims = {1, 1, 1, 1}; /* ------ Prepare "ins" ------ */ std::mapsize() > 1) { // Backward attributes can be super complicated VLOG(6) << "Skip GradOpNode with multiple OpBases for now: " << op_type; skipped_operators.insert(op_type); return false; } + */ VLOG(6) << "Prepared GradOpNode"; @@ -901,6 +885,7 @@ static std::string GenerateGradNodeCreationContent( /* --------- CodeGen: Forward ----- */ /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( + bool generate_forward_only, const std::unordered_map& fwd_inputs_name_pos_map, const std::unordered_map& fwd_outputs_name_pos_map, const std::map& grad_ins_fwd_slotname_map, @@ -1044,7 +1029,6 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Attrs dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; - generated_function_body += "\n"; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = @@ -1092,16 +1076,18 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Converted Output VarBase to EagerTensor(s)"; // [Generation] ComputeRequireGrad -> GradNodeCreation - std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, - grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); - 
generated_function_body += grad_node_creation_body_str; - generated_function_body += "\n"; - VLOG(6) << "Generated GradNode Creation codes"; + if (!generate_forward_only) { + std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( + fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, + grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); + generated_function_body += grad_node_creation_body_str; + generated_function_body += "\n"; + VLOG(6) << "Generated GradNode Creation codes"; + } // [Generation] Handle return: Tuple/Vector/Tensor generated_function_body += "\n"; - std::string return_str; + std::string return_str = ""; std::string return_type_str = ""; std::string function_proto_return_type_str = ""; if (return_contents.size() > 1) { @@ -1124,14 +1110,20 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_FUNCTION_PROTO_RETURN_TEMPLATE = "std::tuple<%s>"; function_proto_return_type_str = paddle::string::Sprintf( FWD_FUNCTION_PROTO_RETURN_TEMPLATE, return_type_str); - } else { + + } else if (return_contents.size() == 1) { // Return vector or Tensor return_type_str = return_types[0]; const char* FWD_TENSOR_RETURN_TEMPLATE = " return %s;"; return_str = paddle::string::Sprintf(FWD_TENSOR_RETURN_TEMPLATE, return_contents[0]); function_proto_return_type_str = return_type_str; + + } else { + return_str = "return nullptr;"; + function_proto_return_type_str = "void*"; } + generated_function_body += return_str; generated_function_body += "\n"; VLOG(6) << "Generated return codes"; @@ -1139,6 +1131,11 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Full Function std::string function_name = op_type + "_dygraph_function"; + if (dygraph_function_args_str.size() > 0) { + auto iter = dygraph_function_args_str.begin(); + if ((*iter) == ',') dygraph_function_args_str.erase(iter); + } + const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, @@ -1601,11 +1598,11 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* ---- Collect Information ---- */ /* ----------------------------- */ std::vector grad_op_types; + std::vector in_vars; + std::vector out_vars; std::map grad_outs_slotname_map; std::map grad_ins_fwd_slotname_map; std::map grad_ins_grad_slotname_map; - std::vector in_vars; - std::vector out_vars; std::map>> grad_ins; @@ -1614,20 +1611,31 @@ static void DygraphCodeGeneration(const std::string& output_dir) { grad_outs; VLOG(6) << "-------- CollectInformationFromOpInfo -------"; - bool is_available = CollectInformationFromOpInfo( - op_info, &grad_op_types, &grad_outs_slotname_map, - &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, &in_vars, - &out_vars, &grad_ins, &grad_outs); - if (!is_available) continue; + CollectForwardInformationFromOpInfo(op_info, &in_vars, &out_vars); + + bool generate_forward_only = false; + bool is_available = CollectGradInformationFromOpInfo( + op_info, &generate_forward_only, &grad_op_types, + &grad_outs_slotname_map, &grad_ins_fwd_slotname_map, + &grad_ins_grad_slotname_map, &grad_ins, &grad_outs); + + if (!is_available && !generate_forward_only) { + VLOG(6) << "Skipped operator: " << op_type; + continue; + } VLOG(6) << "-------- PurifyOpProto -------"; std::unordered_map fwd_inputs_name_pos_map; std::unordered_map fwd_outputs_name_pos_map; - PurifyOpProto(*op_proto, &fwd_inputs_name_pos_map, - &fwd_outputs_name_pos_map, &grad_outs_slotname_map, - 
&grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, - &in_vars, &out_vars, &grad_ins, &grad_outs); + PurifyForwardOpProto(*op_proto, &fwd_inputs_name_pos_map, + &fwd_outputs_name_pos_map, &in_vars, &out_vars); + + if (!generate_forward_only) { + PurifyGradOpProto(*op_proto, &grad_outs_slotname_map, + &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, + &grad_ins, &grad_outs); + } /* --------------------------- */ /* --------- CodeGen --------- */ @@ -1636,16 +1644,19 @@ static void DygraphCodeGeneration(const std::string& output_dir) { VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = GenerateForwardFunctionContents( - fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, - grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, - grad_outs_slotname_map, grad_ins, grad_outs, op_type, in_vars, - out_vars); + generate_forward_only, fwd_inputs_name_pos_map, + fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, + grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, + grad_outs, op_type, in_vars, out_vars); + fwd_function_str += body_and_declaration.first + "\n"; /* ---- dygraph_forward_api.h ---- */ std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; + if (generate_forward_only) continue; + /* ---- nodes.h ---- */ VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; grad_node_h_str += @@ -1681,6 +1692,52 @@ static void DygraphCodeGeneration(const std::string& output_dir) { GenerateNodeCCFile(output_dir, grad_node_cc_str); } +static void PrepareAttrMapForOps() { + // Handle "fused_elemwise_add_activation" + std::vector functor_list = {"a", "b"}; + operators_with_attrs["fused_elemwise_add_activation"] = {}; + operators_with_attrs["fused_elemwise_add_activation"]["functor_list"] = + functor_list; + + // Handle "fused_elemwise_activation" + operators_with_attrs["fused_elemwise_activation"] = {}; + operators_with_attrs["fused_elemwise_activation"]["functor_list"] = + functor_list; + + // Handle "reverse" + std::vector axis = {0}; + operators_with_attrs["reverse"] = {}; + operators_with_attrs["reverse"]["axis"] = axis; + + // Handle "flip" + operators_with_attrs["flip"] = {}; + operators_with_attrs["flip"]["axis"] = axis; + + // Handle "cast" + operators_with_attrs["cast"] = {}; + operators_with_attrs["cast"]["out_dtype"] = 5; + operators_with_attrs["cast"]["in_dtype"] = 5; + + // Handle "transfer_dtype" + operators_with_attrs["transfer_dtype"] = {}; + operators_with_attrs["transfer_dtype"]["out_dtype"] = 5; + operators_with_attrs["transfer_dtype"]["in_dtype"] = 5; +} + +static void CollectOperatorsToCodeGen(const std::string& op_list_path) { + std::string line; + std::ifstream op_list_file(op_list_path); + if (op_list_file.is_open()) { + while (getline(op_list_file, line)) { + operators_to_codegen.insert(line); + } + op_list_file.close(); + } else { + PADDLE_THROW( + paddle::platform::errors::Fatal("Unable to open op_list.txt file")); + } +} + } // namespace framework } // namespace paddle @@ -1693,8 +1750,8 @@ int main(int argc, char* argv[]) { std::string eager_root = argv[1]; std::string op_list_path = argv[2]; - CollectOperatorsToCodeGen(op_list_path); - PrepareAttrMapForOps(); + paddle::framework::CollectOperatorsToCodeGen(op_list_path); + paddle::framework::PrepareAttrMapForOps(); paddle::framework::DygraphCodeGeneration(eager_root); diff --git a/paddle/fluid/eager/auto_code_generator/op_list.txt b/paddle/fluid/eager/auto_code_generator/op_list.txt index 
6bfba753633f33..2456a7a1846d1e 100644 --- a/paddle/fluid/eager/auto_code_generator/op_list.txt +++ b/paddle/fluid/eager/auto_code_generator/op_list.txt @@ -215,7 +215,6 @@ spp floor gelu retinanet_detection_output -minus push_dense silu sequence_erase From c9a3c6696dec5aaefb5952bb6238035c5ccb4fef Mon Sep 17 00:00:00 2001 From: jianghaicheng Date: Tue, 7 Dec 2021 10:32:49 +0800 Subject: [PATCH 088/124] add ipu device p1 (#37841) --- paddle/fluid/framework/garbage_collector.cc | 9 +++++ paddle/fluid/framework/garbage_collector.h | 10 +++++ paddle/fluid/framework/library_type.h | 4 +- paddle/fluid/memory/allocation/CMakeLists.txt | 2 + .../memory/allocation/allocator_facade.cc | 38 +++++++++++++++++++ paddle/fluid/memory/memcpy.cc | 26 +++++++++++++ paddle/fluid/platform/CMakeLists.txt | 8 +++- .../fluid/platform/device/ipu/CMakeLists.txt | 20 ++++++---- paddle/fluid/platform/device/ipu/device.cc | 2 +- paddle/fluid/platform/device/ipu/ipu_info.cc | 32 ++++++++++++++++ paddle/fluid/platform/device/ipu/ipu_info.h | 24 ++++++++++++ .../platform/device/ipu/ipu_optimizer.cc | 2 +- paddle/fluid/platform/device_context.h | 32 +++++++++++++++- paddle/fluid/platform/init.cc | 16 ++++++++ paddle/fluid/pybind/CMakeLists.txt | 3 ++ 15 files changed, 214 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/platform/device/ipu/ipu_info.cc create mode 100644 paddle/fluid/platform/device/ipu/ipu_info.h diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 8b6a5747dbfced..06d1ef84c19559 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -53,6 +53,15 @@ void XPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif +#ifdef PADDLE_WITH_IPU +IPUGarbageCollector::IPUGarbageCollector(const platform::IPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} +void IPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 2c2b57bbe420a8..0cfeda37c222e7 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -80,6 +80,16 @@ class XPUGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_IPU +class IPUGarbageCollector : public GarbageCollector { + public: + IPUGarbageCollector(const platform::IPUPlace &place, size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 8fe314cf5f18c5..f7539aa4859578 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("IPU")) { + return LibraryType::kPlain; } else if (s == std::string("NPU")) { return LibraryType::kPlain; } else if (s == std::string("CUDA")) { @@ -68,7 +70,7 @@ inline LibraryType 
StringToLibraryType(const char* ctype) { } else { PADDLE_THROW(platform::errors::Unimplemented( "Unknown LibraryType string (%s), only support library type string " - "include PLAIN, MKLDNN, CUDNN, CPU and CUDA.", + "include PLAIN, MKLDNN, CUDNN, CPU, CUDA and IPU.", s.c_str())); } } diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 4d44c533b7456f..b3351f44dc35a4 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -48,6 +48,8 @@ if (WITH_GPU OR WITH_ROCM) endif() elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_IPU) + set(AllocatorFacadeDeps ipu_info) elseif(WITH_ASCEND) set(AllocatorFacadeDeps ascend_npu_info) else () diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 8314a1df931cac..13cd980881bd54 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -51,6 +51,10 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -136,6 +140,11 @@ class AllocatorFacadePrivate { switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator) { LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " @@ -186,6 +195,11 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } +#endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } #endif break; } @@ -197,6 +211,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator) { LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " @@ -570,6 +589,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_IPU + void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + #ifdef PADDLE_WITH_ASCEND_CL void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); @@ -591,6 +616,13 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::IPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); @@ -625,6 +657,12 @@ class AllocatorFacadePrivate { places.emplace_back(platform::NPUPlace(dev_id)); } #endif +#ifdef 
PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::IPUPlace(dev_id)); + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 574b1520543993..fe38200efa8e24 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -33,6 +33,32 @@ void Copy(platform::CPUPlace, void* dst, VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); } +#ifdef PADDLE_WITH_IPU +template <> +void Copy(platform::IPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::IPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +template <> +void Copy(platform::IPUPlace dst_place, + void* dst, + platform::IPUPlace src_place, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} +#endif #ifdef PADDLE_WITH_XPU template <> diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 4f3c70f5ea0505..d8d41e9d9185ac 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -71,6 +71,12 @@ IF(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) ENDIF() +IF(WITH_IPU) + set(IPU_CTX_DEPS ipu_backend) +ELSE() + set(IPU_CTX_DEPS) +ENDIF(WITH_IPU) + IF(WITH_ASCEND_CL) set(NPU_CTX_DEPS npu_stream npu_info) ENDIF() @@ -109,7 +115,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index c4595e22d6cd24..25629ba74d9152 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,8 +1,12 @@ -cc_library(ipu_device SRCS device.cc DEPS enforce popart) -cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) -cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) -cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce) -cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto) -cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils) -cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper) -cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper) +# IPU 
+IF(WITH_IPU) + cc_library(ipu_device SRCS device.cc DEPS enforce popart) + cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) + cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) + cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce) + cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto) + cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils) + cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper) + cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper) + cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend) +ENDIF() diff --git a/paddle/fluid/platform/device/ipu/device.cc b/paddle/fluid/platform/device/ipu/device.cc index 4aa9ab56d92f80..47e6475089d3f3 100644 --- a/paddle/fluid/platform/device/ipu/device.cc +++ b/paddle/fluid/platform/device/ipu/device.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/ipu/device.h" +#include "paddle/fluid/platform/device/ipu/device.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc new file mode 100644 index 00000000000000..c184149a9d38d0 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace platform { + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedIPUDevices() { + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetDeviceIds(); +} + +//! Get the total number of IPU devices in system. +int GetIPUDeviceCount() { + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetNumDevices(); +} +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_info.h b/paddle/fluid/platform/device/ipu/ipu_info.h new file mode 100644 index 00000000000000..3d032eeb4bfc16 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/ipu_info.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_IPU +#include +#include +#include "glog/logging.h" + +namespace paddle { +namespace platform { +std::vector GetSelectedIPUDevices(); +int GetIPUDeviceCount(); +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/ipu/ipu_optimizer.cc b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc index ea8ae8e1f026a3..92bb2ca3afcf88 100644 --- a/paddle/fluid/platform/device/ipu/ipu_optimizer.cc +++ b/paddle/fluid/platform/device/ipu/ipu_optimizer.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/ipu/ipu_optimizer.h" +#include "paddle/fluid/platform/device/ipu/ipu_optimizer.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 552d8f1a8c4ffb..875132dfe89c4b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -62,6 +62,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/npu/enforce_npu.h" #include "paddle/fluid/platform/device/npu/npu_stream.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/device.h" +#endif #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -99,8 +102,8 @@ enum DeviceType { CUDA = 1, XPU = 2, NPU = 3, - - MAX_DEVICE_TYPES = 4, + IPU = 4, + MAX_DEVICE_TYPES = 5, }; DeviceType Place2DeviceType(const platform::Place& place); @@ -109,6 +112,7 @@ constexpr DeviceType kCPU = DeviceType::CPU; constexpr DeviceType kCUDA = DeviceType::CUDA; constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kNPU = DeviceType::NPU; +constexpr DeviceType kIPU = DeviceType::IPU; class DeviceContext { public: @@ -140,6 +144,30 @@ struct DefaultDeviceContextType { using TYPE = CPUDeviceContext; }; +// Graphcore IPU +#ifdef PADDLE_WITH_IPU +class IPUDeviceContext : public DeviceContext { + public: + IPUDeviceContext() = delete; + explicit IPUDeviceContext(IPUPlace place); + virtual ~IPUDeviceContext(); + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + Place GetPlace() const override; + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + int DeviceId() const { return device_.getId(); } + + private: + IPUPlace place_; + platform::ipu::Device device_; +}; +template <> +struct DefaultDeviceContextType { + using TYPE = IPUDeviceContext; +}; + +#endif + #ifdef PADDLE_WITH_XPU namespace xpu = baidu::xpu::api; class XPUDeviceContext : public DeviceContext { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 698563a53d2558..b642f160da21a5 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -45,6 +45,10 @@ limitations under the License. 
*/ #include "DbgHelp.h" #endif +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( multiple_of_cupti_buffer_size, 1, @@ -164,6 +168,15 @@ void InitDevices() { LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_IPU + try { + // use user specified IPUs. + devices = platform::GetSelectedIPUDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) + << "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime."; + } #endif InitDevices(devices); } @@ -185,6 +198,9 @@ void InitDevices(const std::vector devices) { #ifdef PADDLE_WITH_XPU places.emplace_back(platform::XPUPlace(devices[i])); #endif +#ifdef PADDLE_WITH_IPU + places.emplace_back(platform::IPUPlace(devices[i])); +#endif #ifdef PADDLE_WITH_ASCEND_CL places.emplace_back(platform::NPUPlace(devices[i])); #endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 521ca032a50ddb..4f896f852ffd6c 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -16,6 +16,9 @@ endif() if (WITH_GPU) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_profiler) endif() +if (WITH_IPU) + set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info) +endif() if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) From 2bd0f3c7c56c50674835659670d1cb45f43737ec Mon Sep 17 00:00:00 2001 From: Zuza Date: Tue, 7 Dec 2021 03:41:57 +0100 Subject: [PATCH 089/124] Quantize slice op (#37630) * quantize slice op * correct test * fix code formatting --- .../framework/ir/graph_pattern_detector.cc | 22 ++- .../framework/ir/graph_pattern_detector.h | 14 ++ .../framework/ir/mkldnn/cpu_quantize_pass.cc | 52 ++++++ .../framework/ir/mkldnn/cpu_quantize_pass.h | 1 + .../ir/mkldnn/cpu_quantize_pass_tester.cc | 111 +++++++++++++ .../fluid/inference/api/mkldnn_quantizer.cc | 10 ++ .../inference/api/mkldnn_quantizer_config.cc | 3 + .../fluid/inference/tests/api/CMakeLists.txt | 19 ++- .../tests/api/analyzer_ernie_int8_tester.cc | 54 +++++++ .../tests/api/analyzer_ernie_tester.cc | 135 +--------------- .../tests/api/analyzer_ernie_tester.h | 152 ++++++++++++++++++ .../fluid/operators/mkldnn/slice_mkldnn_op.cc | 2 + paddle/fluid/operators/slice_op.cc | 2 +- .../quantization/quant2_int8_mkldnn_pass.py | 9 +- .../fluid/contrib/slim/tests/CMakeLists.txt | 2 +- 15 files changed, 450 insertions(+), 138 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_ernie_tester.h diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index dd0ffe8b9fd0d4..5334b08248992b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1619,6 +1619,26 @@ PDNode *patterns::Reshape::operator()() { return reshape_out; } +PDNode *patterns::Slice::operator()() { + auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); + + auto slice_op = pattern->NewNode(slice_op_repr())->assert_is_op("slice"); + + auto slice_in = pattern->NewNode(slice_in_repr()) + ->AsInput() + ->assert_is_op_input("slice", "Input"); + auto slice_out = pattern->NewNode(slice_out_repr()) + ->AsOutput() + ->assert_is_op_output("slice", "Out"); + + auto next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + prev_op->LinksTo({slice_in}); + 
slice_op->LinksFrom({slice_in}).LinksTo({slice_out}); + next_op->LinksFrom({slice_out}); + return slice_out; +} + PDNode *patterns::Matmul::operator()() { auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul"); @@ -2315,7 +2335,7 @@ PDNode *patterns::QuantizePlacement::operator()( std::unordered_set({"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", "prior_box", "reshape2", "transpose2", "fusion_gru", - "fusion_lstm", "multi_gru"}); + "fusion_lstm", "multi_gru", "slice"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d7bfdc57d1c7ed..fa8504d074a884 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -980,6 +980,20 @@ struct Reshape : public PatternBase { PATTERN_DECL_NODE(reshape_out); PATTERN_DECL_NODE(next_op); }; +// Slice op +// Forward pass for slice. +// slice_out is a result of the operator. +struct Slice : public PatternBase { + Slice(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "slice") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(slice_in); + PATTERN_DECL_NODE(slice_op); + PATTERN_DECL_NODE(slice_out); + PATTERN_DECL_NODE(next_op); +}; // Matmul op // Forward pass for matmul. diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 2bf8a3b64f0a78..3df4a844705242 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -676,6 +676,57 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const { PrettyLogDetail("--- quantized %d reshape ops", quantize_reshape_count); } +void CPUQuantizePass::QuantizeSlice(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::Slice slice_pattern{pattern, name_scope_}; + slice_pattern(); + + int quantize_slice_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize slice op"; + GET_IR_NODE_FROM_SUBGRAPH(slice_op, slice_op, slice_pattern); + + // skip if should not be quantized + if (!platform::HasOpINT8DataType(slice_op->Op())) { + LogQuantizationDisabled(slice_op); + return; + } + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, slice_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, slice_pattern); + + // skip if prev op and next op is not quantized + if (!IsOpDequantized(prev_op) && !IsOpQuantized(next_op)) { + return; + } + GET_IR_NODE_FROM_SUBGRAPH(slice_in, slice_in, slice_pattern); + GET_IR_NODE_FROM_SUBGRAPH(slice_out, slice_out, slice_pattern); + + if (!AreScalesPresentForNodes({slice_out})) { + LogCannotQuantizeOp(slice_op); + return; + } + + bool is_input_unsigned{false}; + auto input_scale = GetScaleValueForNode(slice_out, &is_input_unsigned); + QuantizeInput(g, slice_op, slice_in, "Input", input_scale, + is_input_unsigned); + + bool is_output_unsigned{false}; + auto output_scale = GetScaleValueForNode(slice_out, &is_output_unsigned); + DequantizeOutput(g, slice_op, slice_out, "Out", output_scale, + is_output_unsigned); + + ++quantize_slice_count; + }; + + gpd(graph, handler); + AddStatis(quantize_slice_count); + + PrettyLogDetail("--- quantized %d slice ops", quantize_slice_count); +} + void CPUQuantizePass::QuantizeMatmul(Graph* graph) 
const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -1024,6 +1075,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); + QuantizeSlice(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 18735633c0d69a..b3ee98263c0c0a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -61,6 +61,7 @@ class CPUQuantizePass : public FusePassBase { void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; + void QuantizeSlice(Graph* graph) const; void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, bool is_input_unsigned, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index b6a8de263aa2af..838912f659ff7c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -55,6 +55,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } else if (type == "slice") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); } else if (type == "dropout") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); @@ -784,6 +788,113 @@ TEST(CpuQuantizePass, reshapeBetweenNonQuantizedOp) { added_nodes_count, 2.0f * 127); } +static const std::initializer_list variable_names_slice = { + "a", "b", "c", "d"}; + +// a->Dequantize->b +// b->Slice->c +// c->Dropout->d +ProgramDesc BuildProgramDescSlice() { + ProgramDesc prog; + for (auto& v : variable_names_slice) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); + SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); + + return prog; +} + +// a->Transpose->b +// b->slice->c +// c->Dropout->d +ProgramDesc BuildProgramDescSliceBetweenNonQuantizedOp() { + ProgramDesc prog; + for (auto& v : variable_names_slice) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, "float32"); + SetOp(&prog, "slice", "Slice", {"b"}, {"c"}, true, "int8"); + SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, "float32"); + + return prog; +} + +void MainTestSlice(const ProgramDesc& prog, int transpose_count, + int slice_count, int quant_count, int dequant_count, + int added_nodes_count, float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names_slice, &original_nodes_num, + ¤t_nodes_num); + + float quant_scale = 1.0f; + float dequant_scale = 1.0f; + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int transpose_nodes_count = 0; + int slice_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "transpose2") { + transpose_nodes_count++; + } else if (op->Type() == "slice") { + slice_nodes_count++; + } else if (op->Type() == "quantize") { + 
quantize_nodes_count++; + quant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); + EXPECT_EQ(quant_scale, scale) << "Scale for node '" + op->Type() + "'."; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + auto op_name = op->GetAttrIfExists("name"); + VLOG(3) << op_name << "\n"; + if (op_name != "Dequantize1") { + dequant_scale = BOOST_GET_CONST(float, op->GetAttr("Scale")); + EXPECT_EQ(dequant_scale, scale) + << "Scale for node '" + op->Type() + "'."; + } + } + } + } + EXPECT_EQ(transpose_nodes_count, transpose_count); + EXPECT_EQ(slice_nodes_count, slice_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, slice) { + // a->Dequantize->b + // b2->Quant->b3->slice->c1->Dequant->c2 + // c2->Dropout->d + int slice_count = 1; + int transpose_count = 0; + int quant_count = 1; + int dequant_count = 2; + // 1 Quant + 1 IN + 1 DeQuant + 1 OUT + int added_nodes_count = 4; + MainTestSlice(BuildProgramDescSlice(), transpose_count, slice_count, + quant_count, dequant_count, added_nodes_count, 2.0f * 127); +} + +TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { + // a->Transpos2->b + // b->slice->c + // c->Dropout->d + int slice_count = 1; + int transpose_count = 1; + int quant_count = 0; + int dequant_count = 0; + // 0 Quant + 0 IN + 0 DeQuant + 0 OUT + int added_nodes_count = 0; + MainTestSlice(BuildProgramDescSliceBetweenNonQuantizedOp(), transpose_count, + slice_count, quant_count, dequant_count, added_nodes_count, + 2.0f * 127); +} + static const std::initializer_list variable_names_matmul = { "a", "b", "c", "d", "e", "f"}; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 654b58a2ded341..aa29b779e471b3 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -134,6 +134,16 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( scales_[var_name] = scales_[input_var_name]; } compute_scale = false; + } else if (op->Type() == "slice") { + auto input_var_name = op->Input("Input")[0]; + PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); + if (scales_.find(input_var_name) != scales_.end()) { + scales_[var_name] = scales_[input_var_name]; + } + compute_scale = false; } else if (op->Type() == "concat") { // output of ops with unsigned input must be unsigned is_unsigned = true; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index 5a07cc7e240d5e..6642a2c030b266 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -42,6 +42,9 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["transpose2"]["X"] = ScaleAlgo::KL; rules_["transpose2"]["Out"] = ScaleAlgo::NONE; + rules_["slice"]["Input"] = ScaleAlgo::KL; + rules_["slice"]["Out"] = ScaleAlgo::NONE; + rules_["fc"]["Input"] = ScaleAlgo::KL; rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T; rules_["fc"]["Bias"] = ScaleAlgo::NONE; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 6fd3944a6c5280..a28b0c172aff0e 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ 
b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -94,6 +94,17 @@ function(inference_analysis_api_test target install_dir filename) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt --refer_result=${install_dir}/result.txt) endfunction() +function(inference_analysis_api_int8_test target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model + --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt + --accuracy=0.8 + --batch_size=5 + --enable_int8=true) +endfunction() + function(inference_multiple_models_analysis_api_test target install_dir filename) inference_analysis_test(${target} SRCS ${filename} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -284,13 +295,14 @@ set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") download_model_and_data_without_verify(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc) -#Ernie +# Ernie set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) -#Ernie large +# Ernie large set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" edb2113eec93783cad56ed76d47ba57f) download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" 1facda98eef1085dc9d435ebf3f23a73) @@ -426,7 +438,7 @@ if(WITH_MKLDNN) # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) -# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) +# inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg19 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge @@ -730,6 +742,7 @@ set_tests_properties(test_analyzer_mobilenet_transpose PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ner PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc new file mode 100644 index 00000000000000..b85726647b548c --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_int8_tester.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/analyzer_ernie_tester.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +#ifdef PADDLE_WITH_MKLDNN +void SetInt8Config(AnalysisConfig *cfg, + std::vector data) { + cfg->SetModel(FLAGS_infer_model); + cfg->EnableMKLDNN(); + cfg->EnableMkldnnQuantizer(); + auto warmup_data = std::make_shared>(data); + cfg->mkldnn_quantizer_config()->SetWarmupData(warmup_data); + cfg->mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_batch_size); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare_int8(bool use_mkldnn = false) { + std::vector> inputs; + LoadInputData(&inputs); + + AnalysisConfig cfg; + SetInt8Config(&cfg, inputs[0]); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), inputs); +} + +TEST(Analyzer_ernie, compare_int8_mkldnn) { + compare_int8(true /* use_mkldnn */); +} +#endif + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc index 0c2a140023e293..d6ff3e422368bd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc @@ -12,142 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/tests/api/analyzer_ernie_tester.h" namespace paddle { namespace inference { using paddle::PaddleTensor; -template -void GetValueFromStream(std::stringstream *ss, T *t) { - (*ss) >> (*t); -} - -template <> -void GetValueFromStream(std::stringstream *ss, std::string *t) { - *t = ss->str(); -} - -// Split string to vector -template -void Split(const std::string &line, char sep, std::vector *v) { - std::stringstream ss; - T t; - for (auto c : line) { - if (c != sep) { - ss << c; - } else { - GetValueFromStream(&ss, &t); - v->push_back(std::move(t)); - ss.str({}); - ss.clear(); - } - } - - if (!ss.str().empty()) { - GetValueFromStream(&ss, &t); - v->push_back(std::move(t)); - ss.str({}); - ss.clear(); - } -} - -// Parse tensor from string -template -bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { - std::vector data; - Split(field, ':', &data); - if (data.size() < 2) return false; - - std::string shape_str = data[0]; - - std::vector shape; - Split(shape_str, ' ', &shape); - - std::string mat_str = data[1]; - - std::vector mat; - Split(mat_str, ' ', &mat); - - tensor->shape = shape; - auto size = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * - sizeof(T); - tensor->data.Resize(size); - std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); - tensor->dtype = GetPaddleDType(); - - return true; -} - -// Parse input tensors from string -bool ParseLine(const std::string &line, - std::vector *tensors) { - std::vector fields; - Split(line, ';', &fields); - - tensors->clear(); - tensors->reserve(4); - - int i = 0; - auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; - for (; i < 3; i++) { - paddle::PaddleTensor temp; - ParseTensor(fields[i], &temp); - temp.name = input_name + std::to_string(i); - tensors->push_back(temp); - } - - // input_mask - paddle::PaddleTensor input_mask; - ParseTensor(fields[i], &input_mask); - input_mask.name = input_name + std::to_string(i); - tensors->push_back(input_mask); - - return true; -} - -bool LoadInputData(std::vector> *inputs) { - if (FLAGS_infer_data.empty()) { - LOG(ERROR) << "please set input data path"; - return false; - } - - std::ifstream fin(FLAGS_infer_data); - std::string line; - int sample = 0; - - // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
- while (std::getline(fin, line)) { - std::vector feed_data; - ParseLine(line, &feed_data); - inputs->push_back(std::move(feed_data)); - sample++; - if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; - } - LOG(INFO) << "number of samples: " << sample; - return true; -} - -void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, - bool use_gpu = false) { - cfg->SetModel(FLAGS_infer_model); - if (use_mkldnn) { - cfg->EnableMKLDNN(); - } - if (use_gpu) { - cfg->EnableUseGpu(100, 0); - } else { - cfg->DisableGpu(); - } - cfg->SwitchSpecifyInputNames(); - cfg->SwitchIrOptim(); - cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); -} - void profile(bool use_mkldnn = false, bool use_gpu = false) { AnalysisConfig config; + SetConfig(&config, use_mkldnn, use_gpu); std::vector> outputs; @@ -189,11 +63,12 @@ TEST(Analyzer_Ernie, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig void compare(bool use_mkldnn = false) { + std::vector> inputs; + LoadInputData(&inputs); + AnalysisConfig cfg; SetConfig(&cfg, use_mkldnn, false); - std::vector> inputs; - LoadInputData(&inputs); CompareNativeAndAnalysis( reinterpret_cast(&cfg), inputs); } diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h new file mode 100644 index 00000000000000..dd3faac7592104 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h @@ -0,0 +1,152 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, + bool use_gpu = false) { + cfg->SetModel(FLAGS_infer_model); + if (use_mkldnn) { + cfg->EnableMKLDNN(); + } + if (use_gpu) { + cfg->EnableUseGpu(100, 0); + } else { + cfg->DisableGpu(); + } + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index d9bd843a9d0cf0..e5f70fa10e3751 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -227,6 +227,8 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(slice, MKLDNN, paddle::platform::CPUPlace, ops::SliceMKLDNNKernel, + ops::SliceMKLDNNKernel, + ops::SliceMKLDNNKernel, ops::SliceMKLDNNKernel); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index a5513ba648776c..4965e5e156c342 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -244,7 +244,7 @@ class SliceOpMaker : public framework::OpProtoAndCheckerMaker { "mkldnn_data_type", "(string, default \"float32\"). Data type of mkldnn kernel") .SetDefault("float32") - .InEnum({"float32", "bfloat16"}) + .InEnum({"float32", "int8", "bfloat16"}) .AsExtra(); AddComment(R"DOC( Slice Operator. 
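The Quant2 MKL-DNN pass change in the next diff mirrors the C++ pass above: slice is added to the scale-immutable ops, but unlike transpose2, reshape2 and pool2d it exposes its data tensor under the "Input" slot rather than "X". A minimal sketch of that slot selection, assuming only the graph-node API already used in the pass (the helper name is illustrative, not part of the patch):

    def scale_propagation_input_name(op):
        # 'slice' keeps its data tensor in the "Input" slot; the other
        # scale-immutable ops (transpose2, reshape2, pool2d) use "X".
        if op.name() == 'slice':
            return op.input("Input")[0]
        return op.input("X")[0]
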
diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 4c9c4058318a97..0627bf2123adbd 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -62,7 +62,9 @@ def __init__(self, self._ops_to_quantize = _ops_to_quantize self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set( [-1]) - self._scale_immutable_ops = ['transpose2', 'reshape2', 'pool2d'] + self._scale_immutable_ops = [ + 'transpose2', 'reshape2', 'pool2d', 'slice' + ] self._scale_ops = ['scale'] self._conv_ops = ['conv2d', 'depthwise_conv2d'] self._pool_ops = ['pool2d'] @@ -241,7 +243,10 @@ def _update_scales(graph): waiting_for_scale = set() for op in graph.all_op_nodes(): if op.name() in self._scale_immutable_ops: - input_name = op.input("X")[0] + if op.name() == 'slice': + input_name = op.input("Input")[0] + else: + input_name = op.input("X")[0] output_name = op.output("Out")[0] tensor_names = [input_name, output_name] diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 03503111fca9a6..94d7a2ed153488 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -253,7 +253,7 @@ if(LINUX AND WITH_MKLDNN) set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz") set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float") download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE} 114f38804a3ef8c45e7259e68bbd838b) - set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add") + set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add,slice") inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE}) # Quant2 GRU From 0372883e30b895fafcaf5460d87b1a6d54c44d3d Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 7 Dec 2021 10:42:25 +0800 Subject: [PATCH 090/124] Add ce framework dockerfile (#37762) --- tools/dockerfile/ci_dockerfile.sh | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index bb7bdfe46c29b8..48c2e70b014237 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -76,10 +76,40 @@ function make_cinn_dockerfile(){ sed -i "${dockerfile_line}i RUN pip install wheel \&\& pip3 install PyGithub wheel \&\& pip3.7 install PyGithub " ${dockerfile_name} } + +function make_ce_framework_dockcerfile(){ + dockerfile_name="Dockerfile.cuda11.2_cudnn8_gcc82_trt8" + sed "s//11.2.0-cudnn8-devel-ubuntu18.04/g" ./Dockerfile.ubuntu18 >${dockerfile_name} + dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') + sed -i "7i RUN chmod 777 /tmp" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ + tar -xvf git-2.17.1.tar.gz \&\& \ + cd git-2.17.1 \&\& \ 
+ ./configure --with-openssl --with-curl --prefix=/usr/local \&\& \ + make -j8 \&\& make install " ${dockerfile_name} + sed -i "${dockerfile_line}i RUN pip install wheel \&\& pip3 install PyGithub wheel \&\& pip3.7 install PyGithub " ${dockerfile_name} + sed -i "s##WORKDIR /usr/bin \\ + COPY tools/dockerfile/build_scripts /build_scripts \\ + RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \\ + RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\ + ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} + sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ + RUN dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ + RUN apt update \&\& apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} +} + + function main() { make_ubuntu_dockerfile make_centos_dockerfile make_cinn_dockerfile + make_ce_framework_dockcerfile } main "$@" From 508b756a37c9fb89a5d4cd09b63e1a46d338823d Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 7 Dec 2021 10:53:35 +0800 Subject: [PATCH 091/124] fix pyyaml dependence problem for api-gen (#37879) --- paddle/pten/api/lib/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index 189548880694d9..33f20a4df04135 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -24,6 +24,7 @@ set(api_source_file_tmp ${api_source_file}.tmp) add_custom_command( OUTPUT ${api_header_file} ${api_source_file} + COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} --api_header_path ${api_header_file_tmp} From 4035bd2b79441eaf6f7c7c8f13d650eed3acbac5 Mon Sep 17 00:00:00 2001 From: wenbin Date: Tue, 7 Dec 2021 10:55:28 +0800 Subject: [PATCH 092/124] don't exit if requested_size < size (#37880) don't exit if requested_size < size --- paddle/fluid/framework/tensor.cc | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 063ede6ffbf319..cbbc020989d1e9 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -62,14 +62,7 @@ void* Tensor::mutable_data(const platform::Place& place, "The Tensor's shape is [", dims(), "] now")); size_t size = numel() * SizeOfType(type); - if (requested_size) { - PADDLE_ENFORCE_GE( - requested_size, size, - platform::errors::InvalidArgument( - "The requested memory size is less than the memory size of Tensor. " - "But received requested memory size is %d, " - "memory size of Tensor is %d.", - requested_size, size)); + if (requested_size && (requested_size > size)) { size = requested_size; } /* some versions of boost::variant don't have operator!= */ From b31852967d3509fbf2ebe805ccb271341974f17a Mon Sep 17 00:00:00 2001 From: zmxdream Date: Tue, 7 Dec 2021 11:20:49 +0800 Subject: [PATCH 093/124] [heterps]fix heter service (#37860) * fix heter service. 
test=develop * fix heter section worker in debug mode --- paddle/fluid/distributed/service/heter_server.h | 4 ++-- paddle/fluid/framework/heter_section_worker.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/distributed/service/heter_server.h b/paddle/fluid/distributed/service/heter_server.h index 66141622f8cdc3..5f062755c92424 100644 --- a/paddle/fluid/distributed/service/heter_server.h +++ b/paddle/fluid/distributed/service/heter_server.h @@ -336,7 +336,7 @@ class HeterServer { bool IsExit() { return service_.IsExit(); } - HeterServer() { this->ready_ = 0; } + HeterServer() : service_(), ready_(0) {} void RegisterServiceHandler(std::string message_name, HeterServiceHandler func); @@ -391,7 +391,7 @@ class HeterServer { DISABLE_COPY_AND_ASSIGN(HeterServer); std::mutex mutex_ready_; - int ready_ = 0; + int ready_; }; } // end namespace distributed diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index a8db38f8077dd7..69a4a180a9071c 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -277,7 +277,7 @@ void HeterSectionWorker::CopyParameters(int microbatch_id, void HeterSectionWorker::Run() { if (debug_) { size_t total_ops_size = forward_ops_.size() + backward_ops_.size(); - op_name_.resize(total_ops_size); + op_name_.reserve(total_ops_size); op_total_time_.resize(total_ops_size); platform::SetNumThreads(1); // forward op + backward op From bfa0d7f347215152428cb27ee98142bcbfda1122 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 7 Dec 2021 11:59:32 +0800 Subject: [PATCH 094/124] [Pten]Move func from kernel_context.h into kernel_context.cc (#37804) * add inplace op adaptation * optimize inplace logic and fix bugs when run kernel that has args of vector * move func in kernel_context.h into kernel_context.cc * refactor logic that transform variable to densetensor * fix bugs when compile * update func name * fix bugs when run windows-ci --- paddle/pten/core/kernel_context.cc | 112 +++++++++++++++++++++++- paddle/pten/core/kernel_context.h | 104 ++++------------------ paddle/pten/kernels/cpu/manipulation.h | 2 + paddle/pten/kernels/cuda/manipulation.h | 1 + 4 files changed, 132 insertions(+), 87 deletions(-) diff --git a/paddle/pten/core/kernel_context.cc b/paddle/pten/core/kernel_context.cc index 443990c07247dc..b2c84807951a52 100644 --- a/paddle/pten/core/kernel_context.cc +++ b/paddle/pten/core/kernel_context.cc @@ -14,4 +14,114 @@ #include "paddle/pten/core/kernel_context.h" -namespace pten {} // namespace pten +namespace pten { + +void KernelContext::EmplaceBackInput(std::shared_ptr input) { + int index = inputs_.size(); + inputs_.emplace_back(std::move(input)); + // Record the start and end index of the input + input_range_.emplace_back(std::pair(index, index + 1)); +} + +void KernelContext::EmplaceBackInputWithoutSetRange( + std::shared_ptr input) { + inputs_.emplace_back(std::move(input)); +} + +void KernelContext::EmplaceBackInputs( + paddle::SmallVector> inputs) { + int index = inputs_.size(); + // Record the start and end index of the input + input_range_.emplace_back(std::pair(index, index + inputs.size())); + inputs_.insert(inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); +} + +void KernelContext::EmplaceBackOutput(std::shared_ptr output) { + int index = outputs_.size(); + outputs_.emplace_back(std::move(output)); + // Record the start and end index of the input + 
output_range_.emplace_back(std::pair(index, index + 1)); +} + +void KernelContext::EmplaceBackOutputWithoutSetRange( + std::shared_ptr output) { + outputs_.emplace_back(std::move(output)); +} + +void KernelContext::EmplaceBackOutputs( + paddle::SmallVector> outputs) { + int index = outputs_.size(); + // Record the start and end index of the input + output_range_.emplace_back( + std::pair(index, index + outputs.size())); + outputs_.insert(outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); +} + +void KernelContext::EmplaceBackAttr(paddle::any attr) { + attrs_.emplace_back(std::move(attr)); +} + +void KernelContext::AssignInputRange(std::pair&& range, size_t idx) { + if (idx < input_range_.size()) { + input_range_[idx] = range; + } else if (idx == input_range_.size()) { + input_range_.emplace_back(range); + } else { + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "Invalid idx when trying to set InputRange, " + "index is `%d`, it is greater than the size(%d) of InputRange.", + idx, + input_range_.size())); + } +} + +void KernelContext::AssignOutputRange(std::pair&& range, size_t idx) { + if (idx < output_range_.size()) { + output_range_[idx] = range; + } else if (idx == output_range_.size()) { + output_range_.emplace_back(range); + } else { + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "Invalid idx when trying to set InputRange, " + "index is `%d`, it is greater than the size(%d) of InputRange.", + idx, + output_range_.size())); + } +} + +const std::pair& KernelContext::InputRangeAt(size_t idx) const { + return input_range_.at(idx); +} + +const std::pair& KernelContext::OutputRangeAt(size_t idx) const { + return output_range_.at(idx); +} + +std::pair& KernelContext::MutableInputRangeAt(size_t idx) { + return input_range_[idx]; +} + +std::pair& KernelContext::MutableOutputRangeAt(size_t idx) { + return output_range_[idx]; +} + +// Temporary method: For compatible with fluid Tensor and improve performance +// Only deal with DenseTensor now +void KernelContext::ClearData() { + for (auto& in : inputs_) { + if (in) { + CompatibleDenseTensorUtils::ClearStorage( + static_cast(in.get())); + } + } + for (auto& out : outputs_) { + CompatibleDenseTensorUtils::ClearStorage( + static_cast(out.get())); + } + attrs_.clear(); +} +} // namespace pten diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h index 8a87a5b735e99e..6c695987096cb7 100644 --- a/paddle/pten/core/kernel_context.h +++ b/paddle/pten/core/kernel_context.h @@ -51,53 +51,29 @@ class KernelContext { return static_cast(*dev_ctx_); } - void EmplaceBackInput(std::shared_ptr input) { - int index = inputs_.size(); - inputs_.emplace_back(std::move(input)); - // Record the start and end index of the input - input_range_.emplace_back(std::pair(index, index + 1)); - } + void EmplaceBackInput(std::shared_ptr input); - void EmplaceBackInputWithoutSetRange(std::shared_ptr input) { - inputs_.emplace_back(std::move(input)); - } + void EmplaceBackInputWithoutSetRange(std::shared_ptr input); void EmplaceBackInputs( - paddle::SmallVector> inputs) { - int index = inputs_.size(); - // Record the start and end index of the input - input_range_.emplace_back( - std::pair(index, index + inputs.size())); - inputs_.insert(inputs_.end(), - std::make_move_iterator(inputs.begin()), - std::make_move_iterator(inputs.end())); - } + paddle::SmallVector> inputs); - void EmplaceBackOutput(std::shared_ptr output) { - int index = outputs_.size(); - 
outputs_.emplace_back(std::move(output)); - // Record the start and end index of the input - output_range_.emplace_back(std::pair(index, index + 1)); - } + void EmplaceBackOutput(std::shared_ptr output); - void EmplaceBackOutputWithoutSetRange(std::shared_ptr output) { - outputs_.emplace_back(std::move(output)); - } + void EmplaceBackOutputWithoutSetRange(std::shared_ptr output); void EmplaceBackOutputs( - paddle::SmallVector> outputs) { - int index = outputs_.size(); - // Record the start and end index of the input - output_range_.emplace_back( - std::pair(index, index + outputs.size())); - outputs_.insert(outputs_.end(), - std::make_move_iterator(outputs.begin()), - std::make_move_iterator(outputs.end())); - } + paddle::SmallVector> outputs); - void EmplaceBackAttr(paddle::any attr) { - attrs_.emplace_back(std::move(attr)); - } + void EmplaceBackAttr(paddle::any attr); + + const std::pair& InputRangeAt(size_t idx) const; + + const std::pair& OutputRangeAt(size_t idx) const; + + std::pair& MutableInputRangeAt(size_t idx); + + std::pair& MutableOutputRangeAt(size_t idx); template const TensorType& InputAt(size_t idx) const { @@ -119,41 +95,9 @@ class KernelContext { return v; } - const std::pair& InputRangeAt(size_t idx) const { - return input_range_.at(idx); - } - - const std::pair& OutputRangeAt(size_t idx) const { - return output_range_.at(idx); - } + void AssignInputRange(std::pair&& range, size_t idx); - void AssignInputRange(std::pair&& range, size_t idx) { - if (idx < input_range_.size()) { - input_range_[idx] = range; - } else if (idx == input_range_.size()) { - input_range_.emplace_back(range); - } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - "Invalid idx when trying to set InputRange, " - "index is `%d`, it is greater than the size(%d) of InputRange.", - idx, - input_range_.size())); - } - } - - void AssignOutputRange(std::pair&& range, size_t idx) { - if (idx < output_range_.size()) { - output_range_[idx] = range; - } else if (idx == output_range_.size()) { - output_range_.emplace_back(range); - } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - "Invalid idx when trying to set InputRange, " - "index is `%d`, it is greater than the size(%d) of InputRange.", - idx, - output_range_.size())); - } - } + void AssignOutputRange(std::pair&& range, size_t idx); template TensorType* MutableInputAt(size_t idx) { @@ -187,19 +131,7 @@ class KernelContext { // Temporary method: For compatible with fluid Tensor and improve performance // Only deal with DenseTensor now - void ClearData() { - for (auto& in : inputs_) { - if (in) { - CompatibleDenseTensorUtils::ClearStorage( - static_cast(in.get())); - } - } - for (auto& out : outputs_) { - CompatibleDenseTensorUtils::ClearStorage( - static_cast(out.get())); - } - attrs_.clear(); - } + void ClearData(); size_t InputsSize() const { return inputs_.size(); } size_t OutputsSize() const { return outputs_.size(); } diff --git a/paddle/pten/kernels/cpu/manipulation.h b/paddle/pten/kernels/cpu/manipulation.h index 3dce249c54532c..36f9aaa85aa5e3 100644 --- a/paddle/pten/kernels/cpu/manipulation.h +++ b/paddle/pten/kernels/cpu/manipulation.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/pten/kernels/cuda/manipulation.h b/paddle/pten/kernels/cuda/manipulation.h index bb724beb2e34b9..c0f2d8a11414e6 100644 --- a/paddle/pten/kernels/cuda/manipulation.h +++ b/paddle/pten/kernels/cuda/manipulation.h @@ -18,6 +18,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/device_context.h" From 506e79d1833066f5ce72c478383b352eb5d3e1d7 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Tue, 7 Dec 2021 12:13:14 +0800 Subject: [PATCH 095/124] [Auto para] Relaunch with auto mapping function (#37326) * [Auto Parallel] Add the unified cluster representation * [Auto Parallel] Add the graph class for physical mapping * [Auto Parallel] Add the simple physical mapper * Set the timeout of the mapper * Merge the upstream develop unittests cmake files * Fix a bug of the process group * Remove mapper unittest from platforms which is not GPU * Move the instantiation of process group after resharding * Add the local id for devices * Update the rank mapping format * [Auto Parallel] Relaunch with the rank mapping file * Remove the unnecessary json file * Avoid entering get_device_proc_info for auto mapping * Correct the mapper unit test * Add some comments * Remove the related files about mapping * Update the unittest for auto mapping * Remove unused rank_mapping unittest * Improve the unittest coverage * Improve the unittest coverage * Improve the unittest of relaunch * Fix the unittest problem in CI * Improve the unittest of relaunch * Remove unnecessary statements * Update the unittest cmakefile * Correct the cmakefile of auto parallel unittests * Modify codes based on the new elastic change * Use the GPUs exclusively in the unittest * Correct the cmakefile * Set the timeout of the unittest --- .../distributed/auto_parallel/mapper.py | 14 +- .../distributed/auto_parallel/parallelizer.py | 158 +++++++++---- .../auto_parallel/process_group.py | 12 +- python/paddle/distributed/fleet/launch.py | 73 ++++-- .../paddle/distributed/fleet/launch_utils.py | 215 +++++++++++++++--- .../fluid/tests/unittests/CMakeLists.txt | 2 + .../unittests/auto_parallel/CMakeLists.txt | 6 + .../auto_parallel_relaunch_model.py | 162 +++++++++++++ .../tests/unittests/auto_parallel/launch.py | 22 ++ .../test_auto_parallel_relaunch.py | 118 ++++++++++ .../unittests/test_auto_parallel_mapper.py | 3 + 11 files changed, 675 insertions(+), 110 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/launch.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py index f015cf4477195f..543fa2d9681c01 100644 --- a/python/paddle/distributed/auto_parallel/mapper.py +++ b/python/paddle/distributed/auto_parallel/mapper.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License +import os import operator import functools import json @@ -175,9 +176,19 @@ def build_process_graph(distributed_program): def build_cluster_graph(cluster): graph = Graph() + 
cuda_visible_devices_env = os.getenv("CUDA_VISIBLE_DEVICES") + cuda_visible_devices = [] + if cuda_visible_devices_env is not None and cuda_visible_devices_env != "": + cuda_visible_devices = [ + int(d.strip()) for d in cuda_visible_devices_env.split(",") + ] for machine in cluster.machines.values(): for device in machine.devices.values(): graph.add_node(device.global_id, device=device) + if cuda_visible_devices and device.local_id not in cuda_visible_devices: + graph.nodes[device.global_id]["occupied"] = True + else: + graph.nodes[device.global_id]["occupied"] = False for link in machine.links.values(): graph.add_edge( link.source.global_id, link.target.global_id, link=link) @@ -195,9 +206,6 @@ def mapping(distributed_program, cluster): for cur_rank_node in process_graph: cur_rank_node["visited"] = False - for cur_device_node in cluster_graph: - cur_device_node["occupied"] = False - def sort_by_comm_volume(rank_edge): return rank_edge["comm_requirements"]["comm_volume"] diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 14556ff6ef4598..affb27317daafa 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import sys +import json +import shlex +import copy +import pathlib +import subprocess import logging import paddle from paddle.distributed.utils import get_logger @@ -23,9 +30,12 @@ from .completion import complete_annotation, complete_backward_annotation from .partitioner import Partitioner from .process_group import get_all_process_groups +from .process_group import get_world_process_groups from .utils import make_data_unshard from .utils import set_grad_var_shape from .reshard import reshard +from .cluster import Cluster +from .mapper import mapping # from .auto_search import auto_search _logger = get_logger(logging.INFO) @@ -46,6 +56,21 @@ def __init__(self, fleet): self._optimizer = self._fleet.user_defined_optimizer self._dist_strategy = self._fleet._user_defined_strategy self._dist_context = DistributedContext() + self._cluster = None + self._cluster_topo_path = os.getenv("PADDLE_CLUSTER_TOPO_PATH", None) + if self._cluster_topo_path is not None: + self._cluster = Cluster() + self._cluster.build_from_file(self._cluster_topo_path) + # Prepare information for auto mapping + self._rank_mapping_path = os.getenv("PADDLE_RANK_MAPPING_PATH", None) + enable_auto_mapping_env = os.getenv("PADDLE_ENABLE_AUTO_MAPPING", None) + if enable_auto_mapping_env is None: + self._enable_auto_mapping = False + else: + self._enable_auto_mapping = True + self._need_rank_mapping = os.getenv("PADDLE_NEED_RANK_MAPPING") + self._need_rank_mapping = True if self._need_rank_mapping and \ + self._need_rank_mapping.lower() == 'true' else False def _remove_distributed_attrs(self, main_program): suffix = core.kAutoParallelSuffix() @@ -57,60 +82,103 @@ def _remove_distributed_attrs(self, main_program): if suffix in attr_name: op._remove_attr(attr_name) + def _get_dist_program(self, dist_context, rank): + # Annotation completion + completed_main_program = complete_annotation(self._main_program, + dist_context) + # Logical partition + partitioner = Partitioner(self._dist_strategy, dist_context, rank) + dist_main_prog, dist_startup_prog = partitioner.transpile_forward( + completed_main_program, self._startup_program) + 
dist_params_grads = partitioner.apply_backward( + self._loss, completed_main_program, self._startup_program, + dist_main_prog, dist_startup_prog) + dist_optimize_ops = partitioner.apply_optimize( + copy.deepcopy(self._optimizer), dist_params_grads, dist_main_prog, + dist_startup_prog) + + make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) + + reshard(dist_main_prog, dist_startup_prog, rank, dist_context) + + return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog + def parallelize(self, loss, startup_program, parameter_list=None, no_grad_set=None): assert startup_program is not None - main_program = loss.block.program - - if self._dist_strategy.auto_search: - # auto search - _logger.info("Start search dist attr.") - # self._dist_context, _ = auto_search(main_program, startup_program, - # loss, self._optimizer) - # completed_main_program = main_program - raise NotImplementedError("Auto search has not implemented") - else: - # Annotation completion - _logger.info("Start annotation dist attr.") - completed_main_program = complete_annotation(main_program, - self._dist_context) - - # Logical partition - rank = paddle.distributed.get_rank() - partitioner = Partitioner(self._dist_strategy, self._dist_context, rank) - partitioned_main_prog, partitioned_startup_prog = partitioner.transpile_forward( - completed_main_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, completed_main_program, startup_program, - partitioned_main_prog, partitioned_startup_prog) - dist_optimize_ops = partitioner.apply_optimize( - self._optimizer, dist_params_grads, partitioned_main_prog, - partitioned_startup_prog) + self._loss = loss + self._startup_program = startup_program + self._main_program = loss.block.program + self._parameter_list = parameter_list + self._no_grad_set = no_grad_set + + if self._enable_auto_mapping and self._need_rank_mapping: + # Do the mapping pass before parallelization + assert self._cluster is not None, \ + "The cluster must not be none when using auto mapping." + dist_programs = {} + world_process_group = get_world_process_groups() + for rank in world_process_group.ranks: + dist_context = DistributedContext() + dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog = self._get_dist_program( + dist_context, rank) + dist_programs[rank] = dist_main_prog + + # Do the mapping between the distributed program graph and the cluster graph + rank_mapping_dict = mapping(dist_programs, self._cluster) + rank_mapping = list(rank_mapping_dict.values()) - # set the grad var shape - set_grad_var_shape(partitioned_main_prog, self._dist_context) + # Relaunch the training by using the rank mapping file + with open(self._rank_mapping_path, "w") as rank_mapping_file: + json.dump(rank_mapping, rank_mapping_file) + + enable_elastic = os.getenv("PADDLE_ENABLE_ELASTIC") + enable_elastic = True if enable_elastic and enable_elastic.lower( + ) == 'true' else False + if enable_elastic: + print("Auto mapping finished, now do elastic re-launch") + sys.exit(paddle.distributed.fleet.elastic.manager. 
+ ELASTIC_AUTO_PARALLEL_EXIT_CODE) + + original_cmd_args = os.getenv("PADDLE_ORIGINAL_CMD_ARGS") + rank_mapping_args = " ".join( + ["--rank_mapping_path", self._rank_mapping_path]) + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + new_cmd_args = "-m paddle.distributed.fleet.launch" + " " + rank_mapping_args + " " + original_cmd_args + new_cmd = [sys.executable, "-u"] + coverage_args + shlex.split( + new_cmd_args) + new_process = subprocess.Popen(new_cmd) + new_process.wait() + assert new_process.returncode == 0, \ + "Launch failed with rank mapping" + print("Successfully do the second launch for auto mapping!") + sys.exit(0) + else: + # Parallelization after the mapping pass + rank = paddle.distributed.get_rank() - # The last step: remove all distributed attributes to be compatiable - # with inference. - self._remove_distributed_attrs(partitioned_main_prog) - make_data_unshard(partitioned_main_prog, partitioned_startup_prog, - self._dist_context) + dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog = self._get_dist_program( + self._dist_context, rank) - reshard(partitioned_main_prog, partitioned_startup_prog, rank, - self._dist_context) + # Traverse different rank programs and traverse each op of them, + # instantiate communication by process_mapping. + all_process_groups = get_all_process_groups() + for process_group in all_process_groups: + if rank not in process_group.ranks: + continue + process_group.instantiate() - # Traverse different rank programs and traverse each op of them, - # instantiate communication by process_mapping. - all_process_groups = get_all_process_groups() - for process_group in all_process_groups: - if rank not in process_group.ranks: - continue - process_group.instantiate() + # Copy distributed info to the default context + set_default_distributed_context(self._dist_context) - # Copy distributed info to the default context - set_default_distributed_context(self._dist_context) + # The last step: remove all distributed attributes to be compatible + # with inference. + self._remove_distributed_attrs(dist_main_prog) - return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog + return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 70a19f6c5386a3..2e4d370b39435d 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -19,10 +19,6 @@ from ...fluid.framework import in_dygraph_mode from ...fluid.layers.tensor import fill_constant -# Note that Process group 0 is reserved for representing all ranks. -# At the begining, group 0 is empty and new ranks will be added automatically. -_g_process_group_map = {} - def get_all_process_groups(): global _g_process_group_map @@ -34,6 +30,11 @@ def get_process_group(group_id): return _g_process_group_map.get(group_id, None) +def get_world_process_groups(): + global _g_process_group_map + return _g_process_group_map[0] + + def new_process_group(ranks): global _g_process_group_map # A key constructed from ranks is used for avoiding duplication @@ -151,4 +152,7 @@ def __str__(self): return string +# Note that Process group 0 is reserved for representing all ranks. +# At the begining, group 0 is empty and new ranks will be added automatically. 
+_g_process_group_map = {} _g_process_group_map[0] = ProcessGroup(0, []) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 708ba2816077e1..19306d3da99168 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -175,25 +175,17 @@ def _parse_args(): default="127.0.0.1", help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") collective_group.add_argument( - "--rank_mapping_file", - type=argparse.FileType('r'), - default=sys.stdin, - help="This rank mapping information in json format is used specifically " - "for lazy launch for auto parallel. Some of the ranks in each node " - "may not be used, and the indices of rank should be kept the same " - "as the indices of sub-task splited by auto parallel. " - " { " - " \"ip_ranks\": [ " - " { " - " \"ip\": \"127.0.0.1\", " - " \"ranks\": [0,1] " - " }, " - " { " - " \"ip\": \"127.0.0.2\", " - " \"ranks\": [2,3,4] " - " } " - " ] " - " } ") + "--cluster_topo_path", + type=str, + default=None, + help="A json format file will be stored in this path which is used" + "to represent the cluster topology information for auto parallel.") + collective_group.add_argument( + "--rank_mapping_path", + type=str, + default=None, + help="A json format file will be stored in this path which is used" + "to map processes to machines for auto parallel.") collective_group.add_argument( "--enable_auto_mapping", type=bool, @@ -297,20 +289,56 @@ def cpuonly_check(args): def get_cluster_info(args): # parse arguments, used for cloud-single-machine and local if args.backend == 'gloo': cpuonly_check(args) - (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args) + if args.enable_auto_mapping: + (device_mode, devices_per_proc) = (DeviceMode.GPU, []) + else: + (device_mode, + devices_per_proc) = launch_utils.get_device_proc_info(args) trainers_num = cloud_utils.get_trainers_num() logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format( trainers_num, device_mode, devices_per_proc)) + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + cluster = None pod = None start_port = 6170 if os.environ.get('FLAGS_START_PORT') is not None: start_port = os.environ.get('FLAGS_START_PORT') - # lazy launch for auto-parallel + # auto mapping between processes and devices for auto-parallel if args.enable_auto_mapping == True: - cluster, pod = get_mapped_cluster_from_args(args, device_mode) + assert args.cluster_topo_path is not None, \ + "The cluster topology must be provied when enabling auto mapping." 
+ rank_mapping_path = args.rank_mapping_path or os.getenv( + "PADDLE_RANK_MAPPING_PATH") + if not rank_mapping_path: + os.environ["PADDLE_NEED_RANK_MAPPING"] = str(True) + os.environ["PADDLE_ENABLE_ELASTIC"] = str( + enable_elastic(args, device_mode)) + cwd = pathlib.Path().resolve() + rank_mapping_path = os.path.join(cwd, + "auto_parallel_rank_mapping.json") + os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path) + + original_args = sys.argv[1:] + os.environ["PADDLE_ORIGINAL_CMD_ARGS"] = " ".join(original_args) + os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path) + os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str( + args.enable_auto_mapping) + cluster, pod = launch_utils.get_mapped_cluster_from_args_without_rank_mapping( + args, device_mode) + else: + os.environ["PADDLE_NEED_RANK_MAPPING"] = str(False) + os.environ["PADDLE_ENABLE_ELASTIC"] = str( + enable_elastic(args, device_mode)) + + os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path) + os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path) + os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str( + args.enable_auto_mapping) + cluster, pod = launch_utils.get_mapped_cluster_from_args_with_rank_mapping( + args, device_mode) elif cloud_utils.use_paddlecloud() and trainers_num != 1: cluster, pod = cloud_utils.get_cloud_cluster( args.ips, device_mode, devices_per_proc, start_port) @@ -328,6 +356,7 @@ def get_cluster_info(args): logger.debug("get cluster from args:{}".format(cluster)) return cluster, pod + def get_global_envs(args, tmp_dir): global_envs = copy.copy(os.environ.copy()) # add gloo env diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 569f64c18bf52f..c20c209d601714 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -511,6 +511,17 @@ def start_local_trainers(cluster, "PADDLE_WORLD_DEVICE_IDS": ",".join(res), } + # The following three environnement variables are used for auto mapping + if current_env.get("PADDLE_CLUSTER_TOPO_PATH", None) is not None: + proc_env["PADDLE_CLUSTER_TOPO_PATH"] = current_env[ + "PADDLE_CLUSTER_TOPO_PATH"] + if current_env.get("PADDLE_RANK_MAPPING_PATH", None) is not None: + proc_env["PADDLE_RANK_MAPPING_PATH"] = current_env[ + "PADDLE_RANK_MAPPING_PATH"] + if current_env.get("PADDLE_ENABLE_AUTO_MAPPING", None) is not None: + proc_env["PADDLE_ENABLE_AUTO_MAPPING"] = current_env[ + "PADDLE_ENABLE_AUTO_MAPPING"] + if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.GPU: proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) @@ -531,7 +542,8 @@ def start_local_trainers(cluster, current_env.update(proc_env) coverage_args = [] - if run_with_coverage(): + if run_with_coverage() or os.environ.get("WITH_COVERAGE", + "OFF") == "ON": coverage_args = ["-m", "coverage", "run", "--branch", "-p"] cmd = [sys.executable, "-u"] + coverage_args + [training_script ] + training_script_args @@ -557,7 +569,11 @@ def start_local_trainers(cluster, with open("%s/endpoints.log" % log_dir, "w") as f: f.write("PADDLE_TRAINER_ENDPOINTS: \n") f.write("\n".join(cluster.trainers_endpoints())) - fn = open("%s/workerlog.%d" % (log_dir, idx), "a") + if current_env.get("PADDLE_ENABLE_AUTO_MAPPING") is not None \ + and current_env.get("PADDLE_NEED_RANK_MAPPING").lower() == "true": + fn = open("%s/prelaunchlog.%d" % (log_dir, idx), "a") + else: + fn = open("%s/workerlog.%d" % (log_dir, idx), "a") proc = subprocess.Popen( cmd, 
env=current_env, stdout=fn, stderr=fn, preexec_fn=pre_fn) else: @@ -876,8 +892,8 @@ def get_custom_endpoints(origin_endpoints, offset=0): # pretty_print_envs(environs))) -def get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, - node_mapping_ranks): +def get_mapped_cluster_without_rank_mapping( + node_ips, node_ip, trainer_endpoints, device_mode, node_ranks): assert type(trainer_endpoints) is list, "trainer_endpoints must be list" assert device_mode == DeviceMode.GPU, \ "Only support get mapped cluster for gpu now." @@ -890,17 +906,121 @@ def get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, cur_node_endpoints = trainer_endpoints[node_rank] # choose rank from global mapped ranks and set it to the trainer. - ranks_per_node = node_mapping_ranks[node_rank] + ranks_per_node = node_ranks[node_rank] + assert len(ranks_per_node) == 1 for i in range(len(ranks_per_node)): trainer = Trainer() - # change global rank(mapped) to local rank within each node. - # e.g. mapped ranks of node: 3,4,7 -> 0,1,2 - local_rank = ranks_per_node.index(ranks_per_node[i]) - trainer.accelerators.append(local_rank) trainer.endpoint = "%s" % (cur_node_endpoints[i]) - # global mapped ranks trainer.rank = ranks_per_node[i] + pod.trainers.append(trainer) + cluster.pods.append(pod) + + pod_rank = node_ips.index(node_ip) + return cluster, cluster.pods[pod_rank] + + +def get_mapped_cluster_from_args_without_rank_mapping(args, device_mode): + assert device_mode == DeviceMode.GPU, \ + "Only support get mapped cluster for gpu now." + gpus_num = fluid.core.get_cuda_device_count() + + # parse ip-ranks json file + cluster_topo = None + with open(args.cluster_topo_path, "r") as json_file: + cluster_topo = json.load(json_file) + + node_ips = [] + node_ranks = [] + for idx, cur_cluster_topo in enumerate(cluster_topo["machines"]): + node_ips.append(cur_cluster_topo['addr']) + node_ranks.append([idx]) + + if len(node_ips) == 1: + node_ip = node_ips[0] + else: + if args.host: + node_ip = args.host + else: + _, node_ip = get_host_name_ip() + + assert node_ip in node_ips, \ + "Can't find your local ip {%s} in node_ips: {%s}" % (node_ip, node_ips) + node_rank = node_ips.index(node_ip) + + assert len(node_ranks) == len(node_ips), \ + "ranks length should be equal to ips length." + + logger.debug("parsed from args: node_ips:{} node_ip:{} " + "node_rank:{} node_ranks:{}".format( + node_ips, node_ip, node_rank, node_ranks[node_rank])) + + # NOTE: there are different number of global mapped ranks on each node. 
+ free_ports = [] + trainer_endpoints = [] + for ip in node_ips: + node_rank = node_ips.index(ip) + if os.environ.get('PADDLE_PORT') is not None: + start_port = int(os.getenv("PADDLE_PORT", "")) + free_ports = [ + x + for x in range(start_port, start_port + len(node_ranks[ + node_rank])) + ] + elif os.environ.get('FLAGS_START_PORT') is not None: + start_port = int(os.environ.get('FLAGS_START_PORT')) + free_ports = [ + x + for x in range(start_port, start_port + len(node_ranks[ + node_rank])) + ] + else: + free_ports = find_free_ports(len(node_ranks[node_rank])) + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + + return get_mapped_cluster_without_rank_mapping( + node_ips, node_ip, trainer_endpoints, device_mode, node_ranks) + + +def get_mapped_cluster_with_rank_mapping(node_ips, node_ip, trainer_endpoints, + device_mode, node_ranks, + node_rank_mappings): + assert type(trainer_endpoints) is list, "trainer_endpoints must be list" + assert device_mode == DeviceMode.GPU, \ + "Only support get mapped cluster for gpu now." + + def get_relative_gpu_id(gpu_id): + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_visible_devices is None or cuda_visible_devices == "": + return gpu_id + else: + cuda_visible_devices_list = cuda_visible_devices.split(',') + relative_id = cuda_visible_devices_list.index(str(gpu_id)) + logger.info( + "Change gpu id from {} to {} based on CUDA_VISIBLE_DEVICES {}". + format(gpu_id, relative_id, cuda_visible_devices_list)) + return relative_id + + cluster = Cluster(hdfs=None) + for node_rank, ip in enumerate(node_ips): + pod = Pod() + pod.rank = node_rank + pod.addr = ip + pod.device_mode = device_mode + cur_node_endpoints = trainer_endpoints[node_rank] + # choose rank from global mapped ranks and set it to the trainer. + ranks_per_node = node_ranks[node_rank] + cur_node_rank_mapping = node_rank_mappings[node_rank] + for i in range(len(ranks_per_node)): + trainer = Trainer() + local_device_ids = cur_node_rank_mapping["ranks"][str( + ranks_per_node[i])] + assert len(local_device_ids) == 1, \ + "Only support one process to one device mapping" + trainer.accelerators.append( + get_relative_gpu_id(local_device_ids[0])) + trainer.endpoint = "%s" % (cur_node_endpoints[i]) + trainer.rank = ranks_per_node[i] pod.trainers.append(trainer) cluster.pods.append(pod) @@ -908,22 +1028,31 @@ def get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, return cluster, cluster.pods[pod_rank] -def get_mapped_cluster_from_args(args, device_mode): +def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode): assert device_mode == DeviceMode.GPU, \ "Only support get mapped cluster for gpu now." 
gpus_num = fluid.core.get_cuda_device_count() # parse ip-ranks json file - json_data = None - with args.rank_mapping_file as json_file: - json_data = json.load(json_file) + rank_mapping_path = args.rank_mapping_path or os.getenv( + "PADDLE_RANK_MAPPING_PATH") + rank_mapping = None + with open(rank_mapping_path, "r") as json_file: + rank_mapping = json.load(json_file) + # reset PADDLE_RANK_MAPPING_PATH env + os.environ["PADDLE_RANK_MAPPING_PATH"] = "" node_ips = [] - node_ranks_mapping = [] - ip_ranks_list = json_data['ip_ranks'] - for ip_ranks in ip_ranks_list: - node_ips.append(ip_ranks['ip']) - node_ranks_mapping.append(ip_ranks['ranks']) + node_ranks = [] + node_rank_mappings = [] + for cur_rank_mapping in rank_mapping: + node_ips.append(cur_rank_mapping['addr']) + cur_node_rank_list = [ + int(i) for i in list(cur_rank_mapping['ranks'].keys()) + ] + cur_node_rank_list.sort() + node_ranks.append(cur_node_rank_list) + node_rank_mappings.append(cur_rank_mapping) if len(node_ips) == 1: node_ip = node_ips[0] @@ -937,31 +1066,41 @@ def get_mapped_cluster_from_args(args, device_mode): "Can't find your local ip {%s} in node_ips: {%s}" % (node_ip, node_ips) node_rank = node_ips.index(node_ip) - assert len(node_ranks_mapping[node_rank]) <= gpus_num, \ + assert len(node_ranks[node_rank]) <= gpus_num, \ "number of ranks mapped to one node should not exceed the avaiable ones." - assert len(node_ranks_mapping) == len(node_ips), \ + assert len(node_ranks) == len(node_ips), \ "ranks length should be equal to ips length." logger.debug("parsed from args: node_ips:{} node_ip:{} " - "node_rank:{} node_ranks_mapping:{}".format( - node_ips, node_ip, node_rank, node_ranks_mapping[ - node_rank])) + "node_rank:{} node_ranks:{}".format( + node_ips, node_ip, node_rank, node_ranks[node_rank])) # NOTE: there are different number of global mapped ranks on each node. free_ports = [] trainer_endpoints = [] for ip in node_ips: node_rank = node_ips.index(ip) - if os.environ.get('FLAGS_START_PORT') is not None: + if os.environ.get('PADDLE_PORT') is not None: + start_port = int(os.getenv("PADDLE_PORT", "")) + free_ports = [ + x + for x in range(start_port, start_port + len(node_ranks[ + node_rank])) + ] + elif os.environ.get('FLAGS_START_PORT') is not None: start_port = int(os.environ.get('FLAGS_START_PORT')) - end_port = start_port + len(node_ranks_mapping[node_rank]) - free_ports = [x for x in range(start_port, end_port)] + free_ports = [ + x + for x in range(start_port, start_port + len(node_ranks[ + node_rank])) + ] else: - free_ports = find_free_ports(len(node_ranks_mapping[node_rank])) + free_ports = find_free_ports(len(node_ranks[node_rank])) trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) - return get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, - node_ranks_mapping) + return get_mapped_cluster_with_rank_mapping(node_ips, node_ip, + trainer_endpoints, device_mode, + node_ranks, node_rank_mappings) class ParameterServerLauncher(object): @@ -1229,14 +1368,18 @@ def get_role_endpoints(self, args): _, self.current_node_ip = get_host_name_ip() else: self.current_node_ip = pod_ip - assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ - % (self.current_node_ip, self.node_ips) - self.node_rank = self.node_ips.index(self.current_node_ip) - logger.debug( - "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". 
- format(self.node_ips, self.current_node_ip, self.node_rank)) + if not self.distribute_mode == DistributeMode.PS_HETER: + assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ + % (self.current_node_ip, self.node_ips) + if self.current_node_ip in self.node_ips: + self.node_rank = self.node_ips.index(self.current_node_ip) + logger.debug( + "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". + format(self.node_ips, self.current_node_ip, self.node_rank)) def start_ps(self): + if not self.current_node_ip in self.node_ips: + return cluster = Cluster(hdfs=None) server_rank = 0 worker_rank = 0 diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 15f857f6087302..4162f697d27eac 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -589,6 +589,8 @@ set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) add_subdirectory(distributed_passes) + add_subdirectory(auto_parallel) + # FIXME(typhoonzero): add these tests back list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler") diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt new file mode 100644 index 00000000000000..4244fda0c51d9c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -0,0 +1,6 @@ +# file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +# string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +if(WITH_DISTRIBUTE AND WITH_GPU) + py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch ENVS ${dist_ENVS}) + set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) +endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py new file mode 100644 index 00000000000000..8e5221ed5ffa68 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
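
For reference, the rank mapping file written by AutoParallelizer.parallelize() and parsed above by get_mapped_cluster_from_args_with_rank_mapping() is a JSON list with one entry per machine: each entry carries the machine address under "addr" and a "ranks" dict that maps a global process rank (as a string) to the list of local device ids assigned to it (currently exactly one device per rank). A minimal hand-written sketch for a single node with two processes, assuming ranks 0 and 1 are pinned to local GPUs 0 and 1, could be produced as below; the real file is generated by mapping() and its contents depend on the cluster topology.

import json

# Hypothetical single-node mapping: global process rank -> list of local device ids.
rank_mapping = [
    {
        "addr": "127.0.0.1",
        "ranks": {
            "0": [0],  # process with global rank 0 uses local GPU 0
            "1": [1],  # process with global rank 1 uses local GPU 1
        },
    },
]

# Write it to the default file name used by the launcher and cleaned up by the test.
with open("auto_parallel_rank_mapping.json", "w") as f:
    json.dump(rank_mapping, f)
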
+ +import unittest +import time +import paddle.fluid as fluid +import copy +import os +import numpy as np +import subprocess +import paddle +import paddle.nn as nn +import paddle.fluid as fluid +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +from paddle.fluid import layers +from paddle.io import IterableDataset, DataLoader +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +batch_size = 4 +hidden_size = 1024 +sequence_len = 512 + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + return out + + +def mlp_pretrain_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mappig": [-1, -1, -1] + }) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + loader = paddle.io.DataLoader.from_generator( + feed_list=[input, label], capacity=4 * batch_size, iterable=True) + + return loss, train_program, start_program, loader + + +def train(): + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + # init parallel optimizer + dist_strategy.semi_auto = True + + fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + loss, train_program, start_program, loader = mlp_pretrain_forward( + train_program, start_program) + + optimizer = paddle.fluid.optimizer.AdamOptimizer( + 
learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + optimizer = fleet.distributed_optimizer(optimizer) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, start_program) + + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + exe = paddle.static.Executor(places[0]) + exe.run(distributed_startup_program) + + for data in loader(): + exe.run(distributed_main_program, feed=data, fetch_list=[loss]) + + +if __name__ == "__main__": + train() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/launch.py b/python/paddle/fluid/tests/unittests/auto_parallel/launch.py new file mode 100644 index 00000000000000..c225fe85cd8448 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/launch.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from paddle.distributed.fleet import launch +from paddle.distributed.fleet.launch_utils import run_with_coverage + +if __name__ == "__main__": + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + run_with_coverage(True) + launch.launch() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py new file mode 100644 index 00000000000000..321b262286218f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
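
From the user's side, the whole auto-mapping flow is driven by a single fleet launch command: the first launch is started with --enable_auto_mapping True and a cluster topology file, the parallelizer computes the rank mapping, writes it to the rank mapping file, and re-launches itself with --rank_mapping_path set, and only that second launch runs the actual training. The test below drives this end to end through subprocess; a reduced sketch of the user-visible invocation, assuming a training script named train.py and a topology file named cluster.json in the current directory, looks roughly like this.

import subprocess
import sys

# First (and only user-visible) launch: auto mapping is enabled and the
# cluster topology is provided; the relaunch with the generated rank
# mapping file happens internally inside AutoParallelizer.parallelize().
cmd = [
    sys.executable, "-u", "-m", "paddle.distributed.fleet.launch",
    "--cluster_topo_path", "cluster.json",
    "--enable_auto_mapping", "True",
    "train.py",
]
subprocess.check_call(cmd)
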
+ +import unittest +import os +import sys +import json +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + +cluster_json = """ +{ + "machines": [ + { + "hostname": "machine1", + "addr": "127.0.0.1", + "port": "768", + "devices": [ + { + "global_id": 0, + "local_id": 0, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 1, + "local_id": 1, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 2, + "local_id": 0, + "type": "CPU", + "model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G", + "arch": "x86_64", + "vendor": "GenuineIntel", + "sp_gflops": 150, + "dp_gflops": 75, + "memory": "503" + } + ], + "links": [ + { + "source_global_id": 0, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 1, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 12 + } + ] + } + ] +} +""" + + +class TestAutoParallelReLaunch(unittest.TestCase): + def test_relaunch(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + + launch_model_path = os.path.join(file_dir, + "auto_parallel_relaunch_model.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "launch", "--cluster_topo_path", cluster_json_path, + "--enable_auto_mapping", "True", launch_model_path + ] + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + rank_mapping_json_path = os.path.join(file_dir, + "auto_parallel_rank_mapping.json") + if os.path.exists(rank_mapping_json_path): + os.remove(rank_mapping_json_path) + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 7b60a9753bd6d4..de37ac56bfbb63 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -52,6 +52,9 @@ from paddle.distributed.auto_parallel.mapper import get_dtype_bytes from paddle.distributed.auto_parallel.mapper import get_comm_volume +if os.getenv("CUDA_VISIBLE_DEVICES") is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = "" + paddle.enable_static() _global_parallel_strategy = None _global_process_mesh = None From 79c25979773b77902ecb5b2d9f918ad8f9bcaf76 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 7 Dec 2021 12:24:18 +0800 Subject: [PATCH 096/124] [Eager] fix cmake generate error, and fix circular import (#37871) * refine a test case, test=develop * rm python, test=develop * refine, test=develop * fix cmake generate error, and fix circular import, test=develop --- paddle/fluid/pybind/eager_method.cc | 2 +- paddle/fluid/pybind/eager_utils.cc | 34 +++++++++++++++++++ paddle/fluid/pybind/eager_utils.h | 2 ++ paddle/pten/core/CMakeLists.txt | 6 ++-- 
paddle/pten/core/convert_utils.cc | 34 ------------------- paddle/pten/core/convert_utils.h | 1 - .../fluid/eager/eager_tensor_patch_methods.py | 2 +- 7 files changed, 41 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index f040566260c74a..e01396a4e3ca76 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -42,7 +42,7 @@ static PyObject* eager_tensor_method_numpy(EagerTensorObject* self, return Py_None; } auto tensor_dims = self->eagertensor.shape(); - auto numpy_dtype = pten::TensorDtype2NumpyDtype(self->eagertensor.type()); + auto numpy_dtype = TensorDtype2NumpyDtype(self->eagertensor.type()); auto sizeof_dtype = pten::DataTypeSize(self->eagertensor.type()); Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank]; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9268fc8e7b976c..c8b6f2c06c731a 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -17,9 +17,11 @@ limitations under the License. */ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" @@ -37,6 +39,38 @@ extern PyTypeObject* g_xpuplace_pytype; extern PyTypeObject* g_npuplace_pytype; extern PyTypeObject* g_cudapinnedplace_pytype; +int TensorDtype2NumpyDtype(pten::DataType dtype) { + switch (dtype) { + case pten::DataType::BOOL: + return pybind11::detail::npy_api::NPY_BOOL_; + case pten::DataType::INT8: + return pybind11::detail::npy_api::NPY_INT8_; + case pten::DataType::UINT8: + return pybind11::detail::npy_api::NPY_UINT8_; + case pten::DataType::INT16: + return pybind11::detail::npy_api::NPY_INT16_; + case pten::DataType::INT32: + return pybind11::detail::npy_api::NPY_INT32_; + case pten::DataType::INT64: + return pybind11::detail::npy_api::NPY_INT64_; + case pten::DataType::FLOAT16: + return pybind11::detail::NPY_FLOAT16_; + case pten::DataType::FLOAT32: + return pybind11::detail::npy_api::NPY_FLOAT_; + case pten::DataType::FLOAT64: + return pybind11::detail::npy_api::NPY_DOUBLE_; + case pten::DataType::COMPLEX64: + return pybind11::detail::NPY_COMPLEX64; + case pten::DataType::COMPLEX128: + return pybind11::detail::NPY_COMPLEX128; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unknow pten::DataType, the int value = %d.", + static_cast(dtype))); + return 0; + } +} + bool PyObject_CheckLongOrConvertToLong(PyObject** obj) { if ((PyLong_Check(*obj) && !PyBool_Check(*obj))) { return true; diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 49f56a61c31f1f..f311e62b8965e1 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -21,6 +21,8 @@ typedef struct { PyObject_HEAD egr::EagerTensor eagertensor; } EagerTensorObject; +int TensorDtype2NumpyDtype(pten::DataType dtype); + bool PyObject_CheckLongOrConvertToLong(PyObject** obj); bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj); bool PyObject_CheckStr(PyObject* obj); diff --git 
a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index 0a2504f50327c1..e19d0a490cef39 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -1,9 +1,9 @@ if(WITH_GPU) - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info python) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) elseif(WITH_ROCM) - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info python) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) else() - cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place python) + cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place) endif() cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce) diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc index e457c57d59e55c..211734f3315bc3 100644 --- a/paddle/pten/core/convert_utils.cc +++ b/paddle/pten/core/convert_utils.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/pten/core/convert_utils.h" -#include "paddle/fluid/operators/py_func_op.h" -#include "paddle/fluid/pybind/tensor_py.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -272,36 +270,4 @@ std::string DataType2String(DataType dtype) { } } -int TensorDtype2NumpyDtype(pten::DataType dtype) { - switch (dtype) { - case pten::DataType::BOOL: - return pybind11::detail::npy_api::NPY_BOOL_; - case pten::DataType::INT8: - return pybind11::detail::npy_api::NPY_INT8_; - case pten::DataType::UINT8: - return pybind11::detail::npy_api::NPY_UINT8_; - case pten::DataType::INT16: - return pybind11::detail::npy_api::NPY_INT16_; - case pten::DataType::INT32: - return pybind11::detail::npy_api::NPY_INT32_; - case pten::DataType::INT64: - return pybind11::detail::npy_api::NPY_INT64_; - case pten::DataType::FLOAT16: - return pybind11::detail::NPY_FLOAT16_; - case pten::DataType::FLOAT32: - return pybind11::detail::npy_api::NPY_FLOAT_; - case pten::DataType::FLOAT64: - return pybind11::detail::npy_api::NPY_DOUBLE_; - case pten::DataType::COMPLEX64: - return pybind11::detail::NPY_COMPLEX64; - case pten::DataType::COMPLEX128: - return pybind11::detail::NPY_COMPLEX128; - default: - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Unknow pten::DataType, the int value = %d.", - static_cast(dtype))); - return 0; - } -} - } // namespace pten diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/convert_utils.h index e5990eb0a89f03..32ed753b4b0abb 100644 --- a/paddle/pten/core/convert_utils.h +++ b/paddle/pten/core/convert_utils.h @@ -48,6 +48,5 @@ pten::LoD TransToPtenLoD(const paddle::framework::LoD& lod); size_t DataTypeSize(DataType dtype); DataType String2DataType(const std::string& str); std::string DataType2String(DataType dtype); -int TensorDtype2NumpyDtype(pten::DataType dtype); } // namespace pten diff --git a/python/paddle/fluid/eager/eager_tensor_patch_methods.py b/python/paddle/fluid/eager/eager_tensor_patch_methods.py index 206c5cf23e6dad..b61bf78116aeb3 100644 --- a/python/paddle/fluid/eager/eager_tensor_patch_methods.py +++ b/python/paddle/fluid/eager/eager_tensor_patch_methods.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.fluid.core as core +from .. import core as core def monkey_patch_eagertensor(): From efd7a2293af9540bbf6ba75e64b2017476f2da22 Mon Sep 17 00:00:00 2001 From: TTerror Date: Tue, 7 Dec 2021 13:21:03 +0800 Subject: [PATCH 097/124] add some op to xpu2 op list && format xpu op list (#37832) * format xpu op list * format xpu op list * update xpu1 op list --- .../fluid/platform/device/xpu/xpu1_op_list.h | 284 ++++++------ .../fluid/platform/device/xpu/xpu2_op_list.h | 403 ++++++++++-------- 2 files changed, 367 insertions(+), 320 deletions(-) diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index 1cc7bba132e594..d6b466ff92c5b9 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -29,40 +29,35 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl1_ops() { // KL1支持的op,通过op_name, data_type, place来索引 static XPUOpMap s_xpu1_kernels{ - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sigmoid_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"hard_switch_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"leaky_relu_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"affine_channel", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"affine_channel_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"affine_channel", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, + {"bilinear_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, @@ -72,188 +67,197 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, - {"c_reduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"c_allreduce_sum", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicaland", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"deformable_conv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"deformable_conv_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"dropout_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub", + {"c_allreduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_sub_grad", + {"c_reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_add_grad", + {"elementwise_div_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_div", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div_grad", + {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, 
- {"elementwise_pow", + {"elementwise_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_floordiv", + {"elementwise_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_mul", + {"elementwise_min_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max", + {"elementwise_mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_max_grad", + {"elementwise_pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min", + {"elementwise_sub_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_min_grad", + {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"expand_as_v2", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"fill_constant", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gaussian_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"bilinear_interp_v2_grad", + {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_switch_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2", + {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"nearest_interp_v2_grad", + {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", + {"leaky_relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"load", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicaland", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"matmul_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, - {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"one_hot", 
XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"range", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", + {"reduce_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_max_grad", + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"shape", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, - {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_grad", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze_grad", + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -268,7 +272,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2_grad", + {"squeeze_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -276,27 +280,29 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose_grad", + {"transpose2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"transpose2_grad", + {"transpose_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"truncated_gaussian_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"uniform_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze_grad", + {"unsqueeze2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -311,7 +317,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, 
XPUPlace())})}, - {"unsqueeze2_grad", + {"unsqueeze_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -319,21 +325,13 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"iou_similarity", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 636b27e051122a..74f519c7a86175 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -29,141 +29,109 @@ using XPUOpMap = std::unordered_map; XPUOpMap& get_kl2_ops() { // KL1支持的op,通过op_name, data_type, place来索引 static XPUOpMap s_xpu2_kernels{ - {"label_smooth", + {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"assign_value", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_sub_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_add", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, + {"batch_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout_grad", + 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_add_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_div", + {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"elementwise_div_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_div_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_pow", + {"elementwise_div", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_div", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_mul_grad", + {"elementwise_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_max_grad", + {"elementwise_min_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"elementwise_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"elementwise_min_grad", + {"elementwise_mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"batch_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"layer_norm_grad", + {"elementwise_mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"softmax_with_cross_entropy_grad", + {"elementwise_pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose_grad", + {"elementwise_sub_grad", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"transpose2_grad", + {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"iou_similarity", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_mean_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"greater_than", - XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace()), + {"expand_as_v2", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"greater_equal", + {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"fill_any_like", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, XPUPlace())})}, - {"fill_any_like", + {"fill_constant", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::INT32, 
XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_grad", + pOpKernelType(vartype::BF16, XPUPlace()), + pOpKernelType(vartype::COMPLEX64, XPUPlace()), + pOpKernelType(vartype::COMPLEX128, XPUPlace())})}, + {"flatten2_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), @@ -172,124 +140,205 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten2_grad", + {"flatten_contiguous_range_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"assign_value", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"dropout_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"elementwise_div_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"layer_norm_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lookup_table_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"flatten_contiguous_range_grad", + {"flatten_grad", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace())})}, - {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), 
+ {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"gaussian_random", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"fill_constant", + {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), - pOpKernelType(vartype::BF16, XPUPlace()), - pOpKernelType(vartype::COMPLEX64, XPUPlace()), - pOpKernelType(vartype::COMPLEX128, XPUPlace())})}, - {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, - {"softmax_grad", + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"greater_than", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"iou_similarity", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"label_smooth", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, - {"gather_nd", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), + {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"less_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"less_than", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, - {"where", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, 
- {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"masked_select", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), - pOpKernelType(vartype::INT64, XPUPlace()), - pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), + {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"expand_as_v2", - XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"reduce_max_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d", + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, + {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"depthwise_conv2d_grad", + {"softmax_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"softmax_with_cross_entropy_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"squeeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"unsqueeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, 
XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"where", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; From 4e63d69b6639b1d802554c1036024f67b4fbb5a0 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Tue, 7 Dec 2021 14:11:17 +0800 Subject: [PATCH 098/124] [Dy2Stat]Polish for zip in dy2stat (#37846) * polish for zip in dy2stat * polish comment * polish is_builtin_len * fix comment --- .../dygraph_to_static/call_transformer.py | 7 +++-- .../dygraph_to_static/convert_call_func.py | 9 +++++- .../dygraph_to_static/convert_operators.py | 9 ++++++ .../dygraph_to_static/test_for_enumerate.py | 28 +++++++++++++++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index 3e606139245d60..a80dfa11402c5c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -39,7 +39,7 @@ def _no_need_convert_call(self, node): Determines whether a function needs to be transformed by `convert_call`. It doesn't need to be transformed when a function satisfies the following conditions: 1. It's a api of paddle - 2. It's a python builtin function not include `len` + 2. 
It's a python builtin function not include `len` and `zip` """ assert isinstance(node, gast.Call) if is_paddle_api(node): @@ -47,10 +47,11 @@ def _no_need_convert_call(self, node): func_str = ast_to_source_code(node.func).strip() try: - from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin_len, is_builtin + from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin_len, is_builtin, is_builtin_zip is_builtin = eval("is_builtin({})".format(func_str)) is_builtin_len = eval("is_builtin_len({})".format(func_str)) - return is_builtin and not is_builtin_len + is_builtin_zip = eval("is_builtin_zip({})".format(func_str)) + return is_builtin and not is_builtin_len and not is_builtin_zip except Exception: return False diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 300586969ff65b..0b009c0049dcb8 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -27,7 +27,7 @@ import six from paddle.fluid.dygraph.container import Sequential -from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len +from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len, convert_zip from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static @@ -79,6 +79,10 @@ def is_builtin_len(func): return False +def is_builtin_zip(func): + return is_builtin(func) and func.__name__ == 'zip' + + def is_unsupported(func): """ Checks whether the func is supported by dygraph to static graph. @@ -164,6 +168,9 @@ def dyfunc(x): if is_builtin_len(func): return convert_len + if is_builtin_zip(func): + return convert_zip + if is_builtin(func) or is_unsupported(func): return func diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 0ac4da947a46bc..ba45dedc40faa4 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -298,6 +298,15 @@ def convert_len(var): return len(var) +def convert_zip(*args): + for i, arg in enumerate(args): + if isinstance(arg, Variable) and arg.shape[0] == -1: + raise RuntimeError( + "Not support zip(tensor, ...) when tensor.shape[0] == -1, " + "but found args[{}].shape[0] == -1 in 'zip'".format(str(i))) + return zip(*args) + + def convert_var_shape(x, idx=None, in_control_flow=False): """ A function representation of the shape of variable. diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py index 2aab27c03110d1..750ed615e7109e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator +from paddle.static import InputSpec program_translator = ProgramTranslator() @@ -322,6 +323,24 @@ def for_original_tuple(): return z +# 23. 
for zip error +@paddle.jit.to_static( + input_spec=[InputSpec(shape=[None, 10]), InputSpec(shape=[None, 10])]) +def for_zip_error(x, y): + for i, j in zip(x, y): + a = i + j + return x + y + + +# 24. for zip +@paddle.jit.to_static( + input_spec=[InputSpec(shape=[2, 10]), InputSpec(shape=[2, 10])]) +def for_zip(x, y): + for i, j in zip(x, y): + a = i + j + return x + y + + class TestTransformBase(unittest.TestCase): def setUp(self): self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( @@ -512,5 +531,14 @@ def test_transformed_result_compare(self): self.transformed_result_compare() +class TestForZip(unittest.TestCase): + def test_for_zip_error(self): + with self.assertRaises(RuntimeError): + paddle.jit.save(for_zip_error, './for_zip_error') + + def test_for_zip(self): + paddle.jit.save(for_zip, './for_zip') + + if __name__ == '__main__': unittest.main() From e3cca8ac7c32906244e25e07f19347b5d7ec7f24 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Tue, 7 Dec 2021 14:16:08 +0800 Subject: [PATCH 099/124] Set runtime_include_dir in Paddle.__init__.py (#37886) Paddle don't have to set runtime_include_dir during run CINN. --- paddle/fluid/pybind/pybind.cc | 9 +++++++++ python/paddle/__init__.py | 11 +++++++++++ python/paddle/device/__init__.py | 2 ++ python/paddle/fluid/framework.py | 16 ++++++++++++++++ python/setup.py.in | 2 ++ 5 files changed, 40 insertions(+) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 9ff9377abb2624..635f3149773e8f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -207,6 +207,14 @@ bool IsCompiledWithMKLDNN() { #endif } +bool IsCompiledWithCINN() { +#ifndef PADDLE_WITH_CINN + return false; +#else + return true; +#endif +} + bool IsCompiledWithHETERPS() { #ifndef PADDLE_WITH_HETERPS return false; @@ -2191,6 +2199,7 @@ All parameter, weight, gradient are variables in Paddle. 
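A minimal sketch of the dispatch rule that patch 098/124 above introduces: builtin calls are normally left untouched by dygraph-to-static, but `len` and now `zip` are redirected to converted implementations. The module paths below are the ones touched by that patch, and the printed values follow from the predicate definitions shown there (illustrative only).

from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import (
    is_builtin, is_builtin_len, is_builtin_zip)

# zip is a builtin, and is_builtin_zip singles it out so that convert_call
# can return convert_zip instead of the raw builtin.
print(is_builtin(zip), is_builtin_zip(zip))      # True True
print(is_builtin(len), is_builtin_len(len))      # True True
print(is_builtin_zip(len), is_builtin_len(zip))  # False False
# Other builtins keep taking the "no need to convert" branch of
# _no_need_convert_call in call_transformer.py.
print(is_builtin(abs), is_builtin_zip(abs))      # True False
# convert_zip itself falls back to the builtin zip, except that it raises the
# RuntimeError added in convert_operators.py when an argument is a Variable
# whose leading dimension is -1 (unknown at transformation time).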
m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); + m.def("is_compiled_with_cinn", IsCompiledWithCINN); m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS); m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance); diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 661cd495b53e8c..a70bd3f81bfc7f 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -282,6 +282,7 @@ from .device import get_cudnn_version # noqa: F401 from .device import set_device # noqa: F401 from .device import get_device # noqa: F401 +from .fluid.framework import is_compiled_with_cinn # noqa: F401 from .fluid.framework import is_compiled_with_cuda # noqa: F401 from .fluid.framework import is_compiled_with_rocm # noqa: F401 from .fluid.framework import disable_signal_handler # noqa: F401 @@ -310,6 +311,16 @@ import paddle.vision # noqa: F401 from .tensor.random import check_shape # noqa: F401 + +# CINN has to set a flag to include a lib +if is_compiled_with_cinn(): + import os + package_dir = os.path.dirname(os.path.abspath(__file__)) + runtime_include_dir = os.path.join(package_dir, "libs") + cuh_file = os.path.join(runtime_include_dir, "cinn_cuda_runtime_source.cuh") + if os.path.exists(cuh_file): + os.environ['runtime_include_dir'] = runtime_include_dir + disable_static() __all__ = [ # noqa diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 84b08fcdd39a09..95402898589f6e 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -18,6 +18,7 @@ from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.fluid.framework import is_compiled_with_cinn # noqa: F401 from paddle.fluid.framework import is_compiled_with_cuda # noqa: F401 from paddle.fluid.framework import is_compiled_with_rocm # noqa: F401 from . import cuda @@ -28,6 +29,7 @@ 'get_device', 'XPUPlace', 'is_compiled_with_xpu', + 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_npu' diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ee7aa4560364e6..0b09c513db8589 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -54,6 +54,7 @@ 'xpu_places', 'cuda_pinned_places', 'in_dygraph_mode', + 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_xpu', @@ -477,6 +478,21 @@ def disable_signal_handler(): core.disable_signal_handler() +def is_compiled_with_cinn(): + """ + Whether this whl package can be used to run the model on CINN. + + Returns (bool): `True` if CINN is currently available, otherwise `False`. + + Examples: + .. code-block:: python + + import paddle + support_cinn = paddle.device.is_compiled_with_cinn() + """ + return core.is_compiled_with_cinn() + + def is_compiled_with_cuda(): """ Whether this whl package can be used to run the model on GPU. 
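A minimal usage sketch for the pieces added in this patch, the is_compiled_with_cinn() flag and the runtime_include_dir variable set in paddle/__init__.py (assuming a wheel built with WITH_CINN=ON and the default libs layout):

import os
import paddle

# is_compiled_with_cinn() mirrors the PADDLE_WITH_CINN macro exposed through
# pybind above; it is also re-exported as paddle.device.is_compiled_with_cinn.
if paddle.device.is_compiled_with_cinn():
    # paddle/__init__.py points this variable at the package's libs directory,
    # where the wheel ships cinn_cuda_runtime_source.cuh for CINN's CUDA codegen.
    print(os.environ.get('runtime_include_dir'))
else:
    print('this wheel was built without CINN')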
diff --git a/python/setup.py.in b/python/setup.py.in index 5690fccf89dda3..e286b9cc735dfb 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -470,7 +470,9 @@ if '${WITH_LITE}' == 'ON': if '${WITH_CINN}' == 'ON': shutil.copy('${CINN_LIB_LOCATION}/${CINN_LIB_NAME}', libs_path) + shutil.copy('${CINN_INCLUDE_DIR}/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh', libs_path) package_data['paddle.libs']+=['libcinnapi.so'] + package_data['paddle.libs']+=['cinn_cuda_runtime_source.cuh'] if '${WITH_PSLIB}' == 'ON': shutil.copy('${PSLIB_LIB}', libs_path) From 6b7b767791054e6ddb98d5117e55ef0777209153 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Tue, 7 Dec 2021 14:22:55 +0800 Subject: [PATCH 100/124] multithread memory optimize error fix (#37894) * multithread_memory_optimize --- .../analysis/passes/memory_optimize_pass.cc | 29 ++++++++++--------- .../analysis/passes/memory_optimize_pass.h | 8 ++--- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 2202b94bee727c..3fa417c2ea6311 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -52,11 +52,11 @@ typedef struct { // The traversal order also affect the lifecycles, so different sort_kind is // used. void MemoryOptimizePass::CollectLifeCycle( - std::unordered_map* lifecycles, + Graph* graph, std::unordered_map* lifecycles, int sort_kind) const { - max_lifecycle_ = 0; + int max_lifecycle = 0; for (auto* op_node : framework::ir::TopologyVarientSort( - *graph_, static_cast(sort_kind))) { + *graph, static_cast(sort_kind))) { if (!op_node->IsOp()) continue; auto reads = op_node->inputs; auto writes = op_node->outputs; @@ -77,20 +77,20 @@ void MemoryOptimizePass::CollectLifeCycle( if (node->Var()->Persistable()) continue; std::string var = node->Name(); if (!lifecycles->count(var)) { - (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_); + (*lifecycles)[var] = std::make_pair(max_lifecycle, max_lifecycle); } else { (*lifecycles)[var].second = - std::max(max_lifecycle_, lifecycles->at(var).second); // max() + std::max(max_lifecycle, lifecycles->at(var).second); // max() } } } - ++max_lifecycle_; + ++max_lifecycle; } } void MemoryOptimizePass::CollectVarMemorySize( - space_table_t* space_table) const { + Graph* graph, space_table_t* space_table) const { const int fake_batch_size = 1; auto valid_var = [&](framework::ir::Node* node) -> bool { @@ -130,7 +130,7 @@ void MemoryOptimizePass::CollectVarMemorySize( // although it's not always the case. so black list is the best compromise // between performance and underlying principle. std::unordered_set black_list; - for (auto* node : graph_->Nodes()) { + for (auto* node : graph->Nodes()) { if (node->IsVar() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { @@ -141,7 +141,7 @@ void MemoryOptimizePass::CollectVarMemorySize( } // Collect tensors from graph. - for (auto* node : graph_->Nodes()) { + for (auto* node : graph->Nodes()) { if (node->IsVar() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR && @@ -304,7 +304,10 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { // 3. Perform reuse plan: Replace all var's name in the model according to the // mapping table. 
if (!argument->enable_memory_optim()) return; - graph_ = argument->main_graph_ptr(); + // Because of pass is a singleton, graph can not be member + // variables,otherwise,errors will be caused under multithreading + // conditions. + auto graph = argument->main_graph_ptr(); int sort_kind = 0; std::unordered_map lifecycles; @@ -312,10 +315,10 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { std::unordered_map node2cluster; std::unordered_map cluster_size; - CollectLifeCycle(&lifecycles, sort_kind); - CollectVarMemorySize(&space_table); + CollectLifeCycle(graph, &lifecycles, sort_kind); + CollectVarMemorySize(graph, &space_table); MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); - UpdateOpDescsByReuse(graph_, node2cluster, sort_kind); + UpdateOpDescsByReuse(graph, node2cluster, sort_kind); return; } diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 6d20aee295b7c1..57052243d2f189 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -57,17 +57,15 @@ class MemoryOptimizePass : public AnalysisPass { private: void CollectLifeCycle( + framework::ir::Graph *graph, std::unordered_map *lifecycles, int sort_kind) const; - void CollectVarMemorySize(space_table_t *space_table) const; + void CollectVarMemorySize(framework::ir::Graph *graph, + space_table_t *space_table) const; public: std::string repr() const override; - - private: - mutable framework::ir::Graph *graph_{nullptr}; - mutable int max_lifecycle_{-1}; }; } // namespace analysis From b154110a0357f9a709ff1c09eb6fb804a6811b19 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Tue, 7 Dec 2021 15:06:53 +0800 Subject: [PATCH 101/124] make some non_parallel unittest parallel execute (#37805) * make some non_parallel unittest parallel execute * delete duplicate ut --- tools/parallel_UT_rule.py | 587 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 559 insertions(+), 28 deletions(-) diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 0d106102aa2675..79a742c314bd07 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -1236,7 +1236,6 @@ 'test_set_bool_attr', 'test_sequence_topk_avg_pooling', 'test_sequence_scatter_op', - 'test_sequence_scatter_op', 'test_sequence_last_step', 'test_sequence_first_step', 'test_seqpool_cvm_concat_fuse_pass', @@ -1258,7 +1257,6 @@ 'test_require_version', 'test_requantize_mkldnn_op', 'test_repeated_fc_relu_fuse_pass', - 'test_repeated_fc_relu_fuse_pass', 'test_registry', 'test_reducescatter_api', 'test_reducescatter', @@ -1410,7 +1408,6 @@ 'test_fleet_nocvm_1', 'test_fleet_base_4', 'test_fleet', - 'test_fleet', 'test_flags_use_mkldnn', 'test_flags_mkldnn_ops_on_off', 'test_filter_by_instag_op', @@ -1419,8 +1416,6 @@ 'test_feed_fetch_method', 'test_fc_mkldnn_op', 'test_fc_lstm_fuse_pass', - 'test_fc_lstm_fuse_pass', - 'test_fc_gru_fuse_pass', 'test_fc_gru_fuse_pass', 'test_fc_elementwise_layernorm_fuse_pass', 'test_fc_bf16_mkldnn_op', @@ -1511,7 +1506,6 @@ 'test_attention_lstm_op', 'test_analyzer', 'test_aligned_allocator', - 'system_allocator_test', 'stringprintf_test', 'stringpiece_test', 'split_test', @@ -1525,7 +1519,6 @@ 'save_load_op_test', 'save_load_combine_op_test', 'rw_lock_test', - 'retry_allocator_test', 'reader_test', 'reader_blocking_queue_test', 'prune_test', @@ -1581,14 +1574,12 @@ 
'conditional_block_op_test', 'cipher_utils_test', 'check_reduce_rank_test', - 'buffered_allocator_test', 'broadcast_op_test', 'bfloat16_test', 'complex_test', 'beam_search_decode_op_test', 'auto_growth_best_fit_allocator_test', 'assign_op_test', - 'allocator_facade_frac_flags_test', 'aes_cipher_test', 'test_dist_sparse_tensor_load_adagrad', 'test_dist_mnist_fp16_allreduce', @@ -1673,9 +1664,7 @@ 'test_dist_fleet_grad_clip', 'test_custom_concat', 'test_analyzer_seq_pool1_fuse_statis', - 'test_fc_lstm_fuse_pass_cc', 'test_layer_norm_fuse_pass', - 'test_fc_gru_fuse_pass_cc', 'test_fleet_ps', 'test_analyzer_multi_model_prediction', 'test_fleet_base_3', @@ -1683,9 +1672,7 @@ 'test_ascend_trigger', 'test_fleet_amp_meta_optimizer', 'test_fleetrun', - 'test_check_abi', 'dense_table_test', - 'test_adaptive_pool2d_convert_global_pass', 'test_fleet_recompute_meta_optimizer', 'test_fleet_fp16_allreduce_meta_optimizer', 'test_post_training_quantization_lstm_model', @@ -1695,7 +1682,6 @@ 'test_listen_and_serv_op', 'test_analyzer_zerocopytensor_tensor', 'test_collective_optimizer', - 'test_bf16_utils', 'test_analyzer_seq_pool1_compare_determine', 'test_avoid_twice_initialization', 'test_fleet_distributed_strategy', @@ -1704,7 +1690,6 @@ 'test_model_cast_to_bf16', 'test_hybrid_parallel_topology', 'barrier_table_test', - 'test_check_error', 'test_fleet_lamb_meta_optimizer', 'test_fleet_rolemaker_2', 'test_distributed_strategy', @@ -1717,8 +1702,6 @@ 'test_analyzer_capi_ner', 'test_unsqueeze2_eltwise_fuse_pass_cc', 'test_dgc_optimizer', - 'test_fleet_cc', - 'test_repeated_fc_relu_fuse_pass_cc', 'heter_server_test', 'test_custom_conj', 'test_fleet_private_function', @@ -1726,7 +1709,6 @@ 'brpc_service_sparse_sgd_test', 'test_tf32_cudnn', 'test_communicator_geo', - 'test_dispatch_jit', 'test_fleet_dgc_meta_optimizer', 'test_fc_fuse_pass_cc', 'test_communicator_sync', @@ -1775,7 +1757,6 @@ 'test_fc_gru_fuse_pass_cc', 'test_conv_bn_fuse_pass_cc', 'test_adaptive_pool2d_convert_global_pass', - 'test_unsqueeze2_eltwise_fuse_pass_cc', 'test_layer_norm_fuse_pass_cc', 'test_fc_act_mkldnn_fuse_pass', 'test_fleet_cc', @@ -1897,7 +1878,6 @@ 'test_traced_layer_err_msg', 'test_unique_with_counts', 'test_auc_single_pred_op', - 'test_conv_bn_fuse_pass', 'test_instance_norm_op_v2', 'test_softmax_bf16_mkldnn_op', 'test_mean_iou', @@ -1954,10 +1934,8 @@ 'test_collect_fpn_proposals_op', 'test_sequence_unpad_op', 'test_conv1d_transpose_layer', - 'test_sequence_slice_op', 'test_sequence_pool', 'test_conv_elementwise_add_fuse_pass', - 'test_sequence_pad_op', 'test_conv_shift_op', 'test_sequence_expand_as', 'test_cos_sim_op', @@ -2056,16 +2034,12 @@ 'test_cvm_op', 'test_selu_op', 'test_cross_op', - 'test_sequence_conv', 'test_crop_tensor_op', - 'test_sequence_expand', 'test_sequence_mask', - 'test_sequence_pool', 'test_conv_elementwise_add2_act_fuse_pass', 'test_sequence_reshape', 'test_conv2d_fusion_op', 'test_sequence_softmax_op', - 'test_sequence_unpad_op', 'test_compare_reduce_op', 'test_clip_by_norm_op', 'test_box_coder_op', @@ -2116,8 +2090,6 @@ 'test_batch_norm_op_v2', 'test_pool2d_mkldnn_op', 'test_regularizer', - 'test_sequence_concat', - 'test_sequence_expand_as', 'test_sequence_reverse', 'test_shape_op', 'test_diag', @@ -2190,6 +2162,565 @@ 'lite_mul_model_test', 'test_complex_simplenet', 'test_imperative_layers', + 'test_trt_convert_concat', + 'test_trt_convert_affine_channel', + 'test_multi_precision_fp16_train', + 'test_trt_transpose_flatten_concat_fuse_pass', + 'test_trt_tuned_dynamic_shape', + 
'test_quantization_pass', + 'test_trt_fc_fuse_pass', + 'test_var_base', + 'trt_split_converter_test', + 'test_user_defined_quantization', + 'test_quantization_scale_pass', + 'feed_forward_test', + 'test_fuse_optimizer_pass', + 'test_standalone_executor', + 'test_imperative_qat_user_defined', + 'test_mkldnn_fc_act_fuse_pass', + 'test_cross_entropy_loss', + 'test_trt_conv3d_op', + 'test_signal', + 'test_parallel_executor_drop_scope', + 'test_fused_feedforward_op', + 'test_weight_decay_extend', + 'test_fuse_relu_depthwise_conv_pass', + 'test_diag_v2', + 'test_tensorrt_engine', + 'test_tensordot', + 'test_ir_memory_optimize_ifelse_op', + 'test_parallel_executor_mnist', + 'test_load_state_dict_from_old_format', + 'test_fuse_elewise_add_act_pass', + 'test_fetch_unmerged', + 'test_rnn_decode_api', + 'test_activation_op', + 'test_clip_op', + 'test_randint_op', + 'test_imperative_ptb_rnn', + 'test_standalone_controlflow', + 'test_standalone_multiply_write', + 'test_reshape_op', + 'test_parallel_executor_fetch_isolated_var', + 'test_inplace_abn_op', + 'test_fused_transformer_encoder_layer', + 'test_eager_deletion_while_op', + 'test_dataloader_unkeep_order', + 'test_parallel_executor_profiler', + 'test_correlation', + 'test_conv_affine_channel_fuse_pass', + 'test_ir_inplace_pass', + 'test_trt_convert_group_norm', + 'test_scale_op', + 'test_moving_average_abs_max_scale_op', + 'test_tensor_fill_diagonal_', + 'test_tensor_type_promotion', + 'test_flatten_contiguous_range_op', + 'test_fill_any_op', + 'test_trt_yolo_box_op', + 'test_transforms', + 'test_sum_op', + 'test_scatter_op', + 'test_tensor_fill_', + 'test_parallel_executor_pg', + 'test_mix_precision_all_reduce_fuse', + 'test_tensor_register_hook', + 'test_fused_multihead_matmul_op', + 'test_tensorrt_engine_op', + 'test_zeropad2d', + 'test_isclose_op', + 'test_weight_decay', + 'test_async_read_write', + 'test_allclose_op', + 'test_uniform_random_inplace_op', + 'test_decoupled_py_reader', + 'test_op_function_generator', + 'test_dynamic_rnn_stop_gradient', + 'test_api_impl', + 'test_assign_op', + 'test_py_reader_using_executor', + 'test_trt_instance_norm_op', + 'test_uniform_random_op', + 'test_py_func_op', + 'test_eager_deletion_delete_vars', + 'test_bernoulli_op', + 'test_rmsprop_op', + 'test_multinomial_op', + 'test_jit_save_load', + 'test_asp_optimize', + 'test_tensor_zero_', + 'test_fused_elemwise_activation_op', + 'test_profiler', + 'test_ir_memory_optimize_pass', + 'test_callback_reduce_lr_on_plateau', + 'test_pass_builder', + 'test_read_file', + 'test_print_op', + 'test_parallel_executor_dry_run', + 'test_paddle_save_load', + 'test_multiprocess_dataloader_iterable_dataset_static', + 'test_pool3d_api', + 'test_imperative_trace_non_persistable_inputs', + 'test_executor_return_tensor_not_overwriting', + 'test_density_prior_box_op', + 'test_dataloader_keep_order', + 'test_bce_loss', + 'test_simnet_v2', + 'test_fetch_lod_tensor_array', + 'test_stack_op', + 'test_overlap_add_op', + 'test_frame_op', + 'test_dygraph_spectral_norm', + 'test_broadcast_tensors_op', + 'test_pad3d_op', + 'test_cumprod_op', + 'test_imperative_basic', + 'test_cumsum_op', + 'test_atan2_op', + 'trt_fc_prelu_test', + 'test_std_layer', + 'test_squeeze_op', + 'test_split_op', + 'test_sign_op', + 'test_sigmoid_focal_loss', + 'test_set_value_op', + 'test_searchsorted_op', + 'test_run_program_op', + 'test_randperm_op', + 'test_randint_like', + 'test_pylayer_op', + 'test_pow2_decay_with_linear_warmup_op', + 'test_pow', + 'test_pixel_shuffle', + 'test_paddle_imperative_double_grad', 
+ 'test_optimizer_for_varbase', + 'test_onnx_export', + 'test_normalize', + 'test_norm_all', + 'test_nn_sigmoid_op', + 'test_nn_matmul_v2_grad', + 'test_nn_margin_rank_loss', + 'test_mv_op', + 'test_multihead_attention', + 'test_multi_dot_op', + 'test_mse_loss', + 'test_modelaverage', + 'test_min_op', + 'test_metrics', + 'test_merged_momentum_op', + 'test_median', + 'test_math_op_patch_var_base', + 'test_lod_append_op', + 'test_layer_norm_op_v2', + 'test_label_smooth_functional', + 'test_instance_norm_op', + 'test_imperative_thread_local_has_grad', + 'test_imperative_recurrent_usage', + 'test_imperative_container_sequential', + 'test_imperative_container_layerlist', + 'test_imperative_container_layerdict', + 'test_group_norm_op_v2', + 'test_gelu_op', + 'test_faster_tokenizer_op', + 'test_expand_as_op', + 'test_digamma_op', + 'test_diff_op', + 'test_diagonal_op', + 'test_diagflat', + 'test_determinant_op', + 'test_deform_conv2d', + 'test_conv_transpose_nn_grad', + 'test_conj_op', + 'test_complex_reshape', + 'test_chunk_op', + 'test_bmm_op', + 'test_bincount_op', + 'test_beam_search_decode_op', + 'test_arg_min_max_v2_op', + 'test_angle_op', + 'test_adamw_op', + 'test_adamax_api', + 'test_activation_nn_grad', + 'test_sparse_momentum_op', + 'test_softmax_mask_fuse_op', + 'test_sgd_op', + 'test_paddle_save_load_binary', + 'test_ops_roi_align', + 'test_nonzero_api', + 'test_nll_loss', + 'test_neg_op', + 'test_mul_nn_grad', + 'test_inplace', + 'test_graph_send_recv_op', + 'test_fill_constant_op', + 'test_einsum', + 'test_distribution', + 'test_cosine_similarity_api', + 'test_compiled_program', + 'test_compare_op', + 'test_bitwise_op', + 'test_bce_with_logits_loss', + 'test_adaptive_avg_pool3d', + 'test_seq2seq', + 'test_yolo_box_op', + 'test_word2vec', + 'test_scale_mkldnn_op', + 'test_feed_data_check_shape_type', + 'test_asp_pruning_2d_greedy', + 'test_asp_pruning_2d_best', + 'test_asp_pruning_1d', + 'test_activation_bf16_mkldnn_op', + 'test_erf_op', + 'test_complex_getitem', + 'test_vhp', + 'test_top_k_v2_op', + 'test_reinforcement_learning', + 'test_hessian', + 'test_concat_mkldnn_op', + 'test_reduce_mkldnn_op', + 'test_jacobian', + 'test_tril_triu_op', + 'test_transfer_dtype_op', + 'test_tile_op', + 'test_yolov3_loss_op', + 'test_where_op', + 'test_where_index', + 'test_variance_layer', + 'test_unsqueeze_op', + 'test_trunc_op', + 'test_trt_dynamic_shape', + 'test_trt_anchor_generator_op', + 'test_translated_layer', + 'test_tensor_shape', + 'test_split_mkldnn_op', + 'test_slice', + 'test_simnet', + 'test_save_inference_model', + 'test_return', + 'test_program_translator', + 'test_print', + 'test_prelu_mkldnn_op', + 'test_op_attr', + 'test_loop', + 'test_logical', + 'test_list', + 'test_imperative_ocr_attention_model', + 'test_ifelse', + 'test_grad', + 'test_full_name_usage', + 'test_for_enumerate', + 'test_error', + 'test_elementwise_gradient_op', + 'test_dict', + 'test_declarative', + 'test_convert_call', + 'test_cast', + 'test_cache_program', + 'test_break_continue', + 'test_vjp_jvp', + 'test_unique_consecutive_op', + 'test_save_load', + 'test_partial_program', + 'test_len', + 'test_pool2d_api', + 'test_dlpack', + 'test_complex_variable', + 'test_adaptive_max_pool1d', + 'test_imperative_layer_trainable', + 'test_cuda_graph', + 'test_rad2deg', + 'test_custom_grad_input', + 'test_accuracy_op', + 'test_pool1d_api', + 'test_imperative_selected_rows', + 'test_tf32_cublas', + 'test_l1_loss', + 'test_cuda_stream_event', + 'test_adaptive_avg_pool2d', + 'test_normalization_wrapper', + 
'test_select_input_output_op', + 'test_max_op', + 'test_variable_trans_func', + 'test_param_guard', + 'test_share_data_op', + 'test_multiply', + 'test_affine_grid_function', + 'test_lambda', + 'test_prod_op', + 'test_fused_attention_op_api', + 'test_complex_grad_accumulated', + 'cc_imp_py_test', + 'test_deg2rad', + 'test_lgamma_op', + 'test_grad_clip_minimize', + 'test_get_tensor_from_selected_rows_op', + 'test_executor_and_mul', + 'test_tensor', + 'test_complex_abs', + 'test_subtract_op', + 'test_complex_elementwise_layers', + 'test_marker_op', + 'test_typing', + 'test_imperative_container_parameterlist', + 'test_cuda_empty_cache', + 'test_randn_op', + 'test_maximum_op', + 'test_conv2d_api', + 'test_add_position_encoding_op', + 'test_adaptive_max_pool2d', + 'test_tensor_methods', + 'test_imperative_partitial_backward', + 'test_inplace_auto_generated_apis', + 'test_imperative_triple_grad', + 'test_cost_model', + 'test_zeros_like_op', + 'test_ops_roi_pool', + 'test_nn_functional_embedding_dygraph', + 'test_function_hook', + 'test_real_imag_op', + 'test_minimum_op', + 'test_view_op_reuse_allocation', + 'test_ast_util', + 'test_nn_quant_functional_layers', + 'test_adaptive_max_pool3d', + 'test_ones_like', + 'test_lod_array_length_op', + 'test_fetch_feed', + 'test_memory_reuse_exclude_feed_var', + 'test_ir_embedding_eltwise_layernorm_fuse_pass', + 'test_pairwise_distance', + 'test_imperative_hook_for_layer', + 'test_complex_sum_layer', + 'test_sort_op', + 'test_complex_cast', + 'test_complex_transpose', + 'test_reorder_lod_tensor', + 'test_complex_kron', + 'test_complex_trace_layer', + 'test_merge_selectedrows_op', + 'test_imperative_parallel_coalesce_split', + 'test_viterbi_decode_op', + 'test_square_error_cost', + 'test_lod_tensor', + 'test_array_read_write_op', + 'test_weight_normalization', + 'test_glu', + 'test_nn_dice_loss', + 'test_adaptive_avg_pool1d', + 'data_type_transform_test', + 'test_tracer', + 'test_elementwise_div_grad_grad', + 'tensor_util_test', + 'concat_test', + 'math_function_gpu_test', + 'malloc_test', + 'test_elementwise_add_grad_grad', + 'transform_test', + 'strided_memcpy_test', + 'test_gradient_accmulator', + 'test_fused_residual_dropout_bias', + 'test_elementwise_add_op_inplace', + 'lod_tensor_gpu_test', + 'device_event_test', + 'copy_cross_scope_test', + 'test_fused_layernorm_residual_dropout_bias', + 'test_fused_dropout_act_bias', + 'test_tensorrt', + 'test_matmul_api', + 'test_egr_task_fwd_bwd_joint', + 'beam_search_test', + 'test_tensor_to_list', + 'test_identity_op', + 'test_eigvals_op', + 'test_functional_conv1d_transpose', + 'test_Tensor_type', + 'test_analyzer_transformer', + 'test_analyzer_text_classification', + 'test_analyzer_small_dam', + 'test_analyzer_int8_mobilenetv2', + 'test_analyzer_int8_mobilenetv1', + 'test_analyzer_int8_googlenet', + 'test_analyzer_bfloat16_resnet50', + 'test_analyzer_bfloat16_mobilenetv2', + 'test_analyzer_bfloat16_mobilenetv1', + 'test_analyzer_quant_performance_benchmark', + 'test_analyzer_int8_resnet50', + 'test_analyzer_int8_mobilenet_ssd', + 'test_analyzer_bfloat16_googlenet', + 'test_analyzer_transformer_profile', + 'test_analyzer_capi_exp_gpu', + 'test_ir_subgraph_python_interface', + 'test_memory_analysis', + 'test_functional_conv1d', + 'test_op_converter', + 'cost_model_test', + 'test_mkldnn_softplus_activation_fuse_pass', + 'test_custom_relu_op_jit', + 'test_custom_relu_model', + 'test_custom_linear', + 'test_custom_attrs_jit', + 'test_custom_relu_op_setup', + 'test_mkldnn_matmul_v2_transpose_reshape_fuse_pass', + 
'pten_test_backend', + 'test_allocator', + 'pten_test_data_type', + 'test_slice_api', + 'test_scale_api', + 'test_sum_api', + 'enforce_test', + 'test_op_compat_sensible_pass', + 'test_generate_pass_cc', + 'program_processing_test', + 'build_strategy_test', + 'workqueue_test', + 'test_fc_rnn_mkldnn_fuse_pass', + 'test_cpu_quantize_squash_pass', + 'scope_guard_test', + 'pten_utils_test', + 'init_test', + 'cpu_helper_test', + 'complex_gpu_test', + 'bfloat16_gpu_test', + 'test_scale_dev_api', + 'job', + 'test_kernel_factory', + 'test_dot_dev_api', + 'test_copy_dev_api', + 'test_convert_utils', + 'test_type_info', + 'test_flatten_dev_api', + 'test_storage', + 'test_intrusive_ptr', + 'test_dense_tensor', + 'test_mean_dev_api', + 'test_cast_dev_api', + 'test_trt_convert_slice', + 'test_framework_tensor_utils', + 'test_sum_dev_api', + 'test_reshape_dev_api', + 'test_elementwise_dev_api', + 'small_vector_test', + 'test_framework_place_utils', + 'test_reshape_api', + 'test_cast_api', + 'test_pten_exception', + 'test_mean_api', + 'test_framework_storage', + 'test_flatten_api', + 'test_fill_api', + 'test_elementwise_api', + 'test_dot_api', + 'test_split_plugin', + 'test_auto_parallel_api', + 'test_linear_chain_crf_op', + 'test_callback_early_stop', + 'test_tensor_copy_from', + 'test_inplace_and_clear_gradient', + 'test_analyzer_capi_exp_xpu', + 'test_table_printer', + 'test_egr_task_autocodegen', + 'test_static_save_load_bf16', + 'test_reset_grad_inplace_version', + 'test_parallel_executor_run_cinn', + 'test_initializer', + 'test_egr_task_tensor_utils', + 'test_egr_task_hook', + 'test_egr_task_forward_autograd', + 'test_egr_task_eager_utils', + 'test_egr_task_cross_batch', + 'test_egr_task_backward', + 'test_egr_ds_tensor_wrapper', + 'test_egr_ds_grad_tensor_holder', + 'test_egr_ds_grad_node_info', + 'test_egr_ds_auotgrad_meta', + 'test_egr_ds_accumulation_node', + 'test_save_inference_model_conditional_op', + 'test_resnet50_with_cinn', + 'test_parallel_executor_run_load_infer_program', + 'test_parallel_dygraph_sync_batch_norm', + 'test_monitor', + 'test_mkldnn_quantizer_config', + 'test_mkldnn_quantizer', + 'test_lookup_table_v2_bf16_op', + 'test_hapi_hub_model', + 'test_get_inputs_outputs_in_block', + 'test_get_device_properties', + 'test_fleet_elastic_manager', + 'test_fleet_elastic_init', + 'test_fleet_elastic_collective', + 'test_fleet_ascend_utils', + 'test_executor_check_fetch_list', + 'test_eig_op', + 'test_egr_performance_benchmark_fluid_cpu', + 'test_egr_performance_benchmark_eager_cpu', + 'test_egr_ds_eager_tensor', + 'test_datasets', + 'test_dataset_wmt', + 'test_dataset_movielens', + 'test_dataset_download', + 'test_dataset_consistency_inspection', + 'test_dataset_cifar', + 'test_cyclic_cifar_dataset', + 'test_cuda_device_name_capability', + 'test_cuda_device_count', + 'test_cuda_cudnn_version', + 'test_collective_base', + 'test_collective_api_base', + 'test_backward_infer_var_data_type_shape', + 'test_auto_parallel_graph', + 'test_auto_parallel_completion_gpt', + 'test_auto_parallel_completion', + 'test_auto_parallel_cluster', + 'test_analyzer_transformer_fuse', + 'test_analyzer_save_model', + 'test_analyzer_lexical_gru_int8_multi_gru', + 'test_analyzer_lexical_gru_int8', + 'test_analyzer_lexical_gru_bfloat16', + 'test_analyzer_lexical_gru', + 'test_analyzer_lac', + 'test_analyzer_detect_functional_mkldnn', + 'test_analyzer_capi_exp_pd_tensor', + 'test_analyzer_capi_exp_pd_config', + 'test_analyzer_capi_exp_ner', + 'test_analyzer_capi_exp_int', + 'test_analyzer_capi_exp', + 
'string_helper_test', + 'preprocess_local_pascalvoc', + 'preprocess_local_imagenet', + 'paddle_infer_api_errors_test', + 'test_flatten_mkldnn_op', + 'test_transfer_layout_op', + 'test_squeeze2_mkldnn_op', + 'test_conv2d_transpose_bf16_mkldnn_op', + 'test_slice_mkldnn_op', + 'test_parallel_executor_seresnext_base_cpu', + 'test_stack_mkldnn_op', + 'test_split_bf16_mkldnn_op', + 'test_softplus_mkldnn_op', + 'test_scale_bf16_mkldnn_op', + 'test_parallel_executor_seresnext_with_reduce_cpu', + 'test_nearest_interp_v2_mkldnn_op', + 'test_ir_generate_pass', + 'test_fusion_lstm_mkldnn_op', + 'test_fuse_resnet_unit', + 'test_expand_v2_mkldnn_op', + 'test_elementwise_sub_mkldnn_op', + 'test_elementwise_div_mkldnn_op', + 'test_uniform_random_bf16_op', + 'test_reshape_mkldnn_op', + 'test_reduce_bf16_mkldnn_op', + 'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu', + 'test_nearest_interp_mkldnn_op', + 'test_ir_graph_to_program_pass', + 'test_fusion_lstm_int8_mkldnn_op', + 'test_fusion_lstm_bf16_mkldnn_op', + 'test_convert_call_generator', + 'test_container', + 'test_clip_mkldnn_op', + 'test_cast_mkldnn_op', + 'test_bilinear_interp_v2_mkldnn_op', + 'test_bilinear_interp_mkldnn_op', + 'test_asp_utils', ] From b48545ee3d857a6e922f36265b037359b717071b Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Tue, 7 Dec 2021 15:19:42 +0800 Subject: [PATCH 102/124] fix filter_by_instag op for lod_level=0 without lod;test=develop (#37834) --- paddle/fluid/operators/filter_by_instag_op.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 77bc9e466e808c..fd0f42df11875e 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -65,19 +65,26 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] 
- auto x2_lods = x2->lod()[0]; + size_t x2_lods_size = x2->dims()[0]; Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { x1_lods.push_back(i + 1); } } else { - x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { + x1_lods = x1->lod()[0]; + } else { + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods.size() - 1; i++) { - for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { + for (size_t i = 0; i < x2_lods_size; i++) { + for (size_t j = i; j < i + 1; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; From cf5de26fe8bb30a71e2a77c083a3c818d6ce1f14 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 7 Dec 2021 15:55:31 +0800 Subject: [PATCH 103/124] ut support block (#37909) --- .../unittests/ir/inference/program_config.py | 125 +++++++++++++++++- 1 file changed, 120 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py index a58be906762cfc..a8c43daab731b6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -14,6 +14,7 @@ from typing import Optional, List, Callable, Dict, Any, Set import numpy as np +import enum import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -57,6 +58,12 @@ def __repr__(self): return str({'shape': self.shape, 'lod': self.lod, 'dtype': self.dtype}) +class VarType(enum.Enum): + LOD_TENSOR = 1 + LOD_TENSOR_ARRAY = 2 + STEP_SCOPES = 3 + + class OpConfig: ''' A config builder for generating a Op. ''' @@ -65,10 +72,14 @@ def __init__(self, inputs: Dict[str, List[str]], outputs: Dict[str, List[str]], attrs: Dict[str, Any]=None, + outputs_var_type: Dict[str, VarType]=None, + outputs_dtype: Dict[str, np.dtype]=None, **kwargs): self.type = type self.inputs = inputs self.outputs = outputs + self.outputs_dtype = outputs_dtype + self.outputs_var_type = outputs_var_type self.attrs = attrs if self.attrs is None: self.attrs = dict() @@ -80,6 +91,88 @@ def __repr__(self): return log_str +_OP_WITHOUT_KERNEL_SET = { + 'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad', + 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', + 'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify', + 'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id', + 'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream', + 'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv', + 'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl', + 'copy_cross_scope' +} + + +class BlockConfig: + ''' A config builder for generating a Block. 
''' + + def __init__(self, + ops: List[OpConfig], + vars: List[str], + vars_dtype: Dict[str, np.dtype]=None, + vars_var_type: Dict[str, VarType]=None, + vars_lod_level: Dict[str, int]=None): + self.ops = ops + self.vars = vars + self.vars_dtype = vars_dtype + self.vars_var_type = vars_var_type + self.vars_lod_level = vars_lod_level + + def fill_block_desc(self, block_desc): + for name in self.vars: + var_desc = block_desc.var(cpt.to_bytes(name)) + var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR) + if self.vars_lod_level is not None and name in self.vars_lod_level.keys( + ): + var_desc.set_lod_level(self.vars_lod_level[name]) + if self.vars_var_type is not None and name in self.vars_var_type.keys( + ): + if self.vars_var_type[name] == VarType.LOD_TENSOR_ARRAY: + var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR_ARRAY) + elif self.vars_var_type[name] == VarType.STEP_SCOPES: + var_desc.set_type(core.VarDesc.VarType.STEP_SCOPES) + continue + var_desc.set_dtype(convert_np_dtype_to_dtype_(np.float32)) + if self.vars_dtype is not None and name in self.vars_dtype.keys(): + var_desc.set_dtype( + convert_np_dtype_to_dtype_(self.vars_dtype[name])) + + for op_config in self.ops: + op_desc = block_desc.append_op() + op_desc.set_type(op_config.type) + for name, values in op_config.inputs.items(): + op_desc.set_input(name, values) + for name, values in op_config.attrs.items(): + op_desc._set_attr(name, values) + for name, values in op_config.outputs.items(): + op_desc.set_output(name, values) + for v in values: + if block_desc.has_var_recursive(cpt.to_bytes(v)): + continue + var_desc = block_desc.var(cpt.to_bytes(v)) + var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR) + if op_config.outputs_var_type is not None and v in op_config.outputs_var_type.keys( + ): + if op_config.outputs_var_type[ + v] == VarType.LOD_TENSOR_ARRAY: + var_desc.set_type( + core.VarDesc.VarType.LOD_TENSOR_ARRAY) + elif op_config.outputs_var_type[ + v] == VarType.STEP_SCOPES: + var_desc.set_type(core.VarDesc.VarType.STEP_SCOPES) + continue + var_desc.set_dtype(convert_np_dtype_to_dtype_(np.float32)) + if op_config.outputs_dtype is not None and v in op_config.outputs_dtype.keys( + ): + var_desc.set_dtype( + convert_np_dtype_to_dtype_(op_config.outputs_dtype[ + v])) + if op_config.type not in _OP_WITHOUT_KERNEL_SET: + op_desc.infer_var_type(block_desc) + op_desc.infer_shape(block_desc) + op_desc.check_attrs() + + class ProgramConfig: ''' A config builder for generating a Program. 
''' @@ -137,6 +230,8 @@ def create_fake_model(program_config): var_desc.set_dtype(convert_np_dtype_to_dtype_(tensor_config.dtype)) var_desc.set_shape(tensor_config.shape) var_desc.set_need_check_feed(True) + if tensor_config.lod is not None: + var_desc.set_lod_level(len(tensor_config.lod)) op_desc = main_block_desc._prepend_op() op_desc.set_type("feed") op_desc.set_input('X', ["feed"]) @@ -177,16 +272,36 @@ def create_fake_model(program_config): for name, values in op_config.inputs.items(): op_desc.set_input(name, values) for name, values in op_config.attrs.items(): - op_desc._set_attr(name, values) + if name == 'sub_block': + sub_block_desc = main_program_desc.append_block(main_block_desc) + values.fill_block_desc(sub_block_desc) + op_desc._set_attr(name, sub_block_desc) + else: + op_desc._set_attr(name, values) for name, values in op_config.outputs.items(): op_desc.set_output(name, values) for v in values: + if main_block_desc.has_var_recursive(cpt.to_bytes(v)): + continue var_desc = main_block_desc.var(cpt.to_bytes(v)) var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR) - var_desc.set_dtype( - convert_np_dtype_to_dtype_(tensor_config.dtype)) - op_desc.infer_var_type(main_block_desc) - op_desc.infer_shape(main_block_desc) + if op_config.outputs_var_type is not None and v in op_config.outputs_var_type.keys( + ): + if op_config.outputs_var_type[ + v] == VarType.LOD_TENSOR_ARRAY: + var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR_ARRAY) + elif op_config.outputs_var_type[v] == VarType.STEP_SCOPES: + var_desc.set_type(core.VarDesc.VarType.STEP_SCOPES) + continue + var_desc.set_dtype(convert_np_dtype_to_dtype_(np.float32)) + if op_config.outputs_dtype is not None and v in op_config.outputs_dtype.keys( + ): + var_desc.set_dtype( + convert_np_dtype_to_dtype_(op_config.outputs_dtype[v])) + if op_config.type not in _OP_WITHOUT_KERNEL_SET: + op_desc.infer_var_type(main_block_desc) + op_desc.infer_shape(main_block_desc) + op_desc.check_attrs() for index, name in enumerate(program_config.outputs): var_desc = main_block_desc.var(cpt.to_bytes("fetch")) From a754d907725f8e9478bde18f87283a3e250402f7 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 7 Dec 2021 16:06:07 +0800 Subject: [PATCH 104/124] Fix static git diff (#37914) * fix static git diff check * test=document_fix --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4849fa20e58c3b..00da59d05691ec 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -324,7 +324,7 @@ function check_style() { clang-format --version commit_files=on - for file_name in `git diff --numstat upstream/$BRANCH |awk '{print $NF}'`;do + for file_name in `git diff --numstat ${BRANCH} |awk '{print $NF}'`;do if ! 
pre-commit run --files $file_name ; then commit_files=off fi From 7e831b5a5144ed4e9ffa038b83b4b3b9762792ed Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 7 Dec 2021 16:28:55 +0800 Subject: [PATCH 105/124] add cmake depend for api_gen.py (#37900) --- paddle/pten/api/lib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index 33f20a4df04135..96ad9ade8e3ad5 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -32,7 +32,7 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file} COMMENT "copy_if_different ${api_header_file} ${api_source_file}" - DEPENDS ${api_yaml_file} + DEPENDS ${api_yaml_file} ${api_gen_file} VERBATIM) cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch) From 723cbe9ffa0a40ca2103c9023648f503469a0047 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Tue, 7 Dec 2021 16:42:02 +0800 Subject: [PATCH 106/124] update logsumexp doc (#37883) * update logsumexp doc * update api doc * update api doc --- python/paddle/tensor/math.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 36d61fa08546bf..df0116c4c29c29 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1078,7 +1078,8 @@ def logsumexp(x, axis=None, keepdim=False, name=None): logsumexp(x) = \\log\\sum exp(x) Args: - x (Tensor): The input Tensor with data type float32, float64. + x (Tensor): The input Tensor with data type float32 or float64, which + have no more than 4 dimensions. axis (int|list|tuple, optional): The axis along which to perform logsumexp calculations. ``axis`` should be int, list(int) or tuple(int). 
If ``axis`` is a list/tuple of dimension(s), logsumexp From cf5860215c19caaea09d298514085bc24aad0439 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 7 Dec 2021 17:25:03 +0800 Subject: [PATCH 107/124] Buf fix for reset grad inplace version (#37811) * Debug * Fixed issue with reset_grad_inplace_version when used with clear_gradient & cross-batch accumulation * Rearranged interfaces * Fixed ci issues --- paddle/fluid/framework/tensor.h | 1 + paddle/fluid/framework/variable.h | 7 +++ paddle/fluid/imperative/variable_wrapper.h | 22 +++++-- paddle/fluid/pybind/imperative.cc | 10 ++- .../meta_parallel/sharding/sharding_stage2.py | 4 +- .../test_reset_grad_inplace_version.py | 62 ++++++++++++++++++- 6 files changed, 93 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 494a02878f1a2c..4b1ae041fc4cad 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -81,6 +81,7 @@ class TensorInplaceVersion { bool IsUnique() const { return inplace_version_ == 0; } void Bump() { ++inplace_version_; } uint32_t CurrentVersion() const { return inplace_version_; } + void SetInplaceVersionToZero() { inplace_version_ = 0; } private: uint32_t inplace_version_; diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 792a2accd41d67..f8ad990a668ce6 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -75,6 +75,7 @@ class Variable { framework::TensorInplaceVersion* InplaceVersionCounter(); public: + void SetInplaceVersionToZero(); uint32_t CurrentInplaceVersion(); void BumpInplaceVersion(); @@ -134,6 +135,12 @@ inline framework::TensorInplaceVersion* Variable::InplaceVersionCounter() { return version_counter_ptr; } +inline void Variable::SetInplaceVersionToZero() { + auto inplace_version_counter = this->InplaceVersionCounter(); + if (inplace_version_counter) + inplace_version_counter->SetInplaceVersionToZero(); +} + inline uint32_t Variable::CurrentInplaceVersion() { auto version_counter_ptr = InplaceVersionCounter(); if (version_counter_ptr) { diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 9fbbe7d06f8ad8..c257191a546e43 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -209,13 +209,23 @@ class VariableWrapper { uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } - void ResetInplaceVersion() { - auto new_version = var_.CurrentInplaceVersion(); + void ResetInplaceVersion(bool set_to_zero = false) { + if (!set_to_zero) { + auto new_version = var_.CurrentInplaceVersion(); - VLOG(6) << "The wrapper version of VariableWrapper '" << name_ - << "' will be updated from " << inplace_version_snapshot_ << "to " - << new_version; - inplace_version_snapshot_ = new_version; + VLOG(6) << "The wrapper version of VariableWrapper '" << name_ + << "' will be updated from " << inplace_version_snapshot_ << "to " + << new_version; + inplace_version_snapshot_ = new_version; + + } else { + // Reset Snapshot & InplaceVersion to zero + inplace_version_snapshot_ = 0; + auto var = this->MutableVar(); + if (var) { + var->SetInplaceVersionToZero(); + } + } } bool hasCacheKey(const paddle::framework::OpKernelType& key) { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 2c850f0ca84d5f..dc97d98e8c47fc 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1538,7 
+1538,7 @@ void BindImperative(py::module *m_ptr) { self.MutableGradVarBase()->SetType(type); }) .def("_reset_grad_inplace_version", - [](imperative::VarBase &self) { + [](imperative::VarBase &self, bool set_to_zero) { /* *** This interfaceis a complete hack *** reset_grad_inplace_version removes all inplace related records to @@ -1550,15 +1550,20 @@ void BindImperative(py::module *m_ptr) { Make sure you fully understand what you're doing before make use of this interface, and prepare for the worst. */ + py::gil_scoped_release release; + if (self.HasGradVar()) { auto grad_var = self.GradVarBase(); auto var_wrapper = grad_var->SharedVar(); - if (var_wrapper) var_wrapper->ResetInplaceVersion(); + if (var_wrapper) { + var_wrapper->ResetInplaceVersion(set_to_zero); + } } }) .def("_grad_ivar", [](const imperative::VarBase &self) { auto &grad_var = self.GradVarBase(); + if (grad_var && grad_var->Var().IsInitialized()) { auto *tensor = grad_var->MutableVar()->IsType() @@ -1567,6 +1572,7 @@ void BindImperative(py::module *m_ptr) { : grad_var->MutableVar() ->GetMutable() ->mutable_value(); + if (tensor->IsInitialized()) { return grad_var; } diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 329dc9eaa4e575..37b85751149f71 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -177,7 +177,7 @@ def grad_scale(self): for param in self._trainable_params: if param.name in self._param_grads and param.grad is not None: param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version() + param._reset_grad_inplace_version(True) def _init_internal_storage(self, needs_fresh): """ @@ -283,7 +283,7 @@ def reduce(*_): self._grad_reduced[index] = False if not self._accumulate_grads: param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version() + param._reset_grad_inplace_version(True) # Clear the gradient that does not belong to the current rank through the callback function def cleanup(): diff --git a/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py b/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py index d9634f4997d80e..fee5bb8f47f260 100644 --- a/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py +++ b/python/paddle/fluid/tests/unittests/test_reset_grad_inplace_version.py @@ -20,12 +20,13 @@ paddle.set_device('cpu') -def clear_grad(w, a): +# Test 1 +def clear_grad_test_0(w, a): @paddle.no_grad() def warp(*_): assert w.grad is not None _C_ops.scale_(w.grad, 'scale', 0.5) - w._reset_grad_inplace_version() + w._reset_grad_inplace_version(True) return warp @@ -35,7 +36,7 @@ def test(self): input_data = np.ones([1, 1]) w = paddle.to_tensor(input_data, 'float32', stop_gradient=False) - _clear_grad = clear_grad(w, a="1") + _clear_grad = clear_grad_test_0(w, a="1") w._register_backward_hook(_clear_grad) for i in range(2): print(" Step: ", i) @@ -45,5 +46,60 @@ def test(self): assert w.grad[0] == 0.15 +# Test 2 +class Counter: + def __init__(self): + self.num_calls = 0 + self.step = 0 + + +def clear_grad_test_1(w, c): + @paddle.no_grad() + def warp(*_): + assert w.grad is not None + if c.step == 1: + w.grad.scale_(scale=0.5) + w._reset_grad_inplace_version(True) + + c.num_calls += 1 + + return warp + + +class TestInplaceClearGradAccumulation(unittest.TestCase): + def test(self): + 
input_data = np.ones([1, 1]) + w = paddle.to_tensor(input_data, 'float32', stop_gradient=False) + c = Counter() + + _clear_grad = clear_grad_test_1(w, c) + w._register_backward_hook(_clear_grad) + + for c.step in range(5): + out0 = _C_ops.scale(w, 'scale', 0.1) + out = _C_ops.matmul_v2(out0, w, 'trans_x', False, 'trans_y', False) + + out.backward() + + if c.step == 1: + w.clear_gradient(False) + + assert c.num_calls == 1 + c.num_calls = 0 + + +class TestInplaceClearGradAccumulationAlt(unittest.TestCase): + def test(self): + input_data = np.ones([1, 1]) + w = paddle.to_tensor(input_data, 'float32', stop_gradient=False) + out = _C_ops.scale(w, 'scale', 0.1) + out.backward() + + w.grad.scale_(scale=0.5) + w._reset_grad_inplace_version(False) + + assert w.grad._inplace_version() == 1 + + if __name__ == '__main__': unittest.main() From ca6ff1f6ab1e2d2f7adc23c46a0325e00464cb7d Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Tue, 7 Dec 2021 17:26:51 +0800 Subject: [PATCH 108/124] block MASM : warning A4018 when building cryptopp in windows with ninja (#37890) --- cmake/external/cryptopp.cmake | 4 ++-- patches/cryptopp/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 913fbfed316d8e..27a013c1763a72 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -22,9 +22,9 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) IF(WIN32) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE) - # There is a compilation parameter 'FI\"winapifamily.h\"' can't be used correctly + # There is a compilation parameter "/FI\"winapifamily.h\"" or "/FIwinapifamily.h" can't be used correctly # with Ninja on Windows. The only difference between the patch file and original - # file is that the compilation parameters are changed to 'FIwinapifamily.h'. This + # file is that the compilation parameters are changed to '/nologo'. This # patch command can be removed when upgrading to a higher version. if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(CRYPTOPP_PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "/") diff --git a/patches/cryptopp/CMakeLists.txt b/patches/cryptopp/CMakeLists.txt index c533b707350d69..d5918d6a1da770 100644 --- a/patches/cryptopp/CMakeLists.txt +++ b/patches/cryptopp/CMakeLists.txt @@ -447,7 +447,7 @@ if (MSVC) if (CMAKE_SYSTEM_VERSION MATCHES "10\\.0.*") list(APPEND CRYPTOPP_COMPILE_DEFINITIONS "_WIN32_WINNT=0x0A00") endif () - list(APPEND CRYPTOPP_COMPILE_OPTIONS "/FIwinapifamily.h") + list(APPEND CRYPTOPP_COMPILE_OPTIONS "/nologo") endif () # Enable PIC for all target machines except 32-bit i386 due to register pressures. 
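The _reset_grad_inplace_version change in PATCH 107 above is easiest to read through its intended usage: a backward hook modifies the accumulated gradient in place and then wipes the inplace records so later tensor version checks do not reject the modification. Below is a minimal sketch of that pattern, assuming the same private dygraph helpers the new tests rely on (_register_backward_hook, _C_ops.scale_ and _reset_grad_inplace_version are internal interfaces, not public API):

    import numpy as np
    import paddle
    from paddle import _C_ops

    paddle.set_device('cpu')


    def make_clear_grad_hook(w):
        # Runs after backward has finished accumulating into w.grad: rescale the
        # gradient in place, then reset the inplace version records
        # (set_to_zero=True) so the in-place edit is invisible to later checks.
        @paddle.no_grad()
        def hook(*_):
            assert w.grad is not None
            _C_ops.scale_(w.grad, 'scale', 0.5)
            w._reset_grad_inplace_version(True)

        return hook


    w = paddle.to_tensor(np.ones([1, 1]), 'float32', stop_gradient=False)
    w._register_backward_hook(make_clear_grad_hook(w))

    for _ in range(2):
        out = _C_ops.scale(w, 'scale', 0.1)
        out.backward()

Passing False instead only refreshes the wrapper's version snapshot and keeps the variable's own inplace counter (the last new test asserts w.grad._inplace_version() == 1 in that case), while True also zeroes the counter, which is the behaviour sharding_stage2.py switches to for the clear_gradient / cross-batch accumulation fix described in the commit message.
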
From 890bd6266c1ba638ded7487e189fcf658e0579a1 Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Tue, 7 Dec 2021 17:32:49 +0800 Subject: [PATCH 109/124] add maxunpool2d in __all__ (#37698) * add maxunpool2d in __all__ * fix MaxUnPool2D example --- python/paddle/nn/__init__.py | 3 ++- python/paddle/nn/layer/pooling.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 1abe74e9783dc4..3afd2b56569a49 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -295,5 +295,6 @@ def weight_norm(*args): 'ELU', 'ReLU6', 'LayerDict', - 'ZeroPad2D' + 'ZeroPad2D', + 'MaxUnPool2D', ] diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index b5d6d7834f9592..cc49db9b2056fc 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -1181,7 +1181,7 @@ class MaxUnPool2D(Layer): pool_out, indices = F.max_pool2d(data, kernel_size=2, stride=2, padding=0, return_mask=True) # pool_out shape: [1, 1, 3, 3], indices shape: [1, 1, 3, 3] Unpool2D = paddle.nn.MaxUnPool2D(kernel_size=2, padding=0) - unpool_out = UnPool2D(pool_out, indices) + unpool_out = Unpool2D(pool_out, indices) # unpool_out shape: [1, 1, 6, 6] """ From 70dea13868a1945a3e7c6dd892d7b880d4fd7cbb Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Tue, 7 Dec 2021 18:40:30 +0800 Subject: [PATCH 110/124] introduce INF-RT (#37669) * add infrt code refined with Paddle's code style. * rename CinnRtConfig to InfRtConfig * rename CinnRt to InfRt of some code * rename CINNRT to INFRT * remove unnecessary code * replace CINN to INFRT in the source code * replace all "cinn" in code to "infrt" * remove some const_cast --- CMakeLists.txt | 1 + cmake/external/llvm.cmake | 110 ++++ cmake/third_party.cmake | 5 + paddle/CMakeLists.txt | 1 + paddle/infrt/CMakeLists.txt | 79 +++ paddle/infrt/api/CMakeLists.txt | 8 + paddle/infrt/api/infrt_api.cc | 246 ++++++++ paddle/infrt/api/infrt_api.h | 63 ++ paddle/infrt/api/infrt_api_test.cc | 79 +++ paddle/infrt/common/CMakeLists.txt | 14 + paddle/infrt/common/buffer.cc | 98 +++ paddle/infrt/common/buffer.h | 296 ++++++++++ paddle/infrt/common/common.h | 61 ++ paddle/infrt/common/dtype.cc | 50 ++ paddle/infrt/common/dtype.def | 18 + paddle/infrt/common/dtype.h | 85 +++ paddle/infrt/common/global.cc | 30 + paddle/infrt/common/global.h | 32 + paddle/infrt/common/macros.h | 52 ++ paddle/infrt/common/memory.cc | 42 ++ paddle/infrt/common/memory.h | 76 +++ paddle/infrt/common/object.cc | 19 + paddle/infrt/common/object.h | 81 +++ paddle/infrt/common/shared.cc | 15 + paddle/infrt/common/shared.h | 153 +++++ paddle/infrt/common/string.cc | 128 ++++ paddle/infrt/common/string.h | 84 +++ paddle/infrt/common/target.cc | 118 ++++ paddle/infrt/common/target.h | 112 ++++ paddle/infrt/common/type.cc | 358 +++++++++++ paddle/infrt/common/type.h | 223 +++++++ paddle/infrt/dialect/CMakeLists.txt | 61 ++ paddle/infrt/dialect/basic_kernels.cc | 164 +++++ paddle/infrt/dialect/basic_kernels.h | 24 + paddle/infrt/dialect/basic_kernels.td | 139 +++++ paddle/infrt/dialect/dense_tensor.cc | 277 +++++++++ paddle/infrt/dialect/dense_tensor.h | 79 +++ paddle/infrt/dialect/dense_tensor.td | 150 +++++ paddle/infrt/dialect/diagnostic_utils.cc | 52 ++ paddle/infrt/dialect/diagnostic_utils.h | 39 ++ paddle/infrt/dialect/dialect.cc | 36 ++ paddle/infrt/dialect/infrt_base.cc | 127 ++++ paddle/infrt/dialect/infrt_base.h | 73 +++ paddle/infrt/dialect/infrt_base.td | 
42 ++ paddle/infrt/dialect/init_infrt_dialects.cc | 34 ++ paddle/infrt/dialect/init_infrt_dialects.h | 23 + paddle/infrt/dialect/mlir_loader.cc | 72 +++ paddle/infrt/dialect/mlir_loader.h | 30 + paddle/infrt/dialect/mlir_loader_test.cc | 57 ++ paddle/infrt/dialect/mlir_tests/basic.mlir | 40 ++ .../infrt/dialect/mlir_tests/benchmark.mlir | 23 + .../dialect/mlir_tests/dense_tensor.mlir | 22 + .../infrt/dialect/mlir_tests/paddle_ops.mlir | 8 + paddle/infrt/dialect/mlir_tests/rewrite.mlir | 24 + .../dialect/mlir_tests/rewrite_conv_bn.mlir | 15 + .../infrt/dialect/mlir_tests/tensor_map.mlir | 31 + .../dialect/mlir_tests/tensor_shape.mlir | 5 + .../infrt/dialect/mlir_tests/tensor_type.mlir | 9 + paddle/infrt/dialect/ops.td | 6 + paddle/infrt/dialect/opt.cc | 45 ++ paddle/infrt/dialect/pd_op_base.td | 77 +++ paddle/infrt/dialect/pd_ops.cc | 177 ++++++ paddle/infrt/dialect/pd_ops.h | 57 ++ paddle/infrt/dialect/pd_ops.td | 182 ++++++ paddle/infrt/dialect/pd_types.cc | 15 + paddle/infrt/dialect/pd_types.h | 57 ++ paddle/infrt/dialect/print_ir.cc | 134 +++++ paddle/infrt/dialect/rewrite.td | 90 +++ paddle/infrt/dialect/tensor_shape.cc | 68 +++ paddle/infrt/dialect/tensor_shape.h | 40 ++ paddle/infrt/dialect/tensor_shape.td | 49 ++ paddle/infrt/dialect/tensor_shape_base.td | 36 ++ paddle/infrt/dialect/test_kernels.cc | 163 +++++ paddle/infrt/dialect/test_kernels.h | 23 + paddle/infrt/dialect/test_kernels.td | 65 ++ paddle/infrt/dialect/types.cc | 17 + paddle/infrt/dialect/types.h | 16 + paddle/infrt/external_kernels/CMakeLists.txt | 13 + paddle/infrt/external_kernels/basic.mlir | 21 + .../infrt/external_kernels/basic_kernels.cc | 59 ++ paddle/infrt/external_kernels/fc.mlir | 43 ++ paddle/infrt/external_kernels/paddle.mlir | 50 ++ paddle/infrt/gtest_main.cc | 23 + paddle/infrt/host_context/CMakeLists.txt | 29 + paddle/infrt/host_context/core_runtime.cc | 93 +++ paddle/infrt/host_context/core_runtime.h | 86 +++ .../infrt/host_context/core_runtime_test.cc | 96 +++ paddle/infrt/host_context/function.cc | 19 + paddle/infrt/host_context/function.h | 62 ++ paddle/infrt/host_context/kernel_frame.cc | 29 + paddle/infrt/host_context/kernel_frame.h | 166 ++++++ paddle/infrt/host_context/kernel_registry.cc | 70 +++ paddle/infrt/host_context/kernel_registry.h | 67 +++ .../host_context/kernel_registry_test.cc | 47 ++ paddle/infrt/host_context/kernel_utils.cc | 19 + paddle/infrt/host_context/kernel_utils.h | 352 +++++++++++ .../infrt/host_context/kernel_utils_test.cc | 69 +++ paddle/infrt/host_context/mlir_exec.cc | 80 +++ .../host_context/mlir_function_executable.cc | 135 +++++ .../host_context/mlir_function_executable.h | 78 +++ .../host_context/mlir_program_executor.cc | 19 + .../host_context/mlir_program_executor.h | 79 +++ .../infrt/host_context/mlir_tests/basic.mlir | 30 + .../host_context/mlir_tests/dense_tensor.mlir | 9 + .../infrt/host_context/mlir_tests/shape.mlir | 7 + .../host_context/mlir_to_runtime_translate.cc | 558 ++++++++++++++++++ .../host_context/mlir_to_runtime_translate.h | 107 ++++ .../mlir_to_runtime_translate_test.cc | 160 +++++ paddle/infrt/host_context/op_executable.cc | 151 +++++ paddle/infrt/host_context/op_executable.h | 92 +++ .../infrt/host_context/op_executable_test.cc | 56 ++ paddle/infrt/host_context/symbol_table.cc | 82 +++ paddle/infrt/host_context/symbol_table.h | 65 ++ paddle/infrt/host_context/value.cc | 69 +++ paddle/infrt/host_context/value.h | 156 +++++ paddle/infrt/host_context/value_test.cc | 34 ++ paddle/infrt/kernel/CMakeLists.txt | 9 + 
paddle/infrt/kernel/basic_kernels.cc | 85 +++ paddle/infrt/kernel/basic_kernels.h | 34 ++ paddle/infrt/kernel/control_flow_kernels.cc | 44 ++ paddle/infrt/kernel/control_flow_kernels.h | 31 + paddle/infrt/kernel/tensor_kernels.cc | 79 +++ paddle/infrt/kernel/tensor_kernels.h | 25 + paddle/infrt/kernel/tensor_shape_kernels.cc | 38 ++ paddle/infrt/kernel/tensor_shape_kernels.h | 27 + paddle/infrt/kernel/test_kernels.cc | 200 +++++++ paddle/infrt/kernel/test_kernels.h | 31 + paddle/infrt/paddle/CMakeLists.txt | 24 + paddle/infrt/paddle/cpp/CMakeLists.txt | 16 + paddle/infrt/paddle/cpp/desc_api.h | 229 +++++++ paddle/infrt/paddle/framework.proto | 213 +++++++ paddle/infrt/paddle/model_parser.cc | 172 ++++++ paddle/infrt/paddle/model_parser.h | 55 ++ paddle/infrt/paddle/pb/CMakeLists.txt | 20 + paddle/infrt/paddle/pb/block_desc.cc | 43 ++ paddle/infrt/paddle/pb/block_desc.h | 77 +++ paddle/infrt/paddle/pb/op_desc.cc | 139 +++++ paddle/infrt/paddle/pb/op_desc.h | 198 +++++++ paddle/infrt/paddle/pb/program_desc.cc | 35 ++ paddle/infrt/paddle/pb/program_desc.h | 61 ++ paddle/infrt/paddle/pb/var_desc.cc | 367 ++++++++++++ paddle/infrt/paddle/pb/var_desc.h | 124 ++++ paddle/infrt/paddle/scope.cc | 44 ++ paddle/infrt/paddle/scope.h | 68 +++ paddle/infrt/paddle/tensor.cc | 19 + paddle/infrt/paddle/tensor.h | 107 ++++ paddle/infrt/support/CMakeLists.txt | 1 + paddle/infrt/support/type_traits.h | 147 +++++ paddle/infrt/support/variant.h | 219 +++++++ paddle/infrt/tensor/CMakeLists.txt | 20 + paddle/infrt/tensor/dense_host_tensor.cc | 86 +++ paddle/infrt/tensor/dense_host_tensor.h | 92 +++ paddle/infrt/tensor/dense_tensor_view.cc | 17 + paddle/infrt/tensor/dense_tensor_view.h | 64 ++ paddle/infrt/tensor/tensor_map.cc | 95 +++ paddle/infrt/tensor/tensor_map.h | 29 + paddle/infrt/tensor/tensor_metadata.cc | 30 + paddle/infrt/tensor/tensor_metadata.h | 58 ++ paddle/infrt/tensor/tensor_shape.cc | 96 +++ paddle/infrt/tensor/tensor_shape.h | 82 +++ paddle/scripts/paddle_build.sh | 2 + 161 files changed, 12742 insertions(+) create mode 100644 cmake/external/llvm.cmake create mode 100644 paddle/infrt/CMakeLists.txt create mode 100644 paddle/infrt/api/CMakeLists.txt create mode 100644 paddle/infrt/api/infrt_api.cc create mode 100644 paddle/infrt/api/infrt_api.h create mode 100644 paddle/infrt/api/infrt_api_test.cc create mode 100644 paddle/infrt/common/CMakeLists.txt create mode 100644 paddle/infrt/common/buffer.cc create mode 100644 paddle/infrt/common/buffer.h create mode 100644 paddle/infrt/common/common.h create mode 100644 paddle/infrt/common/dtype.cc create mode 100644 paddle/infrt/common/dtype.def create mode 100644 paddle/infrt/common/dtype.h create mode 100644 paddle/infrt/common/global.cc create mode 100644 paddle/infrt/common/global.h create mode 100644 paddle/infrt/common/macros.h create mode 100644 paddle/infrt/common/memory.cc create mode 100644 paddle/infrt/common/memory.h create mode 100644 paddle/infrt/common/object.cc create mode 100644 paddle/infrt/common/object.h create mode 100644 paddle/infrt/common/shared.cc create mode 100644 paddle/infrt/common/shared.h create mode 100644 paddle/infrt/common/string.cc create mode 100644 paddle/infrt/common/string.h create mode 100644 paddle/infrt/common/target.cc create mode 100644 paddle/infrt/common/target.h create mode 100644 paddle/infrt/common/type.cc create mode 100644 paddle/infrt/common/type.h create mode 100644 paddle/infrt/dialect/CMakeLists.txt create mode 100644 paddle/infrt/dialect/basic_kernels.cc create mode 100644 
paddle/infrt/dialect/basic_kernels.h create mode 100644 paddle/infrt/dialect/basic_kernels.td create mode 100644 paddle/infrt/dialect/dense_tensor.cc create mode 100644 paddle/infrt/dialect/dense_tensor.h create mode 100644 paddle/infrt/dialect/dense_tensor.td create mode 100644 paddle/infrt/dialect/diagnostic_utils.cc create mode 100644 paddle/infrt/dialect/diagnostic_utils.h create mode 100644 paddle/infrt/dialect/dialect.cc create mode 100644 paddle/infrt/dialect/infrt_base.cc create mode 100644 paddle/infrt/dialect/infrt_base.h create mode 100644 paddle/infrt/dialect/infrt_base.td create mode 100644 paddle/infrt/dialect/init_infrt_dialects.cc create mode 100644 paddle/infrt/dialect/init_infrt_dialects.h create mode 100644 paddle/infrt/dialect/mlir_loader.cc create mode 100644 paddle/infrt/dialect/mlir_loader.h create mode 100644 paddle/infrt/dialect/mlir_loader_test.cc create mode 100644 paddle/infrt/dialect/mlir_tests/basic.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/benchmark.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/dense_tensor.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/paddle_ops.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/rewrite.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/tensor_map.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/tensor_shape.mlir create mode 100644 paddle/infrt/dialect/mlir_tests/tensor_type.mlir create mode 100644 paddle/infrt/dialect/ops.td create mode 100644 paddle/infrt/dialect/opt.cc create mode 100644 paddle/infrt/dialect/pd_op_base.td create mode 100644 paddle/infrt/dialect/pd_ops.cc create mode 100644 paddle/infrt/dialect/pd_ops.h create mode 100644 paddle/infrt/dialect/pd_ops.td create mode 100644 paddle/infrt/dialect/pd_types.cc create mode 100644 paddle/infrt/dialect/pd_types.h create mode 100644 paddle/infrt/dialect/print_ir.cc create mode 100644 paddle/infrt/dialect/rewrite.td create mode 100644 paddle/infrt/dialect/tensor_shape.cc create mode 100644 paddle/infrt/dialect/tensor_shape.h create mode 100644 paddle/infrt/dialect/tensor_shape.td create mode 100644 paddle/infrt/dialect/tensor_shape_base.td create mode 100644 paddle/infrt/dialect/test_kernels.cc create mode 100644 paddle/infrt/dialect/test_kernels.h create mode 100644 paddle/infrt/dialect/test_kernels.td create mode 100644 paddle/infrt/dialect/types.cc create mode 100644 paddle/infrt/dialect/types.h create mode 100644 paddle/infrt/external_kernels/CMakeLists.txt create mode 100644 paddle/infrt/external_kernels/basic.mlir create mode 100644 paddle/infrt/external_kernels/basic_kernels.cc create mode 100644 paddle/infrt/external_kernels/fc.mlir create mode 100644 paddle/infrt/external_kernels/paddle.mlir create mode 100644 paddle/infrt/gtest_main.cc create mode 100644 paddle/infrt/host_context/CMakeLists.txt create mode 100644 paddle/infrt/host_context/core_runtime.cc create mode 100644 paddle/infrt/host_context/core_runtime.h create mode 100644 paddle/infrt/host_context/core_runtime_test.cc create mode 100644 paddle/infrt/host_context/function.cc create mode 100644 paddle/infrt/host_context/function.h create mode 100644 paddle/infrt/host_context/kernel_frame.cc create mode 100644 paddle/infrt/host_context/kernel_frame.h create mode 100644 paddle/infrt/host_context/kernel_registry.cc create mode 100644 paddle/infrt/host_context/kernel_registry.h create mode 100644 paddle/infrt/host_context/kernel_registry_test.cc create mode 100644 
paddle/infrt/host_context/kernel_utils.cc create mode 100644 paddle/infrt/host_context/kernel_utils.h create mode 100644 paddle/infrt/host_context/kernel_utils_test.cc create mode 100644 paddle/infrt/host_context/mlir_exec.cc create mode 100644 paddle/infrt/host_context/mlir_function_executable.cc create mode 100644 paddle/infrt/host_context/mlir_function_executable.h create mode 100644 paddle/infrt/host_context/mlir_program_executor.cc create mode 100644 paddle/infrt/host_context/mlir_program_executor.h create mode 100644 paddle/infrt/host_context/mlir_tests/basic.mlir create mode 100644 paddle/infrt/host_context/mlir_tests/dense_tensor.mlir create mode 100644 paddle/infrt/host_context/mlir_tests/shape.mlir create mode 100644 paddle/infrt/host_context/mlir_to_runtime_translate.cc create mode 100644 paddle/infrt/host_context/mlir_to_runtime_translate.h create mode 100644 paddle/infrt/host_context/mlir_to_runtime_translate_test.cc create mode 100644 paddle/infrt/host_context/op_executable.cc create mode 100644 paddle/infrt/host_context/op_executable.h create mode 100644 paddle/infrt/host_context/op_executable_test.cc create mode 100644 paddle/infrt/host_context/symbol_table.cc create mode 100644 paddle/infrt/host_context/symbol_table.h create mode 100644 paddle/infrt/host_context/value.cc create mode 100644 paddle/infrt/host_context/value.h create mode 100644 paddle/infrt/host_context/value_test.cc create mode 100644 paddle/infrt/kernel/CMakeLists.txt create mode 100644 paddle/infrt/kernel/basic_kernels.cc create mode 100644 paddle/infrt/kernel/basic_kernels.h create mode 100644 paddle/infrt/kernel/control_flow_kernels.cc create mode 100644 paddle/infrt/kernel/control_flow_kernels.h create mode 100644 paddle/infrt/kernel/tensor_kernels.cc create mode 100644 paddle/infrt/kernel/tensor_kernels.h create mode 100644 paddle/infrt/kernel/tensor_shape_kernels.cc create mode 100644 paddle/infrt/kernel/tensor_shape_kernels.h create mode 100644 paddle/infrt/kernel/test_kernels.cc create mode 100644 paddle/infrt/kernel/test_kernels.h create mode 100644 paddle/infrt/paddle/CMakeLists.txt create mode 100644 paddle/infrt/paddle/cpp/CMakeLists.txt create mode 100644 paddle/infrt/paddle/cpp/desc_api.h create mode 100644 paddle/infrt/paddle/framework.proto create mode 100644 paddle/infrt/paddle/model_parser.cc create mode 100644 paddle/infrt/paddle/model_parser.h create mode 100644 paddle/infrt/paddle/pb/CMakeLists.txt create mode 100644 paddle/infrt/paddle/pb/block_desc.cc create mode 100644 paddle/infrt/paddle/pb/block_desc.h create mode 100644 paddle/infrt/paddle/pb/op_desc.cc create mode 100644 paddle/infrt/paddle/pb/op_desc.h create mode 100644 paddle/infrt/paddle/pb/program_desc.cc create mode 100644 paddle/infrt/paddle/pb/program_desc.h create mode 100644 paddle/infrt/paddle/pb/var_desc.cc create mode 100644 paddle/infrt/paddle/pb/var_desc.h create mode 100644 paddle/infrt/paddle/scope.cc create mode 100644 paddle/infrt/paddle/scope.h create mode 100644 paddle/infrt/paddle/tensor.cc create mode 100644 paddle/infrt/paddle/tensor.h create mode 100644 paddle/infrt/support/CMakeLists.txt create mode 100644 paddle/infrt/support/type_traits.h create mode 100644 paddle/infrt/support/variant.h create mode 100644 paddle/infrt/tensor/CMakeLists.txt create mode 100644 paddle/infrt/tensor/dense_host_tensor.cc create mode 100644 paddle/infrt/tensor/dense_host_tensor.h create mode 100644 paddle/infrt/tensor/dense_tensor_view.cc create mode 100644 paddle/infrt/tensor/dense_tensor_view.h create mode 100644 
paddle/infrt/tensor/tensor_map.cc create mode 100644 paddle/infrt/tensor/tensor_map.h create mode 100644 paddle/infrt/tensor/tensor_metadata.cc create mode 100644 paddle/infrt/tensor/tensor_metadata.h create mode 100644 paddle/infrt/tensor/tensor_shape.cc create mode 100644 paddle/infrt/tensor/tensor_shape.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 55f1e4cd224b32..03f8522ad54465 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -216,6 +216,7 @@ option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE} option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) +option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake new file mode 100644 index 00000000000000..8fd4a0741eaba3 --- /dev/null +++ b/cmake/external/llvm.cmake @@ -0,0 +1,110 @@ +include(FetchContent) + +set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz) +set(LLVM_MD5 39d32b6be466781dddf5869318dcba53) + +set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) +set(FETCHCONTENT_QUIET OFF) +FetchContent_Declare(external_llvm + URL ${LLVM_DOWNLOAD_URL} + URL_MD5 ${LLVM_MD5} + PREFIX ${THIRD_PARTY_PATH}/llvm + SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm +) +if (NOT LLVM_PATH) + FetchContent_GetProperties(external_llvm) + if (NOT external_llvm_POPULATED) + FetchContent_Populate(external_llvm) + endif() + set(LLVM_PATH ${THIRD_PARTY_PATH}/install/llvm) + set(LLVM_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm) + set(MLIR_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir) +else () + set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm) + set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir) +endif() + +if (${CMAKE_CXX_COMPILER} STREQUAL "clang++") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi") +endif() + +message(STATUS "set LLVM_DIR: ${LLVM_DIR}") +message(STATUS "set MLIR_DIR: ${MLIR_DIR}") +find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR}) +find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR}) +find_package(ZLIB REQUIRED) + +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +include(AddLLVM) + +include_directories(${LLVM_INCLUDE_DIRS}) +list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") +list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") +include(AddLLVM) +include(TableGen) +include(AddMLIR) + +message(STATUS "Found MLIR: ${MLIR_DIR}") +message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") + +# To build with MLIR, the LLVM is build from source code using the following flags: + +#[==[ +cmake -G Ninja ../llvm \ + -DLLVM_ENABLE_PROJECTS="mlir;clang" \ + -DLLVM_BUILD_EXAMPLES=OFF \ + -DLLVM_TARGETS_TO_BUILD="X86" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_ENABLE_ZLIB=OFF \ + -DLLVM_ENABLE_RTTI=ON \ +#]==] +# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) + +add_definitions(${LLVM_DEFINITIONS}) + +llvm_map_components_to_libnames(llvm_libs Support Core irreader + X86 executionengine orcjit mcjit all codegen) + +message(STATUS "LLVM libs: ${llvm_libs}") + +get_property(mlir_libs 
GLOBAL PROPERTY MLIR_ALL_LIBS) +message(STATUS "MLIR libs: ${mlir_libs}") +add_definitions(${LLVM_DEFINITIONS}) + + +# The minimum needed libraries for MLIR IR parse and transform. +set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) + + +# tb_base is the name of a xxx.td file (without the .td suffix) +function(mlir_tablegen_on td_base) + set(options) + set(oneValueArgs DIALECT) + cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-op-decls) + mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) + if (mlir_tablegen_on_DIALECT) + mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) + endif() + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() + +function(mlir_add_rewriter td_base) + set(LLVM_TARGET_DEFINITIONS ${td_base}.td) + mlir_tablegen(${td_base}.hpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + add_public_tablegen_target(${td_base}_IncGen) + add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) +endfunction() + +# Execute the mlir script with infrt-exec program. +# @name: name of the test +# @script: path to the mlir script file +function (infrt_exec_check name script) + add_test(NAME ${name} + COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck ${CMAKE_CURRENT_SOURCE_DIR}/${script}") +endfunction() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 7aa1e78abb9a3c..71e1856147449f 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -391,6 +391,11 @@ if (WIN32) list(APPEND third_party_deps extern_dirent) endif (WIN32) +if (WITH_INFRT) + include(external/llvm) + list(APPEND third_party_deps external_llvm) +endif() + if (WITH_IPU) include(external/poplar) list(APPEND third_party_deps extern_poplar) diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index b3a1b2e8c95873..4b88689b9b6dfa 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -2,4 +2,5 @@ add_subdirectory(scripts) add_subdirectory(testing) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") add_subdirectory(pten) +add_subdirectory(infrt) add_subdirectory(fluid) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt new file mode 100644 index 00000000000000..b8f6f4738d3e75 --- /dev/null +++ b/paddle/infrt/CMakeLists.txt @@ -0,0 +1,79 @@ +if (NOT WITH_INFRT) + return() +endif() + +set(infrt_src CACHE INTERNAL "" FORCE) + +# Gather headers for library publish. +function(core_gather_headers) + file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + + foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") + endforeach() +endfunction() + +function(gather_srcs SRC_GROUP) + set(options) + set(oneValueArgs) + set(multiValueArgs "SRCS") + cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN}) + foreach(cpp ${prefix_SRCS}) + set(${SRC_GROUP} "${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}" CACHE INTERNAL "") + endforeach() +endfunction() + +# This method is similar to the global cc_test, but discard the huge amount default dependencies those are +# not needed by INFRT. 
+function(cc_test_tiny TARGET_NAME) + if(WITH_TESTING) + set(options SERIAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(cc_test_tiny "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_tiny_SRCS}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${cc_test_tiny_DEPS} ${os_dependency_modules} infrt_gtest_main gtest ) + add_dependencies(${TARGET_NAME} ${cc_test_tiny_DEPS} infrt_gtest_main gtest extern_gtest) + + add_test(NAME ${TARGET_NAME} + COMMAND ${TARGET_NAME} "${cc_test_tiny_ARGS}" + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if (${cc_test_tiny_SERIAL}) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + endif() + endif() + +endfunction() + +if (WITH_TESTING) + cc_library(infrt_gtest_main SRCS gtest_main.cc DEPS gtest glog gflags) +endif() + + +add_subdirectory(api) +add_subdirectory(common) +add_subdirectory(dialect) +add_subdirectory(host_context) +add_subdirectory(kernel) +add_subdirectory(tensor) +add_subdirectory(support) +add_subdirectory(external_kernels) +add_subdirectory(paddle) + + +# MLIR td file generations +set(infrt_mlir_incs + ops_inc + basic_kernels_inc + test_kernels_inc + infrt_base_inc + tensor_shape_inc + dense_tensor_inc + pd_ops_inc + rewrite_inc + ) +message(STATUS "infrt srcs:\n${infrt_src}") + +cc_library(infrt SRCS ${infrt_src} DEPS glog ${mlir_libs} paddle_framework_proto) +add_dependencies(infrt ${infrt_mlir_incs}) diff --git a/paddle/infrt/api/CMakeLists.txt b/paddle/infrt/api/CMakeLists.txt new file mode 100644 index 00000000000000..93a7ae8369521c --- /dev/null +++ b/paddle/infrt/api/CMakeLists.txt @@ -0,0 +1,8 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_api.cc + ) + +# Disable temporarily for the external-kernel's mkldnn is outdate +# cc_test(test_infrt_api SRCS infrt_api_test.cc DEPS infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc new file mode 100644 index 00000000000000..c2a4e0aff7a08e --- /dev/null +++ b/paddle/infrt/api/infrt_api.cc @@ -0,0 +1,246 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/api/infrt_api.h" + +#include +#include +#include +#include + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/value.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" +#include "paddle/infrt/tensor/tensor_map.h" + +using namespace infrt::host_context; // NOLINT +using namespace infrt::tensor; // NOLINT +using namespace infrt::tensor; // NOLINT +using infrt::dt::TensorMapType; // NOLINT +using infrt::dt::TensorType; // NOLINT + +namespace infrt { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +struct MlirToRuntimeTranslator::Impl { + mlir::ModuleOp module; + // The runtime for a function call. + CoreRuntimeBuilder* runtime{}; + + // The current working op, the translator process the ops one by one, each + // time it updates `cur_op` here to current op + // working on. + OpExecutableBuilder* cur_op{}; + + // record the current function name. + std::string cur_func_name; + + // Name to function definitions. + std::unordered_map func_defs; + + // Map from an operation to its results. + std::unordered_map> op_results; + llvm::DenseMap value_map; +}; + +/** + * Execute the mlir program in predict mode. + */ +class PredictExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder core_runtime; + + PredictExecutor(mlir::ModuleOp module, + KernelRegistry* registry, + TensorMap* map) + : MlirToRuntimeTranslator(module, &core_runtime), + core_runtime(registry), + registry_(registry) { + CHECK(registry_); + Init(map); + } + + void Run() { + auto arguments = llvm::makeArrayRef(arguments_); + auto results = llvm::makeMutableArrayRef(results_.begin(), results_.size()); + function_executable_->Execute(arguments, results); + } + + int GetInputNum() { return inputs_.size(); } + + DenseHostTensor* GetInput(int i) { return inputs_[i]; } + + int GetOutputNum() { return outputs_.size(); } + + DenseHostTensor* GetOutput(int i) { return outputs_[i]; } + + private: + void Init(TensorMap* map) { + EmitFunctions(); + llvm::Optional predict_func_ = llvm::None; + for (auto func_op : impl_->module.getOps()) { + if (func_op.getName().str() != "predict") continue; + predict_func_ = func_op; + break; + } + if (!predict_func_) { + std::cout << "ERROR: init failed, no predict function found in mlir." 
+ << std::endl; + return; + } + auto& predict_func = predict_func_.getValue(); + function_executable_ = + new MlirFunctionExecutable(predict_func, registry_, impl_->func_defs); + + // process parammeters + for (size_t i = 0; i < predict_func.getNumArguments(); ++i) { + auto arg = predict_func.getArgument(i); + auto type = arg.getType(); + // this param is TensorMap + if (type.isa()) { + auto* value = new host_context::Value(std::move(*map)); + arguments_.push_back(value); + AddValue(predict_func.getArgument(i), value); + } else { + // this param is an input Tensor + auto dht = DenseHostTensor(); + auto* value = new host_context::Value(std::move(dht)); + arguments_.push_back(value); + inputs_.push_back(&(value->get())); + } + } + + // process results + auto& last_op = predict_func.front().back(); + if (last_op.getName().getStringRef() == "infrt.return") { + for (size_t i = 0; i < last_op.getNumOperands(); ++i) { + auto* value = AddValue(mlir::Value(last_op.getOperand(i))); + results_.push_back(ValueRef(value)); + outputs_.push_back(&(value->get())); + } + } + } + + protected: + std::unordered_map func_def_table; + + void EmitFunction(mlir::FuncOp op) override { + CHECK(!impl_->func_defs.count(op.getName().str())) + << "Duplicate function defition found for function [" + << op.getName().str(); + impl_->func_defs.emplace(op.getName().str(), op); + } + + private: + KernelRegistry* registry_{}; + MlirFunctionExecutable* function_executable_; + llvm::SmallVector inputs_; + llvm::SmallVector arguments_; + llvm::SmallVector outputs_; + llvm::SmallVector results_; +}; + +std::shared_ptr CreateInfRtPredictor( + const InfRtConfig& config) { + auto x = std::make_shared(); + x->Init(config); + return x; +} + +struct InfRtPredictor::Impl { + mlir::OwningModuleRef module_ref; + std::unique_ptr executor; +}; + +InfRtPredictor::InfRtPredictor() : impl_(new Impl) {} +InfRtPredictor::~InfRtPredictor() {} + +void InfRtPredictor::Run() { impl_->executor->Run(); } + +int InfRtPredictor::Init(const InfRtConfig& config) { + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module_ref = dialect::LoadMlirFile(config.mlir_path(), context); + + KernelRegistry* registry = new KernelRegistry(); + + kernel::RegisterBasicKernels(registry); + kernel::RegisterTestKernels(registry); + kernel::RegisterTensorShapeKernels(registry); + kernel::RegisterTensorKernels(registry); + kernel::RegisterControlFlowKernels(registry); + + impl_->module_ref = std::move(module_ref); + + // load extra shared library + for (const std::string& lib_path : config.shared_libs()) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + return 1; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = reinterpret_cast(reg_sym); + reg_func(registry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". 
Skip.\n"; + } + } + + // Load params + TensorMap* tensor_map = LoadParams(config.model_dir()); + + // Create PredictExecutor + impl_->executor.reset( + new PredictExecutor(impl_->module_ref.get(), registry, tensor_map)); + return 0; +} + +int InfRtPredictor::GetInputNum() { return impl_->executor->GetInputNum(); } + +DenseHostTensor* InfRtPredictor::GetInput(int i) { + return impl_->executor->GetInput(i); +} + +int InfRtPredictor::GetOutputNum() { return impl_->executor->GetOutputNum(); } + +DenseHostTensor* InfRtPredictor::GetOutput(int i) { + return impl_->executor->GetOutput(i); +} + +} // namespace infrt diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h new file mode 100644 index 00000000000000..82b6cb8df91ff7 --- /dev/null +++ b/paddle/infrt/api/infrt_api.h @@ -0,0 +1,63 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/infrt/tensor/dense_host_tensor.h" + +namespace infrt { + +class InfRtConfig { + std::string model_dir_; + std::string mlir_path_; + std::vector shared_libs_; + + public: + InfRtConfig() = default; + void set_model_dir(const std::string& model_dir) { model_dir_ = model_dir; } + const std::string& model_dir() const { return model_dir_; } + + void set_mlir_path(const std::string& mlir_path) { mlir_path_ = mlir_path; } + const std::string& mlir_path() const { return mlir_path_; } + + void set_shared_libs(const std::vector& shared_libs) { + shared_libs_ = shared_libs; + } + const std::vector& shared_libs() const { return shared_libs_; } + + virtual ~InfRtConfig() = default; +}; + +class InfRtPredictor { + public: + InfRtPredictor(); + ~InfRtPredictor(); + void Run(); + int Init(const InfRtConfig& config); + int GetInputNum(); + tensor::DenseHostTensor* GetInput(int i); + int GetOutputNum(); + tensor::DenseHostTensor* GetOutput(int i); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +std::shared_ptr CreateInfRtPredictor(const InfRtConfig& config); + +} // namespace infrt diff --git a/paddle/infrt/api/infrt_api_test.cc b/paddle/infrt/api/infrt_api_test.cc new file mode 100644 index 00000000000000..92e069f47521b7 --- /dev/null +++ b/paddle/infrt/api/infrt_api_test.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/api/infrt_api.h" + +#include + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "paddle/infrt/common/buffer.h" +#include "paddle/infrt/common/dtype.h" + +using infrt::InfRtConfig; +using infrt::InfRtPredictor; +using infrt::CreateInfRtPredictor; + +namespace infrt { + +TEST(InfRtPredictor, predictor) { + std::vector shared_libs; + shared_libs.push_back("../../paddle/libexternal_kernels.so"); + + InfRtConfig config; + + // set external shared libraries that contain kernels. + config.set_shared_libs(shared_libs); + // set model dir + config.set_model_dir("../../paddle/paddle_1.8_fc_model"); + // set mlir path + config.set_mlir_path("../../../infrt/dialect/mlir_tests/tensor_map.mlir"); + + std::shared_ptr predictor = CreateInfRtPredictor(config); + + auto* input = predictor->GetInput(0); + std::vector shape = {3, 3}; + input->Init(shape, infrt::GetDType()); + llvm::outs() << input->shape() << "\n"; + + // init input tensor + auto* input_data = reinterpret_cast(input->buffer()->data()->memory); + for (int i = 0; i < input->shape().GetNumElements(); i++) input_data[i] = 1.0; + + predictor->Run(); + + // get and print output tensor + auto* output = predictor->GetOutput(0); + auto* output_data = + reinterpret_cast(output->buffer()->data()->memory); + + std::vector ans = {0.428458, + 0.244493, + 0.572342, + 0.572008, + 0.509771, + 0.495599, + 0.651287, + 0.326426, + 0.404649}; + + ASSERT_EQ(output->shape().GetNumElements(), ans.size()); + for (int i = 0; i < output->shape().GetNumElements(); ++i) { + ASSERT_NEAR(output_data[i], ans[i], 0.000001); + } +} + +} // namespace infrt diff --git a/paddle/infrt/common/CMakeLists.txt b/paddle/infrt/common/CMakeLists.txt new file mode 100644 index 00000000000000..931e3e42307eb5 --- /dev/null +++ b/paddle/infrt/common/CMakeLists.txt @@ -0,0 +1,14 @@ +core_gather_headers() +set(core_includes "${core_includes};infrt/common/dtype.def" CACHE INTERNAL "") + +gather_srcs(infrt_src SRCS + dtype.cc + global.cc + target.cc + type.cc + shared.cc + object.cc + string.cc + buffer.cc + memory.cc + ) diff --git a/paddle/infrt/common/buffer.cc b/paddle/infrt/common/buffer.cc new file mode 100644 index 00000000000000..bc4ec7feada87b --- /dev/null +++ b/paddle/infrt/common/buffer.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/common/buffer.h" + +#include +#include + +#include + +namespace infrt { +void Buffer::Resize(uint32_t size) { + if (size_ > 0) { + Free(); + size_ = 0; + } + + if (size_ != size) { + data_.memory = reinterpret_cast(Malloc(size)); + size_ = size; + } +} + +void Buffer::Resize(uint32_t alignment, uint32_t size) { + if (size_ > 0) { + Free(); + size_ = 0; + } + + if (size_ != size) { + data_.memory = reinterpret_cast(AlignedAlloc(alignment, size)); + size_ = size; + } +} + +void Buffer::SetTarget(const infrt::common::Target& target) { + target_ = target; + memory_mng_cache_ = MemoryManager::Global().RetrieveSafely(target_.arch); +} + +void Buffer::ResizeLazy(uint32_t size) { + if (size <= size_) return; + Resize(size); +} + +void Buffer::ResizeLazy(uint32_t alignment, uint32_t size) { + if (size <= size_) return; + Resize(alignment, size); +} + +void Buffer::Resize(uint32_t size, const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + Resize(size); +} + +void Buffer::Resize(uint32_t alignment, + uint32_t size, + const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + Resize(alignment, size); +} + +void Buffer::ResizeLazy(uint32_t size, const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + ResizeLazy(size); +} + +void Buffer::ResizeLazy(uint32_t alignment, + uint32_t size, + const infrt::common::Target& target) { + if (target.arch != target_.arch) { + Free(); + SetTarget(target); + } + ResizeLazy(alignment, size); +} + +} // namespace infrt diff --git a/paddle/infrt/common/buffer.h b/paddle/infrt/common/buffer.h new file mode 100644 index 00000000000000..cae2a7ead96abe --- /dev/null +++ b/paddle/infrt/common/buffer.h @@ -0,0 +1,296 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/common/memory.h" +#include "paddle/infrt/common/target.h" + +namespace infrt { + +#ifdef __cplusplus +extern "C" { +#endif + +#define INFRT_ALWAYS_INLINE __attribute__((always_inline)) inline + +//! Code for the primitive types supported in INFRT. +typedef enum infrt_type_code_t { + infrt_type_unk = -1, //! Unknown type + infrt_type_int = 0, //! signed int + infrt_type_uint = 1, //! unsigned int + infrt_type_float = 2, //! floating point + infrt_type_handle = 3 //! void* +} infrt_type_code_t; + +#ifndef INFRT_ATTRIBUTE_ALIGN +#define INFRT_ATTRIBUTE_ALIGN(n) __attribute__((aligned(n))) +#endif + +/** + * A tuntime tag for type in INFRT system. + */ +typedef struct infrt_type_t { +#if __cplusplus >= 201103L + INFRT_ATTRIBUTE_ALIGN(1) infrt_type_code_t code; +#else + uint8_t code; +#endif + + //! Number of bits. + uint8_t bits; + + //! Number of elements in a vector, 1 for scalar. + uint16_t lanes; + + //! Number of '*', e.g. 
for `float*`, the num_asterisks is 1, `float**` it is + //! 2. + uint8_t num_asterisks{0}; + +#ifdef __cplusplus + INFRT_ALWAYS_INLINE infrt_type_t() + : code(infrt_type_int), bits(0), lanes(0) {} + INFRT_ALWAYS_INLINE infrt_type_t(infrt_type_code_t code, + uint8_t bits, + uint16_t lanes = 1, + uint8_t num_asterisks = 0) + : code(code), bits(bits), lanes(lanes), num_asterisks(num_asterisks) {} + INFRT_ALWAYS_INLINE bool operator==(const infrt_type_t& other) const { + return code == other.code && bits == other.bits && lanes == other.lanes; + } + INFRT_ALWAYS_INLINE bool operator!=(const infrt_type_t& other) const { + return !(*this == other); + } + INFRT_ALWAYS_INLINE uint16_t bytes() const { return (bits + 7) / 8; } +#endif // __cplusplus +} infrt_type_t; + +//! Help to define the size of a dimension, due to polyhedral representation, we +//! no need to record the extend or +//! min(default to 0). +typedef int infrt_dimension_t; + +//! Help to tell the kind of the device. +typedef enum infrt_device_kind_t { + infrt_unk_device = -1, // Undefined device. + infrt_x86_device = 0, // X86 device + infrt_opencl_device = 1, // OpenCL device + infrt_arm_device = 2 // ARM device +} infrt_device_kind_t; + +struct infrt_buffer_t; + +/** + * All INFRT backends implementation should provide an interface to be used. + */ +struct infrt_device_interface_impl_t; + +struct infrt_device_interface_t { + int (*malloc)(void* context, struct infrt_buffer_t* buf); + int (*free)(void* context, struct infrt_buffer_t* buf); + int (*sync)(void* context, struct infrt_buffer_t* buf); + int (*release)(void* context, + const struct infrt_device_interface_t* device_interface); + int (*copy_to_host)(void* context, struct infrt_buffer_t* buf); + int (*copy_to_device)(void* context, struct infrt_buffer_t* buf); + int (*buffer_copy)(void* context, + struct infrt_buffer_t* src, + struct infrt_buffer_t* dst); + struct infrt_device_interface_impl_t* impl; +}; + +//! The raw representation of a buffer,used in the generated code/lib. +#define INFRT_BUFFER_MAX_DIMS 8 +typedef struct infrt_buffer_t { + //! Tell which kind of device this buffer locates. + infrt_device_kind_t device; + + //! The interface used to operate on device. + const struct infrt_device_interface_t* device_interface; + + //! A pointer to the memory in host. + uint8_t* memory; + + //! Extra flags. + uint64_t flag; + + //! Data type. + infrt_type_t type; + + //! Number of dimensions. + int32_t dimensions; + infrt_dimension_t dims[INFRT_BUFFER_MAX_DIMS]; + + //! Allocate and deallocate lazily, default true. + char lazy; + + //! The actual memory size(in bytes). + uint64_t memory_size; + + uint16_t align; + +#ifdef __cplusplus + infrt_buffer_t() + : device(infrt_unk_device), + device_interface(NULL), + memory(NULL), + flag(0UL), + type(infrt_type_t()), + dimensions(0), + lazy(true), + memory_size(0), + align(0) {} + + static void delete_(struct infrt_buffer_t* x) { delete x; } + + ~infrt_buffer_t() {} + + // NOTE the buffer should be resized first. + static void alloc(struct infrt_buffer_t*); + + //! Set the shape of the buffer. NOTE this just record the shape, not allocate + //! the memory. 
+ INFRT_ALWAYS_INLINE void resize(const infrt_dimension_t* dims, + int dimensions) { + this->dimensions = dimensions; + memcpy(this->dims, dims, dimensions * sizeof(infrt_dimension_t)); + } + + INFRT_ALWAYS_INLINE uint64_t num_elements() const { + uint64_t res = 1; + for (int i = 0; i < dimensions; i++) { + res *= dims[i]; + } + return res; + } + + INFRT_ALWAYS_INLINE int device_sync(void* ctx = NULL) { + if (device_interface && device_interface->sync) { + return device_interface->sync(ctx, this); + } + return 0; + } + + INFRT_ALWAYS_INLINE uint8_t* begin() const { return 0; } + INFRT_ALWAYS_INLINE uint8_t* end() const { + return memory + num_elements() * type.bytes(); + } + +#endif // __cplusplus +} infrt_buffer_t; + +#ifdef __cplusplus +struct infrt_device_interface_impl_t { + int (*malloc)(void* context, struct infrt_buffer_t* buf); + int (*free)(void* context, struct infrt_buffer_t* buf); + int (*sync)(void* context, struct infrt_buffer_t* buf); + int (*release)(void* context); + int (*copy_to_host)(void* context, struct infrt_buffer_t* buf); + int (*copy_to_device)(void* context, struct infrt_buffer_t* buf); + int (*buffer_copy)(void* context, + struct infrt_buffer_t* src, + struct infrt_buffer_t* dst); +}; + +// The device implementations +extern struct infrt_device_interface_t* infrt_x86_device_interface(); +#endif // __cplusplus + +#ifdef __cplusplus +} // extern "C" +#endif + +#define INFRT_LOG(fmt, ...) \ + do { \ + fprintf(stderr, \ + "%s:%d:%s(): " fmt, \ + __FILE__, \ + __LINE__, \ + __func__, \ + __VA_ARGS__); \ + } while (0) + +#define INFRT_CHECK(cond) \ + if (!(cond)) { \ + INFRT_LOG("check %s failed", #cond); \ + abort(); \ + } +/** + * Buffer helps to hold the memory, and offers a set of methods to help manage + * the memory. + */ +struct Buffer final { + Buffer() = default; + explicit Buffer(const common::Target& target) { SetTarget(target); } + + //! Resize the memory hold by this buffer *exactlly* to \p size. + void Resize(uint32_t size); + void Resize(uint32_t alignment, uint32_t size); + + //! Lazily resize the memory. + void ResizeLazy(uint32_t size); + void ResizeLazy(uint32_t alignment, uint32_t size); + + //! Resize the memory to \p size in target \p target. + void Resize(uint32_t size, const common::Target& target); + void Resize(uint32_t alignment, uint32_t size, const common::Target& target); + + //! Lazily resize the memory to \p size in target \p target. + void ResizeLazy(uint32_t size, const common::Target& target); + void ResizeLazy(uint32_t alignment, + uint32_t size, + const common::Target& target); + + void SetTarget(const common::Target& target); + + const infrt_buffer_t* data() const { return &data_; } + infrt_buffer_t* data() { return &data_; } + + //! Free all the memory owned by this buffer. + void Free() { + if (!data_.memory) return; + memory_mng_cache_->free(data_.memory); + } + + private: + inline void* Malloc(uint32_t size) INFRT_RESULT_SHOULD_USE { + CHECK(memory_mng_cache_) << "Should set target first"; + return memory_mng_cache_->malloc(size); + } + + inline void* AlignedAlloc(uint32_t alignment, + uint32_t size) INFRT_RESULT_SHOULD_USE { + CHECK(memory_mng_cache_) << "Should set target first"; + return memory_mng_cache_->aligned_alloc(alignment, size); + } + + private: + infrt_buffer_t data_; + + //! The place where this buffer locates. + common::Target target_; + + //! Number of bytes of this buffer. + uint32_t size_{}; + + //! Hold the corresponding memory manager for speed. 
+ MemoryInterface* memory_mng_cache_{}; +}; + +} // namespace infrt diff --git a/paddle/infrt/common/common.h b/paddle/infrt/common/common.h new file mode 100644 index 00000000000000..a15bc69b6030ab --- /dev/null +++ b/paddle/infrt/common/common.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/common/shared.h" +#include "paddle/infrt/common/target.h" +#include "paddle/infrt/common/type.h" + +namespace infrt { + +// export some general concepts. +using common::make_shared; +using common::Object; +using common::ref_count; +using common::Shared; + +// Type related. +using common::Bool; +using common::Float; +using common::Int; +using common::UInt; +using common::Void; + +using common::type_of; + +using common::Target; +using common::Type; +using common::UnkTarget; + +template +T& Reference(const T* x) { + return *const_cast(x); +} + +static void CheckVarNameValid(const std::string& name) { + CHECK(!name.empty()); + CHECK(name.find(' ') == std::string::npos && // + name.find('.') == std::string::npos && // + name.find('/') == std::string::npos && // + name.find('\t') == std::string::npos && // + name.find('\n') == std::string::npos && // + name.find('\r') == std::string::npos) + << "Some invalid character found"; +} + +} // namespace infrt diff --git a/paddle/infrt/common/dtype.cc b/paddle/infrt/common/dtype.cc new file mode 100644 index 00000000000000..d5cf67d8a3c402 --- /dev/null +++ b/paddle/infrt/common/dtype.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
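To make the Buffer interface above concrete, a short usage sketch follows. It only uses calls defined in buffer.h and target.h in this patch (DefaultHostTarget() appears in target.h further below); the byte sizes are chosen arbitrarily.

#include <cstdint>

#include "paddle/infrt/common/buffer.h"
#include "paddle/infrt/common/target.h"

// Minimal usage sketch, not part of the patch itself.
void BufferUsageSketch() {
  // The target-taking constructor calls SetTarget(), which caches the
  // MemoryInterface registered for the target's architecture.
  infrt::Buffer buf(infrt::common::DefaultHostTarget());

  buf.Resize(1024);                             // exact resize to 1024 bytes
  buf.ResizeLazy(512);                          // no-op: 512 <= current size
  buf.Resize(/*alignment=*/64, /*size=*/4096);  // frees, then aligned allocation

  uint8_t* raw = buf.data()->memory;            // raw host pointer from infrt_buffer_t
  (void)raw;

  buf.Free();                                   // release the memory owned by the buffer
}
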
+ +#include "paddle/infrt/common/dtype.h" + +namespace infrt { + +const char* DType::name() const { + switch (kind_) { + case Kind::Unk: + return "Unk"; + break; +#define INFRT_DTYPE(enum__, value__) \ + case Kind::enum__: \ + return #enum__; \ + break; +#include "paddle/infrt/common/dtype.def" +#undef INFRT_DTYPE + } + + return ""; +} + +size_t DType::GetHostSize() const { + switch (kind_) { +#define INFRT_DTYPE(enum__, value__) \ + case DType::Kind::enum__: \ + return sizeof(DTypeInternal::type); +#include "paddle/infrt/common/dtype.def" // NOLINT +#undef INFRT_DTYPE + + case Kind::Unk: + return 0; + break; + } + return 0; +} + +} // namespace infrt diff --git a/paddle/infrt/common/dtype.def b/paddle/infrt/common/dtype.def new file mode 100644 index 00000000000000..32df72aa764a38 --- /dev/null +++ b/paddle/infrt/common/dtype.def @@ -0,0 +1,18 @@ +// Define all INFRT dtypes +// DTYPE(ENUM, VALUE) +#ifdef INFRT_DTYPE + +INFRT_DTYPE(UI8, 1) +INFRT_DTYPE(UI16, 2) +INFRT_DTYPE(UI32, 3) +INFRT_DTYPE(UI64, 4) +INFRT_DTYPE(I1, 5) +INFRT_DTYPE(I8, 6) +INFRT_DTYPE(I16, 7) +INFRT_DTYPE(I32, 8) +INFRT_DTYPE(I64, 9) +INFRT_DTYPE(F32, 10) +INFRT_DTYPE(F64, 11) +INFRT_DTYPE(STRING, 12) + +#endif \ No newline at end of file diff --git a/paddle/infrt/common/dtype.h b/paddle/infrt/common/dtype.h new file mode 100644 index 00000000000000..8b57299fa94fd5 --- /dev/null +++ b/paddle/infrt/common/dtype.h @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include + +namespace infrt { +class DType { + public: + enum class Kind : uint8_t { + Unk = 0, + +// Automatically generate the enum definition +#define INFRT_DTYPE(enum__, value__) enum__ = value__, +#include "paddle/infrt/common/dtype.def" +#undef INFRT_DTYPE + + BOOL = I1, + }; + + DType() = default; + explicit constexpr DType(Kind kind) : kind_(kind) { assert(IsValid()); } + + DType(const DType&) = default; + DType& operator=(const DType&) = default; + bool operator==(DType other) const { return kind_ == other.kind_; } + bool operator!=(DType other) const { return !(*this == other); } + + constexpr Kind kind() const { return kind_; } + + bool IsValid() const { return kind_ != Kind::Unk; } + bool IsInvalid() const { return !IsValid(); } + + const char* name() const; + + size_t GetHostSize() const; + + private: + Kind kind_{Kind::Unk}; +}; + +template +constexpr DType GetDType(); + +template +struct DTypeInternal; + +#define INFRT_IMPL_GET_DTYPE(cpp_type__, enum__) \ + template <> \ + inline constexpr DType GetDType() { \ + return DType{DType::Kind::enum__}; \ + } \ + template <> \ + struct DTypeInternal { \ + using type = cpp_type__; \ + }; + +INFRT_IMPL_GET_DTYPE(bool, I1); +INFRT_IMPL_GET_DTYPE(int8_t, I8); +INFRT_IMPL_GET_DTYPE(int16_t, I16); +INFRT_IMPL_GET_DTYPE(int32_t, I32); +INFRT_IMPL_GET_DTYPE(int64_t, I64); +INFRT_IMPL_GET_DTYPE(uint8_t, UI8); +INFRT_IMPL_GET_DTYPE(uint16_t, UI16); +INFRT_IMPL_GET_DTYPE(uint32_t, UI32); +INFRT_IMPL_GET_DTYPE(uint64_t, UI64); +INFRT_IMPL_GET_DTYPE(float, F32); +INFRT_IMPL_GET_DTYPE(double, F64); +INFRT_IMPL_GET_DTYPE(std::string, STRING); + +} // namespace infrt diff --git a/paddle/infrt/common/global.cc b/paddle/infrt/common/global.cc new file mode 100644 index 00000000000000..54ecf1589aa14c --- /dev/null +++ b/paddle/infrt/common/global.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/global.h" + +namespace infrt { + +Global::Global() {} + +mlir::MLIRContext* Global::context = nullptr; + +mlir::MLIRContext* Global::getMLIRContext() { + if (nullptr == context) { + context = new mlir::MLIRContext(); + } + return context; +} + +} // namespace infrt diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h new file mode 100644 index 00000000000000..f89164d03f31de --- /dev/null +++ b/paddle/infrt/common/global.h @@ -0,0 +1,32 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/IR/MLIRContext.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" + +namespace infrt { + +// global variables +class Global { + private: + static mlir::MLIRContext *context; + Global(); + + public: + static mlir::MLIRContext *getMLIRContext(); +}; // class Global + +} // namespace infrt diff --git a/paddle/infrt/common/macros.h b/paddle/infrt/common/macros.h new file mode 100644 index 00000000000000..4481f6b38aed37 --- /dev/null +++ b/paddle/infrt/common/macros.h @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(NDEBUG) +#define INFRT_DEBUG +#endif + +#define INFRT_DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +#ifndef INFRT_NOT_IMPLEMENTED +#define INFRT_NOT_IMPLEMENTED LOG(FATAL) << "Not Implemented"; +#endif + +#define INFRT_RESULT_SHOULD_USE __attribute__((warn_unused_result)) + +/** + * A trick to enforce the registry. + * + * usage: + * + * INFRT_REGISTER_HELPER(some_key) { + * // register methods + * } + * + * INFRT_USE_REGISTER(some_key); + */ +#define INFRT_REGISTER_HELPER(symbol__) bool __infrt__##symbol__##__registrar() +#define INFRT_USE_REGISTER(symbol__) \ + extern bool __infrt__##symbol__##__registrar(); \ + [[maybe_unused]] static bool __infrt_extern_registrar_##symbol__ = \ + __infrt__##symbol__##__registrar(); + +#if __cplusplus >= 201703L +#define INFRT_NODISCARD [[nodiscard]] +#else +#define INFRT_NODISCARD +#endif diff --git a/paddle/infrt/common/memory.cc b/paddle/infrt/common/memory.cc new file mode 100644 index 00000000000000..aa5983a56c4344 --- /dev/null +++ b/paddle/infrt/common/memory.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
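The INFRT_REGISTER_HELPER / INFRT_USE_REGISTER pair in macros.h above implements the static-registrar trick its own comment describes. A self-contained sketch of how the two macros are meant to be paired is shown below; the key name example_kernels is a placeholder, and the registration calls themselves belong to an API defined outside macros.h.

#include "paddle/infrt/common/macros.h"

// In a translation unit that defines kernels:
INFRT_REGISTER_HELPER(example_kernels) {
  // Actual kernel registration calls would go here.
  return true;  // the macro expands to `bool __infrt__example_kernels__registrar()`
}

// In a translation unit that needs the kernels registered before use:
INFRT_USE_REGISTER(example_kernels);
// Expands to an extern declaration of the registrar plus a
// [[maybe_unused]] static bool whose initializer calls it, so merely linking
// this translation unit runs the registration at static-initialization time.
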
+ +#include "paddle/infrt/common/memory.h" + +namespace infrt { + +using infrt::common::Target; + +namespace { + +class X86MemoryMng : public MemoryInterface { + public: + void* malloc(size_t nbytes) override { return ::malloc(nbytes); } + void free(void* data) override { + if (!data) return; + ::free(data); + } + void* aligned_alloc(size_t alignment, size_t nbytes) override { + return ::aligned_alloc(alignment, nbytes); + } +}; + +} // namespace + +MemoryManager::MemoryManager() { + Register(Target::Arch::Unk, new X86MemoryMng); + Register(Target::Arch::X86, new X86MemoryMng); +} + +} // namespace infrt diff --git a/paddle/infrt/common/memory.h b/paddle/infrt/common/memory.h new file mode 100644 index 00000000000000..678529b8b785cd --- /dev/null +++ b/paddle/infrt/common/memory.h @@ -0,0 +1,76 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/common/macros.h" +#include "paddle/infrt/common/target.h" + +namespace infrt { + +class MemoryInterface { + public: + virtual void* malloc(size_t nbytes) = 0; + virtual void free(void* data) = 0; + virtual void* aligned_alloc(size_t alignment, size_t nbytes) { + return nullptr; + } + virtual ~MemoryInterface() {} +}; + +/** + * MemoryManager holds a map of MemoryInterface for each articture. + */ +class MemoryManager final { + public: + using key_t = common::Target::Arch; + + static MemoryManager& Global() { + static auto* x = new MemoryManager; + return *x; + } + + MemoryInterface* Retrieve(key_t key) INFRT_RESULT_SHOULD_USE { + auto it = memory_mngs_.find(key); + if (it != memory_mngs_.end()) return it->second.get(); + return nullptr; + } + + MemoryInterface* RetrieveSafely(key_t key) { + auto* res = Retrieve(key); + CHECK(res) << "no MemoryInterface for architecture [" << key << "]"; + return res; + } + + MemoryInterface* Register(key_t key, MemoryInterface* item) { + CHECK(!memory_mngs_.count(key)) << "Duplicate register [" << key << "]"; + memory_mngs_[key].reset(item); + return item; + } + + private: + MemoryManager(); + + std::unordered_map> + memory_mngs_; + + INFRT_DISALLOW_COPY_AND_ASSIGN(MemoryManager); +}; + +} // namespace infrt diff --git a/paddle/infrt/common/object.cc b/paddle/infrt/common/object.cc new file mode 100644 index 00000000000000..6842ff7ba007d0 --- /dev/null +++ b/paddle/infrt/common/object.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/object.h" + +namespace infrt { +namespace common {} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/object.h b/paddle/infrt/common/object.h new file mode 100644 index 00000000000000..ab2d00cce985c1 --- /dev/null +++ b/paddle/infrt/common/object.h @@ -0,0 +1,81 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/infrt/common/shared.h" + +namespace infrt { +namespace common { + +template +class Shared; +/** + * Object is the basic element in the INFRT, with `Shared` wrapper, the object + * can be shared accross the system. + */ +struct Object { + //! Get the type representation of this object. + virtual const char* type_info() const = 0; + virtual ~Object() {} + + //! Cast to a derived type. + template + T* as() { + return static_cast(this); + } + + //! Cast to a derived type. + template + const T* as() const { + return static_cast(this); + } + + //! Type safe cast. + template + T* safe_as() { + if (std::strcmp(type_info(), T::__type_info__) == 0) { + return static_cast(this); + } + return nullptr; + } + //! Type safe cast. + template + const T* safe_as() const { + if (std::strcmp(type_info(), T::__type_info__) == 0) { + return static_cast(this); + } + return nullptr; + } + + //! Check if the type is right. + template + bool is_type() const { + if (std::strcmp(type_info(), T::__type_info__) == 0) { + return true; + } + return false; + } + + //! The reference count, which make all the derived type able to share. + mutable RefCount __ref_count__; +}; + +using object_ptr = Object*; +using shared_object = Shared; + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/shared.cc b/paddle/infrt/common/shared.cc new file mode 100644 index 00000000000000..78457b7ed352b2 --- /dev/null +++ b/paddle/infrt/common/shared.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/shared.h" diff --git a/paddle/infrt/common/shared.h b/paddle/infrt/common/shared.h new file mode 100644 index 00000000000000..dbcf2b0597888c --- /dev/null +++ b/paddle/infrt/common/shared.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace infrt { +namespace common { + +class RefCount { + public: + using value_type = int32_t; + RefCount() = default; + + value_type Inc() { return ++count_; } + value_type Dec() { return --count_; } + bool is_zero() const { return 0 == count_; } + std::string to_string() { return std::to_string(count_.load()); } + int32_t val() const { return count_; } + + private: + std::atomic count_{0}; +}; + +class Object; +/** + * The templated methods are used to unify the way to get the RefCount instance + * in client classes. + */ +template +RefCount& ref_count(const T* t) { + static_assert(std::is_base_of::value, "T is not a Object"); + return t->__ref_count__; +} +template +void Destroy(const T* t) { + delete t; +} + +template +struct Shared { + using object_ptr = T*; + + Shared() = default; + explicit Shared(T* p) : p_(p) { + if (p) IncRef(p); + } + Shared(const Shared& other) : p_(other.p_) { IncRef(p_); } + Shared(Shared&& other) : p_(other.p_) { other.p_ = nullptr; } + Shared& operator=(const Shared& other); + + //! Reset to another pointer \p x. + void Reset(T* x = nullptr); + + //! Access the pointer in various ways. + // @{ + inline T* get() const { return p_; } + inline T& operator*() const { return *p_; } + inline T* operator->() const { return p_; } + inline T* self() { return p_; } + inline const T* self() const { return p_; } + // @} + + inline bool same_as(const Shared& other) { return p_ == other.p_; } + inline bool defined() const { return p_; } + inline bool operator<(const Shared& other) const { return p_ < other.p_; } + inline Shared& operator=(T* x); + inline bool operator==(const Shared& other) const { return p_ == other.p_; } + + ~Shared(); + + private: + //! Increase the share count. + void IncRef(T* p); + + //! Decrease the share count. + void DecRef(T* p); + + protected: + T* p_{}; +}; + +template +void Shared::IncRef(T* p) { + if (p) { + ref_count(p).Inc(); + } +} +template +void Shared::DecRef(T* p) { + if (p) { + if (ref_count(p).Dec() == 0) { + Destroy(p); + } + } +} +template +Shared& Shared::operator=(const Shared& other) { + if (other.p_ == p_) return *this; + // Other can be inside of something owned by this, so we should be careful to + // incref other before we decref + // ourselves. + T* tmp = other.p_; + IncRef(tmp); + DecRef(p_); + p_ = tmp; + return *this; +} + +template +T* make_shared(Args&&... 
args) { + return new T(args...); +} + +template +Shared& Shared::operator=(T* x) { + if (p_ == x) return *this; + + T* tmp = x; + IncRef(tmp); + DecRef(p_); + p_ = tmp; + return *this; +} + +template +Shared::~Shared() { + DecRef(p_); + p_ = nullptr; +} + +template +void Shared::Reset(T* x) { + if (x) IncRef(x); + DecRef(p_); + p_ = x; +} + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/string.cc b/paddle/infrt/common/string.cc new file mode 100644 index 00000000000000..d02643825a7c82 --- /dev/null +++ b/paddle/infrt/common/string.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/string.h" + +#include + +#include + +namespace infrt { +namespace infrt { + +std::string StringFormat(const std::string &fmt_str, ...) { + /* Reserve two times as much as the length of the fmt_str */ + int final_n, n = (static_cast(fmt_str.size())) * 2; + std::unique_ptr formatted; + va_list ap; + while (1) { + formatted.reset( + new char[n]); /* Wrap the plain char array into the unique_ptr */ + std::strcpy(&formatted[0], fmt_str.c_str()); // NOLINT + va_start(ap, fmt_str); + final_n = vsnprintf(&formatted[0], n, fmt_str.c_str(), ap); + va_end(ap); + if (final_n < 0 || final_n >= n) + n += abs(final_n - n + 1); + else + break; + } + return std::string(formatted.get()); +} + +std::string Trim(const std::string &s, const char *empty) { + if (s.empty()) return s; + auto start = s.find_first_not_of(empty); + if (start == std::string::npos) return ""; + auto end = s.find_last_not_of(empty); + return s.substr(start, end - start + 1); +} + +std::string Uppercase(const std::string &x) { + auto res = x; + for (auto &c : res) { + c = toupper(c); + } + return res; +} + +bool Startswith(const std::string &x, const std::string &str) { + return x.find(str) == 0; +} +bool Endswith(const std::string &x, const std::string &str) { + if (x.length() >= str.length()) { + return std::equal(str.rbegin(), str.rend(), x.rbegin()); + } + return false; +} + +std::vector Split(const std::string &str, + const std::string &splitter) { + std::vector results; + std::string::size_type pos1, pos2; + pos2 = str.find(splitter); + pos1 = 0; + while (std::string::npos != pos2) { + results.push_back(str.substr(pos1, pos2 - pos1)); + pos1 = pos2 + splitter.size(); + pos2 = str.find(splitter, pos1); + } + if (pos1 != str.length()) { + results.push_back(str.substr(pos1)); + } + return results; +} + +void Replace(std::string *s, const std::string &from, const std::string &to) { + size_t pos = 0; + while ((pos = s->find(from, pos)) != std::string::npos) { + s->replace(pos, from.size(), to); + pos += to.length(); + } +} + +size_t Count(std::string *s, const std::string &sub) { + size_t pos = 0; + size_t times = 0; + while ((pos = s->find(sub, pos)) != std::string::npos) { + if ((pos == 0 || !IsPrefix(s->at(pos - 1))) && + (pos + sub.length() == s->size() || + !IsSuffix(s->at(pos + sub.length())))) { + 
pos += sub.length(); + times++; + } else { + pos++; + } + } + return times; +} + +bool IsPrefix(const char &c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_'); +} + +bool IsSuffix(const char &c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c == '_') || + (c >= '0' && c <= '9') || (c == '\''); +} + +std::string TransValidVarName(std::string name) { + Replace(&name, ".", "__"); + Replace(&name, "/", "___"); + name.erase(0, name.find_first_not_of("_")); + return name; +} + +} // namespace infrt +} // namespace infrt diff --git a/paddle/infrt/common/string.h b/paddle/infrt/common/string.h new file mode 100644 index 00000000000000..f744470603f804 --- /dev/null +++ b/paddle/infrt/common/string.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +namespace infrt { +namespace infrt { + +//! Get the content of a stream. +template +std::string GetStreamCnt(const T& x); + +/** + * Construct a formatted string with arguments. + * @param fmt_str The format. + * @param ... The parameters of the format. + * @return The formated string. + */ +std::string StringFormat(const std::string& fmt_str, ...); + +/** + * Join multiple fields to a single string. Similar to Python's str.join method. + */ +template +std::string Join(const std::vector& fields, const std::string& splitter) { + if (fields.empty()) return ""; + std::stringstream ss; + for (int i = 0; i < fields.size() - 1; i++) ss << fields[i] << splitter; + ss << fields.back(); + return ss.str(); +} + +std::vector Split(const std::string& str, + const std::string& splitter); + +std::string Trim(const std::string& s, const char* empty = " \n\r\t"); + +//! Convert a string to its uppercase. +std::string Uppercase(const std::string& x); + +//! Replace a substr 'from' to 'to' in string s. +void Replace(std::string* s, const std::string& from, const std::string& to); + +//! Count how many times substr 'sub' appears in string s. +size_t Count(std::string* s, const std::string& sub); + +//! Tell if a char is prefix of a tensor's name. +bool IsPrefix(const char& c); + +//! Tell if a char is suffix of a tensor's name. +bool IsSuffix(const char& c); + +//! Tell if a string \p x start with \p str. +bool Startswith(const std::string& x, const std::string& str); + +//! Tell if a string \p x ends with \p str. +bool Endswith(const std::string& x, const std::string& str); + +template +std::string GetStreamCnt(const T& x) { + std::stringstream os; + os << x; + return os.str(); +} + +std::string TransValidVarName(std::string name); + +} // namespace infrt +} // namespace infrt diff --git a/paddle/infrt/common/target.cc b/paddle/infrt/common/target.cc new file mode 100644 index 00000000000000..d376ad7db0241e --- /dev/null +++ b/paddle/infrt/common/target.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/target.h" + +#include + +namespace infrt { +namespace common { + +bool Target::operator==(const Target &other) const { + return os == other.os && // + arch == other.arch && // + bits == other.bits && // + features == other.features; +} + +int Target::max_num_threads() const { + CHECK(arch == Arch::NVGPU) + << "The target is not NVGPU! Cannot get max number of threads."; + return 1024; +} + +std::vector Target::get_target_libs() const { return libs; } + +int Target::get_target_bits() const { + switch (bits) { + case Bit::k32: + return 32; + case Bit::k64: + return 64; + case Bit::Unk: + return 0; + default: + LOG(FATAL) << "Not supported Bit"; + } + return -1; +} + +std::ostream &operator<<(std::ostream &os, const Target &target) { + os << "Target<"; + switch (target.os) { + case Target::OS::Linux: + os << "linux"; + break; + case Target::OS::Windows: + os << "windows"; + break; + case Target::OS::Unk: + os << "unk"; + break; + } + + os << ","; + + switch (target.arch) { + case Target::Arch::X86: + os << "x86"; + break; + case Target::Arch::ARM: + os << "arm"; + break; + case Target::Arch::NVGPU: + os << "nvgpu"; + break; + case Target::Arch::Unk: + os << "unk"; + break; + } + os << ","; + + switch (target.bits) { + case Target::Bit::k32: + os << "32"; + break; + case Target::Bit::k64: + os << "64"; + break; + case Target::Bit::Unk: + os << "unk"; + break; + } + os << ">"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, Target::Arch arch) { + switch (arch) { + case Target::Arch::Unk: + os << "Unk"; + break; + case Target::Arch::X86: + os << "X86"; + break; + case Target::Arch::ARM: + os << "ARM"; + break; + case Target::Arch::NVGPU: + os << "NVGPU"; + break; + } + return os; +} + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/target.h b/paddle/infrt/common/target.h new file mode 100644 index 00000000000000..eaf19efbfe7a8f --- /dev/null +++ b/paddle/infrt/common/target.h @@ -0,0 +1,112 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace infrt { +namespace common { + +struct Target { + /** + * The operating system used by the target. Determines which system calls to + * generate. + */ + enum class OS : int { + Unk = -1, + Linux, + Windows, + }; + + /** + * The architecture used by the target. Determines the instruction set to use. 
+ */ + enum class Arch : int { + Unk = -1, + X86, + ARM, + NVGPU, + }; + + enum class Bit : int { + Unk = -1, + k32, + k64, + }; + + OS os{OS::Unk}; + Arch arch{Arch::Unk}; + Bit bits{Bit::Unk}; + + enum class Feature : int { + JIT = 0, + Debug, + }; + + /** + * The library used by the target. + */ + enum class Lib : int { + Unk = -1, + MKL, + }; + std::vector features; + std::vector libs; + + explicit Target(OS o = OS::Linux, + Arch a = Arch::Unk, + Bit b = Bit::Unk, + const std::vector& features = {}, + const std::vector& libs = {}) + : os(o), arch(a), bits(b), features(features), libs(libs) {} + + bool defined() const { + return os != OS::Unk && arch != Arch::Unk && bits != Bit::Unk; + } + + int max_num_threads() const; + + int get_target_bits() const; + + std::vector get_target_libs() const; + + bool operator==(const Target& other) const; + bool operator!=(const Target& other) const { return !(*this == other); } + friend std::ostream& operator<<(std::ostream& os, const Target& target); +}; + +static const Target& UnkTarget() { + static Target target( + Target::OS::Unk, Target::Arch::Unk, Target::Bit::Unk, {}, {}); + return target; +} + +static const Target& DefaultHostTarget() { + static Target target( + Target::OS::Linux, Target::Arch::X86, Target::Bit::k64, {}, {}); + return target; +} + +static const Target& DefaultNVGPUTarget() { + static Target target( + Target::OS::Linux, Target::Arch::NVGPU, Target::Bit::k64, {}, {}); + return target; +} + +std::ostream& operator<<(std::ostream& os, Target::Arch arch); + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/type.cc b/paddle/infrt/common/type.cc new file mode 100644 index 00000000000000..f262bd4697b36d --- /dev/null +++ b/paddle/infrt/common/type.cc @@ -0,0 +1,358 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/common/type.h" + +#include + +namespace infrt { +namespace common { + +struct Type::Storage { + Storage() = default; + Storage(type_t t, int b, int w) : type_(t), bits_(b), lanes_(w) {} + + type_t type_{type_t::Unk}; + cpp_type_t cpp_type_{cpp_type_t::None}; + + //! How many bits per element. + int bits_{}; + + //! How many elements(if a vector type), for scalar types, it should be 1. + int lanes_{1}; + + //! Name of the customized type. 
+ std::string customized_type_; +}; + +Type::~Type() {} + +std::ostream &operator<<(std::ostream &os, const Type &t) { + if (t.is_cpp_const()) os << "const "; + switch (t.type()) { + case Type::type_t::Int: + if (t.bits() == 1) { + os << "bool"; + } else { + os << "int" << t.bits(); + } + + break; + case Type::type_t::UInt: + os << "uint" << t.bits(); + break; + + case Type::type_t::Float: + os << "float" << t.bits(); + break; + case Type::type_t::Void: + os << "void"; + break; + case Type::type_t::Customized: + os << t.customized_type(); + break; + case Type::type_t::String: + os << "string"; + break; + case Type::type_t::Unk: + os << "unk"; + break; + } + + if (t.lanes() > 1) os << "<" << t.lanes() << ">"; + if (t.is_cpp_handle()) os << "*"; + if (t.is_cpp_handle2()) os << "**"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, Type::type_t t) { + switch (t) { + case Type::type_t::String: + os << "String"; + break; + case Type::type_t::Void: + os << "Void"; + break; + case Type::type_t::UInt: + os << "UInt"; + break; + case Type::type_t::Int: + os << "Int"; + break; + case Type::type_t::Float: + os << "Float"; + break; + case Type::type_t::Unk: + os << "Unk"; + break; + case Type::type_t::Customized: + os << "Customized"; + } + return os; +} + +Type &Type::set_cpp_handle(bool x) { + // unset the other handle-related bits. + set_cpp_handle2(false); + + auto &v = (*reinterpret_cast(&GetStorage().cpp_type_)); + // unset the other handle-related bits. + v &= ~static_cast(cpp_type_t::Handle); + v &= ~static_cast(cpp_type_t::HandleHandle); + + if (x) + v |= static_cast(cpp_type_t::Handle); + else + v &= ~static_cast(cpp_type_t::Handle); + + return *this; +} + +Type &Type::set_cpp_handle2(bool x) { + auto &v = (*reinterpret_cast(&GetStorage().cpp_type_)); + + // unset the other handle-related bits. 
+ v &= ~static_cast(cpp_type_t::Handle); + v &= ~static_cast(cpp_type_t::HandleHandle); + + if (x) + v |= static_cast(cpp_type_t::HandleHandle); + else + v &= ~static_cast(cpp_type_t::HandleHandle); + + return *this; +} + +Type Type::VectorOf(int w) const { + CheckTypeValid(); + return Type(type(), w, bits()); +} + +Type::Type(const Type &other) { + if (other.storage_) storage_.reset(new Storage(*other.storage_)); +} + +Type Type::ElementOf() const { + CheckTypeValid(); + auto type = *this; + type.storage_->lanes_ = 1; + return type; +} + +void Type::CheckTypeValid() const { CHECK_NE(GetStorage().type_, type_t::Unk); } + +Type Type::PointerOf() const { + CheckTypeValid(); + auto x = *this; + CHECK(!x.is_cpp_handle2()) << "Not support three level of PointerOf"; + if (x.is_cpp_handle()) + x.set_cpp_handle2(); + else + x.set_cpp_handle(); + return x; +} + +Type Type::ConstOf() const { + CheckTypeValid(); + auto x = *this; + x.set_cpp_const(); + return x; +} + +Type Type::IgnoreConst() const { + CheckTypeValid(); + auto x = *this; + x.set_cpp_const(false); + return x; +} + +Type Type::with_bits(int x) const { + CHECK(is_primitive()); + Type type = *this; + type.GetStorage().bits_ = x; + return type; +} + +Type Type::with_type(Type::type_t x) const { + Type type = *this; + type.GetStorage().type_ = x; + return type; +} + +Type Type::with_lanes(int x) const { + CHECK(valid()); + Type type = *this; + type.GetStorage().lanes_ = x; + return type; +} + +Type Type::with_cpp_const(bool x) const { + Type type = *this; + type.set_cpp_const(x); + return type; +} + +Type &Type::set_cpp_const(bool is_const) { + uint8_t &data = *reinterpret_cast(&GetStorage().cpp_type_); + if (is_const) { + data |= static_cast(cpp_type_t::Const); + } else { + data &= ~(static_cast(cpp_type_t::Const)); + } + + return *this; +} +Type &Type::set_customized_type(const std::string &t) { + GetStorage().type_ = type_t::Customized; + GetStorage().customized_type_ = t; + + return *this; +} + +bool Type::valid() const { + if (is_unk()) return false; + if (is_customized()) { + return !GetStorage().customized_type_.empty(); + } + if (is_primitive()) { + return bits() != 0; + } + return true; +} + +Type::Type(Type::type_t t, int b, int w) : storage_(new Storage(t, b, w)) {} +bool Type::is_primitive() const { + return !is_unk() && type() != type_t::Customized; +} +bool Type::is_customized() const { + return !is_unk() && type() == type_t::Customized; +} +bool Type::is_unk() const { return type() == type_t::Unk; } +bool Type::is_bool() const { return type() == type_t::UInt && bits() == 1; } +bool Type::is_void() const { return type() == type_t::Void; } +bool Type::is_vector() const { return lanes() > 1; } +bool Type::is_scalar() const { return lanes() == 1; } +bool Type::is_float(int bits) const { + return type() == type_t::Float && (bits < 0 || bits == this->bits()); +} +bool Type::is_uint(int bits) const { + return type() == type_t::UInt && (bits < 0 || bits == this->bits()); +} +bool Type::is_int(int bits) const { + return type() == type_t::Int && (bits < 0 || bits == this->bits()); +} +bool Type::is_integer(int bits) const { + return (type() == type_t::Int || type() == type_t::UInt) && + (bits < 0 || bits == this->bits()); +} +bool Type::is_index_type() { + return is_int() && lanes() == 1 && (bits() == 32 || bits() == 64); +} +bool Type::is_cpp_handle() const { + return static_cast(GetStorage().cpp_type_) & + static_cast(cpp_type_t::Handle); +} +bool Type::is_cpp_handle2() const { + return static_cast(GetStorage().cpp_type_) & + 
static_cast(cpp_type_t::HandleHandle); +} +bool Type::is_cpp_const() const { + return static_cast(cpp_type_t::Const) & + static_cast(GetStorage().cpp_type_); +} +const std::string &Type::customized_type() const { + return GetStorage().customized_type_; +} +bool Type::is_customized_type() const { + return !GetStorage().customized_type_.empty(); +} +Type::type_t Type::type() const { return GetStorage().type_; } +int Type::bits() const { return GetStorage().bits_; } +int Type::lanes() const { return GetStorage().lanes_; } +Type::cpp_type_t Type::cpp_type() const { return GetStorage().cpp_type_; } +bool Type::operator==(const Type &other) const { + return type() == other.type() && bits() == other.bits() && + lanes() == other.lanes() && + GetStorage().cpp_type_ == other.GetStorage().cpp_type_ && + customized_type() == other.customized_type(); +} +bool Type::is_string() const { return type() == type_t::String; } + +Type &Type::operator=(const Type &other) { + if (other.storage_) storage_.reset(new Storage(*other.storage_)); + return *this; +} + +Type::Storage &Type::GetStorage() { return *storage_; } +const Type::Storage &Type::GetStorage() const { return *storage_; } + +Type::Type() : storage_(new Storage) {} +Type::Type(Type &&other) : storage_(std::move(other.storage_)) {} + +const Type &F16() { + static auto t = Float(16); + return t; +} +const Type &F32() { + static auto t = Float(32); + return t; +} +const Type &F64() { + static auto t = Float(64); + return t; +} +const Type &I8() { + static auto t = Int(8); + return t; +} +const Type &I16() { + static auto t = Int(16); + return t; +} +const Type &I32() { + static auto t = Int(32); + return t; +} +const Type &I64() { + static auto t = Int(64); + return t; +} +const Type &UI8() { + static auto t = UInt(8); + return t; +} +const Type &UI16() { + static auto t = UInt(16); + return t; +} +const Type &UI32() { + static auto t = UInt(32); + return t; +} +const Type &UI64() { + static auto t = UInt(64); + return t; +} +const Type &I1() { + static auto t = Int(1); + return t; +} +const Type &UI1() { + static auto t = UInt(1); + return t; +} + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/common/type.h b/paddle/infrt/common/type.h new file mode 100644 index 00000000000000..b532fc154ff02e --- /dev/null +++ b/paddle/infrt/common/type.h @@ -0,0 +1,223 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/common/macros.h" + +//! Much of the concepts are borrowed from Halide project. + +namespace infrt { +namespace common { + +/** + * Types in the INFRT type system. They can be ints, unsigned ints, or floats of + * various bit-widths. + * They can also be vectors of the same (by setting the `lanes` field to + * something larger than one). + * NOTE: Front-end code other than vectorize shouldn't use vector types. 
+ */ +struct Type { + enum class type_t { + Unk = -1, + Int, + UInt, + Float, + String, + Void, + // stupid idea to mix the Customized with other primitive types, large + // refactor needs here. + Customized, // Customized type + }; + + //! type decorators in C++, the different code can used together. + enum class cpp_type_t : uint8_t { + None = 0, // None information. + Const = 1, // const. + Handle = 1 << 1, // pointer type, such as `infrt_buffer_t*`. + HandleHandle = 1 << 2, // pointer of pointer, such as `infrt_buffer_t**`. + }; + + Type(); + Type(type_t t, int b, int w); + Type(const Type& other); + explicit Type(Type&& other); + Type& operator=(const Type& other); + + INFRT_NODISCARD bool is_primitive() const; + INFRT_NODISCARD bool is_customized() const; + INFRT_NODISCARD bool valid() const; + + //! Some helper functions to check a type. + // @{ + INFRT_NODISCARD bool is_unk() const; + INFRT_NODISCARD bool is_void() const; + INFRT_NODISCARD bool is_bool() const; + INFRT_NODISCARD bool is_vector() const; + INFRT_NODISCARD bool is_scalar() const; + INFRT_NODISCARD bool is_float(int bits = -1) const; + INFRT_NODISCARD bool is_int(int bits = -1) const; + INFRT_NODISCARD bool is_integer(int bits = -1) const; + INFRT_NODISCARD bool is_uint(int bits = -1) const; + INFRT_NODISCARD bool is_string() const; + INFRT_NODISCARD bool is_index_type(); + // @} + + Type& set_cpp_handle(bool x = true); + INFRT_NODISCARD bool is_cpp_handle() const; + + Type& set_cpp_handle2(bool x = true); + INFRT_NODISCARD bool is_cpp_handle2() const; + + Type& set_cpp_const(bool is_const = true); + INFRT_NODISCARD bool is_cpp_const() const; + + Type& set_customized_type(const std::string& t); + const std::string& customized_type() const; + INFRT_NODISCARD bool is_customized_type() const; + + // Get a new type with bits set to \p x. + Type with_bits(int x) const; + // Get a new type with type set to \p x. + Type with_type(type_t x) const; + // Get a new type with lanes set to \p x. + Type with_lanes(int x) const; + // Get a new type with cpp_const set to \p x. + Type with_cpp_const(bool x = true) const; + + //! Getters + // @{ + type_t type() const; + int bits() const; + int lanes() const; + cpp_type_t cpp_type() const; + // @} + + //! Compare two types for equality. + bool operator==(const Type& other) const; + + //! Compare two types for inequality. + bool operator!=(const Type& other) const { return !(*this == other); } + + //! Generate a vector of this type, with `w` elements. + Type VectorOf(int w) const; + //! Generate a element type of this type. + Type ElementOf() const; + //! Generate the address type. + Type PointerOf() const; + //! Ignore const. + Type IgnoreConst() const; + //! Add const. + Type ConstOf() const; + + friend std::ostream& operator<<(std::ostream& os, const Type& t); + + ~Type(); + + private: + void CheckTypeValid() const; + + struct Storage; + Storage& GetStorage(); + const Storage& GetStorage() const; + + std::unique_ptr storage_; +}; // namespace common + +inline Type Void() { return Type(Type::type_t::Void, 1, 0); } +inline Type Int(int bits, int lanes = 1) { + return Type(Type::type_t::Int, bits, lanes); +} +inline Type UInt(int bits, int lanes = 1) { + return Type(Type::type_t::UInt, bits, lanes); +} +inline Type Float(int bits, int lanes = 1) { + return Type(Type::type_t::Float, bits, lanes); +} +inline Type Bool(int lanes = 1) { return Type(Type::type_t::UInt, 1, lanes); } +inline Type String() { return Type(Type::type_t::String, 1, 1); } + +//! 
Builtin native types as global singletons. +// @{ +const Type& F16(); +const Type& F32(); +const Type& F64(); +const Type& I8(); +const Type& I16(); +const Type& I32(); +const Type& I64(); +const Type& UI8(); +const Type& UI16(); +const Type& UI32(); +const Type& UI64(); +const Type& I1(); +const Type& UI1(); +// @} + +template +Type type_of(); + +// clang-format off +template <> inline Type type_of() { return F32(); } +template <> inline Type type_of() { return F64(); } +template <> inline Type type_of() { return UI8(); } +template <> inline Type type_of() { return UI16(); } +template <> inline Type type_of() { return I32(); } +template <> inline Type type_of() { return UI32(); } +template <> inline Type type_of() { return UI1(); } +template <> inline Type type_of() { return I8(); } +template <> inline Type type_of() { return I64(); } +template <> inline Type type_of() { return UI64(); } +template <> inline Type type_of() { return I8(); } +template <> inline Type type_of() { return Void(); } +// clang-format on +template <> +inline Type type_of() { + Type x = Int(8); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle2(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} +template <> +inline Type type_of() { + Type x = type_of(); + x.set_cpp_handle(); + return x; +} + +std::ostream& operator<<(std::ostream& os, Type::type_t t); + +} // namespace common +} // namespace infrt diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt new file mode 100644 index 00000000000000..c1517beab0662b --- /dev/null +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -0,0 +1,61 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + dialect.cc + types.cc + basic_kernels.cc + test_kernels.cc + infrt_base.cc + init_infrt_dialects.cc + tensor_shape.cc + dense_tensor.cc + mlir_loader.cc + diagnostic_utils.cc + pd_types.cc + pd_ops.cc + ) + +mlir_tablegen_on(ops) +mlir_tablegen_on(basic_kernels) +mlir_tablegen_on(test_kernels) +mlir_tablegen_on(infrt_base DIALECT infrt) +mlir_tablegen_on(tensor_shape DIALECT ts) +mlir_tablegen_on(dense_tensor DIALECT dt) +mlir_tablegen_on(pd_op_base DIALECT pd) +mlir_tablegen_on(pd_ops) +mlir_add_rewriter(rewrite) + +# TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code +add_executable(infrtopt opt.cc) +target_link_libraries(infrtopt infrt ${mlir_libs}) +add_dependencies(infrtopt infrt) + +add_executable(print-ir print_ir.cc) +target_link_libraries(print-ir infrt ${mlir_libs}) +add_dependencies(print-ir pd_ops_inc) + + +# MLIR opt tests +# %{ +set(infrt_opt_path ${CMAKE_BINARY_DIR}/infrt/dialect/infrtopt) + +add_test(test_infrt_mlir_opt_on_basic ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/basic.mlir) +add_test(test_infrt_mlir_opt_on_tensor_shape ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/tensor_shape.mlir) +add_test(test_infrt_mlir_opt_on_paddle_ops + ${infrt_opt_path} + ${CMAKE_SOURCE_DIR}/infrt/dialect/mlir_tests/paddle_ops.mlir) +# %} + +cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) + +# execute mlir and run FileCheck +infrt_exec_check(run_and_check_tensor_type mlir_tests/tensor_type.mlir) +infrt_exec_check(run_and_check_basic mlir_tests/basic.mlir) +infrt_exec_check(run_and_check_benchmark 
mlir_tests/benchmark.mlir) +#infrt_exec_check(run_and_check_dense_tensor mlir_tests/dense_tensor.mlir) +add_test(test_infrt_mlir_dense_tensor + ${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec + -i + ${CMAKE_CURRENT_SOURCE_DIR}/mlir_tests/dense_tensor.mlir) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc new file mode 100644 index 00000000000000..b4d2b9182b0c50 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/basic_kernels.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/dense_tensor.h" + +namespace infrt::dialect { +using namespace mlir; // NOLINT + +static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SymbolRefAttr callee_attr; + FunctionType callee_type; + SmallVector operands; + auto callee_loc = parser.getNameLoc(); + if (parser.parseAttribute(callee_attr, "callee", result.attributes) || + parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(callee_type) || + parser.addTypesToList(callee_type.getResults(), result.types) || + parser.resolveOperands( + operands, callee_type.getInputs(), callee_loc, result.operands)) + return failure(); + return success(); +} + +static ParseResult parseConstantOp(Type attrType, + OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + Attribute valueAttr; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(valueAttr, attrType, "value", result.attributes) || + parser.addTypeToList(attrType, result.types)) + return failure(); + return success(); +} + +static ParseResult parseConstantF32Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + FloatType::getF32(result.getContext()), parser, result); +} +static ParseResult parseConstantF64Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + FloatType::getF64(result.getContext()), parser, result); +} +static ParseResult parseConstantI32Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + IntegerType::get(32, result.getContext()), parser, result); +} +static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + return parseConstantOp( + IntegerType::get(64, result.getContext()), parser, result); +} + +static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SmallVector opInfo; + SmallVector types; + llvm::SMLoc loc = parser.getCurrentLocation(); + return failure(parser.parseOperandList(opInfo) || + (!opInfo.empty() && parser.parseColonTypeList(types)) || + parser.resolveOperands(opInfo, 
types, loc, result.operands)); +} + +static void print(OpAsmPrinter &p, CallOp op) { // NOLINT + p << "infrt.call " << op.getAttr("callee") << "("; + p.printOperands(op.getOperands()); + p << ")"; + p.printOptionalAttrDict(op.getAttrs(), {"callee"}); + p << " : "; +} + +static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT + p << op->getName() << " "; + p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); + + if (op->getAttrs().size() > 1) p << ' '; + Attribute attr = op->getAttr("value"); + if (auto int_attr = attr.dyn_cast()) { + bool is_signed = int_attr.getType().isIndex() || + int_attr.getType().getIntOrFloatBitWidth() != 1; + int_attr.getValue().print(p.getStream(), is_signed); + } else if (auto float_attr = attr.dyn_cast()) { + p << float_attr.getValue().convertToFloat(); + } else { + op->emitOpError("unknown attribute type"); + } +} + +static void print(OpAsmPrinter &p, ConstantF32Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantF64Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantI32Op op) { // NOLINT + printConstant(p, op); +} +static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT + printConstant(p, op); +} + +static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT + p << "infrt.return"; + if (op.getNumOperands() > 0) { + p << ' '; + p.printOperands(op.getOperands()); + p << " : "; + llvm::interleaveComma(op.getOperands(), p); + } +} + +static LogicalResult verify(CallOp op) { return success(); } + +static LogicalResult verify(ConstantF32Op op) { return success(); } +static LogicalResult verify(ConstantI32Op op) { return success(); } +static LogicalResult verify(ConstantF64Op op) { return success(); } +static LogicalResult verify(ConstantI64Op op) { return success(); } + +static LogicalResult verify(ReturnOp op) { + auto function = dyn_cast(op.getParentOp()); + + if (!function) return success(); + + auto results = function.getType().getResults(); + if (op.getNumOperands() != results.size()) + return op.emitOpError("has ") + << op.getNumOperands() + << " operands, but enclosing function returns " << results.size(); + + return success(); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/basic_kernels.cpp.inc" + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/basic_kernels.h new file mode 100644 index 00000000000000..65316bc1437c02 --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.h @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
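+
+// Editorial note (illustrative, not part of the original change): the custom
+// parsers/printers defined in basic_kernels.cc above round-trip textual forms
+// such as
+//
+//   %v = infrt.constant.f32 1.0
+//   %r = infrt.call @add(%v, %v) : (f32, f32) -> f32
+//   infrt.return %r : f32
+//
+// which are exercised by the mlir_tests added later in this patch.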
+ +#pragma once +#include +#include + +using namespace mlir; // NOLINT + +namespace infrt::dialect { +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/basic_kernels.hpp.inc" +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td new file mode 100644 index 00000000000000..df5e4d8a2c6a1c --- /dev/null +++ b/paddle/infrt/dialect/basic_kernels.td @@ -0,0 +1,139 @@ +// Operation definitions for basic kernels. + +#ifdef BASIC_OPS +#else +#define BASIC_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +class INFRT_Op traits = []> : Op { + + // Each registered op needs to provide all of a printer, parser and verifier. + let printer = [{ return infrt::dialect::print(p, *this); }]; + let verifier = [{ return infrt::dialect::verify(*this); }]; + let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }]; +} + +def CallOp : INFRT_Op<"call"> { + let summary = "call a host operation"; + let description = [{ + The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. + + %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); + + let extraClassDeclaration = [{ + StringRef getCallee() { return callee(); } + mlir::FunctionType getCalleeType(); + }]; +} + +class ConstantOp + : INFRT_Op<"constant." # suffix, [NoSideEffect]> { + let summary = "constant value constructor in host"; + + let arguments = (ins attr:$value); + let results = (outs baseType); +} + +def ConstantI32Op : ConstantOp<"i32", I32, I32Attr>; +def ConstantI64Op : ConstantOp<"i64", I64, I64Attr>; +def ConstantF32Op : ConstantOp<"f32", F32, F32Attr>; +def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; + +def ReturnOp : INFRT_Op<"return", [Terminator]> { + let summary = "host executor return operation"; + let description = [{ + The "infrt.return" operation represents a return operation within a function. + + func @foo() : (i32, f8) { + infrt.return %0, %1 : i32, f8 + } + }]; + + let arguments = (ins Variadic:$operands); + + let builders = [OpBuilder< + "OpBuilder &b, OperationState &result", + [{ build(b, result, llvm::None); }]>]; +} + +class AddOp : INFRT_Op<"add." # suffix, [NoSideEffect]> { + let summary = "infrt.add operation"; + let description = [{ + An operation that takes two inputs and returns their sum as result. + }]; + + let arguments = (ins type, type); + let results = (outs type); + let assemblyFormat = "operands attr-dict"; + let verifier = ?; +} + +def AddI32Op : AddOp<"i32", I32>; +def AddI64Op : AddOp<"i64", I64>; +def AddF32Op : AddOp<"f32", F32>; +def AddF64Op : AddOp<"f64", F64>; + +class MulOp : INFRT_Op<"mul." # suffix, [NoSideEffect]> { + let summary = "infrt.mul operation"; + let description = [{ + An operation that takes two inputs and returns their mul as result. + }]; + + let arguments = (ins type, type); +let results = (outs type); +let assemblyFormat = "operands attr-dict"; +let verifier = ?; +} + +def MulI32Op : MulOp<"i32", I32>; +def MulI64Op : MulOp<"i64", I64>; +def MulF32Op : MulOp<"f32", F32>; +def MulF64Op : MulOp<"f64", F64>; + +class PrintOp : INFRT_Op<"print." # suffix> { + let summary = "infrt.print operation"; + let description = [{ + An operation takes a number as input and prints to stdout. 
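+
+    For example (illustrative, mirroring the tests added later in this patch):
+
+      "infrt.print.f32"(%value) : (f32) -> ()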
+ }]; + + let arguments = (ins type); + let assemblyFormat = "operands attr-dict"; + let verifier = ?; +} + +//def PrintI32Op : PrintOp<"i32", I32>; +//def PrintI64Op : PrintOp<"i64", I64>; +def PrintF32Op : PrintOp<"f32", F32>; +//def PrintF64Op : PrintOp<"f64", F64>; + +def GetStringOp : INFRT_Op<"get_string"> { + let summary = "infrt.get_string"; + let description = [{ + Get a !infrt.string value from the given string attribute. + }]; + + let arguments = (ins StrAttr:$value); + let results = (outs StringType); + let assemblyFormat = "`(` $value `)` attr-dict"; + let verifier = ?; +} + +def PrintStringOp : INFRT_Op<"print_string"> { + let summary = "infrt.print_string"; + let description = [{ + An operation that prints a string. + }]; + + let arguments = (ins StringType:$input); + let results = (outs); + let assemblyFormat = "`(` $input `)` attr-dict"; + let verifier = ?; +} + +#endif // basic kernels diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc new file mode 100644 index 00000000000000..629a7b16523fca --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -0,0 +1,277 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/dense_tensor.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/tensor_shape.h" + +namespace infrt::dt { + +void DTDialect::initialize() { + allowUnknownTypes(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/dense_tensor.cpp.inc" + >(); +} + +namespace detail { +struct TensorTypeStorage : public mlir::TypeStorage { + TensorTypeStorage(TargetType target, + LayoutType layout, + PrecisionType precision) + : target_(target), layout_(layout), precision_(precision) {} + + using KeyTy = std::tuple; + + bool operator==(const KeyTy &key) const { + return key == KeyTy(target_, layout_, precision_); + } + + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + static TensorTypeStorage *construct( + mlir::TypeStorageAllocator &allocator, // NOLINT + const KeyTy &key) { + return new (allocator.allocate()) + TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + TargetType target_; + LayoutType layout_; + PrecisionType precision_; +}; +} // namespace detail + +llvm::Optional GetTargetType(mlir::StringRef key) { + if (key.equals_lower("x86")) + return TargetType::X86; + else if (key.equals_lower("cuda")) + return TargetType::CUDA; + else + return llvm::None; +} + +llvm::Optional GetLayoutType(mlir::StringRef key) { + if (key.equals_lower("nchw")) + return LayoutType::NCHW; + else if (key.equals_lower("nhwc")) + return LayoutType::NHWC; + else + return llvm::None; +} + +llvm::Optional GetPrecisionType(mlir::StringRef key) { + if (key.equals_lower("i32")) + return PrecisionType::I32; + else if 
(key.equals_lower("f32")) + return PrecisionType::F32; + else + return llvm::None; +} + +TensorType TensorType::get(TargetType target, + LayoutType layout, + PrecisionType precision) { + return Base::get( + ::infrt::Global::getMLIRContext(), target, layout, precision); +} + +TargetType TensorType::target() { return getImpl()->target_; } + +LayoutType TensorType::layout() { return getImpl()->layout_; } + +PrecisionType TensorType::precision() { return getImpl()->precision_; } + +raw_ostream &operator<<(raw_ostream &os, TensorType tensorType) { + os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() + << ", " << tensorType.precision() << ">"; + return os; +} + +TensorMapType TensorMapType::get() { + return Base::get(::infrt::Global::getMLIRContext()); +} + +TensorMapType TensorMapType::get(mlir::MLIRContext *context) { + return Base::get(context); +} + +StringType StringType::get() { + return Base::get(::infrt::Global::getMLIRContext()); +} + +StringType StringType::get(mlir::MLIRContext *context) { + return Base::get(context); +} + +raw_ostream &operator<<(raw_ostream &os, TargetType type) { + switch (type) { + case (TargetType::X86): + os << "X86"; + break; + case (TargetType::CUDA): + os << "CUDA"; + break; + default: + os << "Unsupported"; + } + return os; +} + +raw_ostream &operator<<(raw_ostream &os, LayoutType type) { + switch (type) { + case (LayoutType::NCHW): + os << "NCHW"; + break; + case (LayoutType::NHWC): + os << "NHWC"; + break; + default: + os << "Unsupported"; + } + return os; +} + +raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { + switch (type) { + case (PrecisionType::I32): + os << "I32"; + break; + case (PrecisionType::F32): + os << "F32"; + break; + default: + os << "Unsupported"; + } + return os; +} + +static Type getTensorType(mlir::MLIRContext *context) { + auto t_dialect = Identifier::get("t", context); + return OpaqueType::get(t_dialect, "tensor", context); +} + +static ParseResult parseCreateUninitTensorOp( + OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + auto loc = parser.getCurrentLocation(); + ::mlir::Type outputRawTypes[1]; + ::llvm::ArrayRef<::mlir::Type> outputTypes(outputRawTypes); + + mlir::ArrayAttr shapeAttr; + if (parser.parseAttribute(shapeAttr, + parser.getBuilder().getI64Type(), + "shape", + result.attributes)) + return failure(); + if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + + if (parser.parseArrow()) return failure(); + if (parser.parseType(outputRawTypes[0])) return failure(); + if (!outputRawTypes[0].isa()) + return parser.emitError(loc, "invalid kind of type specified"); + result.addTypes(outputTypes); + return success(); +} + +template +static void printCreateUninitTensorOp(OpAsmPrinter &p, // NOLINT + CreateUninitTensorOp op) { + p << CreateUninitTensorOp::getOperationName(); + p << " "; + p.printAttributeWithoutType(op.shapeAttr()); + p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"shape"}); + p << " -> "; + p << op.getOperation()->getResultTypes(); +} + +// TODO(shibo): can be removed? 
+// static ParseResult parseFillTensorWithConstantOp(OpAsmParser& parser, +// OperationState& result) { +// auto loc = parser.getCurrentLocation(); +// ::mlir::OpAsmParser::OperandType inputRawOperands[1]; +// ::llvm::ArrayRef<::mlir::OpAsmParser::OperandType> +// inputOperands(inputRawOperands); +// ::mlir::Type inputRawTypes[1]; +// ::llvm::ArrayRef<::mlir::Type> inputTypes(inputRawTypes); +// +// if (parser.parseOperand(inputRawOperands[0])) return failure(); +// +// if (parser.parseColon()) return failure(); +// if (parser.parseType(inputRawTypes[0])) return failure(); +// if (!inputRawTypes[0].isa()) +// return parser.emitError(loc, "invalid kind of type specified"); +// +// Attribute value_attr; +// if (parser.resolveOperands(inputOperands, inputTypes, loc, result.operands)) +// return failure(); +// if (parser.parseAttribute(value_attr, "value", result.attributes)) return +// failure(); +// return success(); +//} + +// TODO(shibo): can be removed? +// template +// static void printFillTensorWithConstantOp(OpAsmPrinter& p, FillTensorOp op) { +// p << FillTensorOp::getOperationName(); +// p << " "; +// p.printOperand(op.getOperand()); +// p << " : "; +// p << op.getOperation()->getOperandTypes(); +// p << " "; +// p << op.getAttr("value"); +//} + +static ParseResult parseSetTensorOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + SmallVector operands; + if (parser.parseOperandList(operands, 1)) return failure(); + + auto tensor_type = getTensorType(result.getContext()); + + Attribute value_attr; + return failure( + parser.resolveOperand(operands[0], tensor_type, result.operands) || + parser.parseAttribute(value_attr, "values", result.attributes)); +} + +template +static void printSetTensorOp(OpAsmPrinter &p, SetTensorOp op) { // NOLINT + p << SetTensorOp::getOperationName() << " "; + p.printOperand(op.getOperand()); + p << " " << op.getAttr("values"); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/dense_tensor.cpp.inc" // NOLINT + +} // namespace infrt::dt diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h new file mode 100644 index 00000000000000..866c62213ab058 --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
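+
+// Editorial sketch (not part of the original patch): the dense-tensor types
+// declared below use a textual form such as
+//
+//   !infrt.tensor<X86, NCHW, F32>
+//   !infrt.tensor_map
+//   !infrt.string
+//
+// where the three TensorType parameters are the TargetType, LayoutType and
+// PrecisionType enums declared in this header, as parsed and printed by
+// INFRTDialect::parseType/printType further down in this patch.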
+ +#pragma once +#include +#include +#include + +#include + +using namespace mlir; // NOLINT +namespace infrt::dt { + +namespace detail { +struct TensorTypeStorage; +} // namespace detail + +enum class TargetType : uint8_t { X86, CUDA }; +enum class LayoutType : uint8_t { NCHW, NHWC }; +enum class PrecisionType : uint8_t { I32, F32 }; + +llvm::Optional GetTargetType(mlir::StringRef key); +llvm::Optional GetLayoutType(mlir::StringRef key); +llvm::Optional GetPrecisionType(mlir::StringRef key); + +raw_ostream &operator<<(raw_ostream &os, TargetType type); +raw_ostream &operator<<(raw_ostream &os, LayoutType type); +raw_ostream &operator<<(raw_ostream &os, PrecisionType type); + +class TensorType : public mlir::Type::TypeBase { + public: + using Base::Base; + static TensorType get(TargetType target, + LayoutType layout, + PrecisionType precision); + + TargetType target(); + LayoutType layout(); + PrecisionType precision(); +}; + +raw_ostream &operator<<(raw_ostream &os, TensorType tensorType); + +class TensorMapType : public mlir::Type::TypeBase { + public: + using Base::Base; + static TensorMapType get(); + static TensorMapType get(mlir::MLIRContext *context); +}; + +class StringType + : public mlir::Type::TypeBase { + public: + using Base::Base; + static StringType get(); + static StringType get(mlir::MLIRContext *context); +}; + +#include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/dense_tensor.hpp.inc" + +} // namespace infrt::dt diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td new file mode 100644 index 00000000000000..07e70cb2ca1eea --- /dev/null +++ b/paddle/infrt/dialect/dense_tensor.td @@ -0,0 +1,150 @@ +#ifdef DT_OPS +#else +#define DT_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/tensor_shape_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def DT_Dialect : Dialect { + let name = "dt"; + + let description = [{ + The DenseTensor dialect. + }]; + + let cppNamespace = "::infrt::dt"; +} + +class DT_Op traits = []> : + Op; + +class CreateUninitTensorOp + : DT_Op<"create_uninit_tensor." # dtype, [NoSideEffect]> { + let summary = "dt.create_uninit_tensor operation"; + + let description = [{ + An operation that creates an uninitialized tensor. + }]; + + let arguments = (ins I64ArrayAttr:$shape); + let results = (outs TensorType:$output); + + let parser = [{ return infrt::dt::parseCreateUninitTensorOp(parser, result); }]; + let printer = [{ return infrt::dt::printCreateUninitTensorOp(p, *this); }]; +} + + +def ShallowCopyTensorOp + : DT_Op<"shallow_copy_tensor", [NoSideEffect]> { + let summary = "dt.shallow_copy_tensor operation"; + + let description = [{ + An operation that copy a tensor shallowly. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs TensorType:$output); + + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; +} + + +class FillTensorWithConstantOp : + DT_Op<"fill_tensor_with_constant." # dtype> { + let summary = "dt.fill_tensor_with_constant operation"; + + let description = [{ + An operation that fills an input tensor with a value. + }]; + + let arguments = (ins + TensorType:$input, + AnyAttr:$value + ); + let results = (outs); + + // TODO: can be removed? 
+ //let parser = [{ return infrt::dt::parseFillTensorWithConstantOp(parser, result); }]; + //let printer = [{ return infrt::dt::printFillTensorWithConstantOp(p, *this); }]; + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +def PrintTensorOp : DT_Op<"print_tensor"> { + let summary = "dt.print_tensor operation"; + + let description = [{ + An operation that prints a tensor. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +class SetTensorOp : + DT_Op<"set_tensor_with_constant_values." # dtype> { + let summary = "dt.set_tensor_with_constant_values operation"; + + let description = [{ + An operation that sets an input tensor with given values. + }]; + + let arguments = (ins TensorType); + let results = (outs); + + let parser = [{ return infrt::dt::parseSetTensorOp(parser, result); }]; + let printer = [{ return infrt::dt::printSetTensorOp(p, *this); }]; +} + +def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> { + let summary = "dt.load_params operation"; + + let description = [{ + An operation that can load tensors to TensorMap. + }]; + + // input path of model params. + let arguments = (ins StringType:$path); + let results = (outs TensorMapType); + + let assemblyFormat = "`(` operands `)` attr-dict"; + let verifier = ?; +} + +def GetParamOp : DT_Op<"get_param", [NoSideEffect]> { + let summary = "dt.get_param operation"; + + let description = [{ + An operation that can get a tensor from TensorMap. + }]; + + // input path of model params. + let arguments = (ins + TensorMapType:$map, + StrAttr:$name + ); + let results = (outs TensorType:$output); + let assemblyFormat = "`(` $map `,` $name `)` attr-dict `->` type($output)"; + let verifier = ?; +} + +def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { + let summary = "dt.get_tensor_shape operation"; + + let description = [{ + An operation that returns the shape of the input tensor. + }]; + + let arguments = (ins TensorType:$input); + let results = (outs TS_Shape:$output); + let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; +} + +foreach dtype = ["ui8", "ui16", "ui32", "ui64", "i32", "f32", "f64", "i64"] in { + def DT_CreateUninitTensorOp_#dtype : CreateUninitTensorOp; + def DT_FillTensorOp_#dtype : FillTensorWithConstantOp; + def DT_SetTensorOp_#dtype : SetTensorOp; +} + +#endif // DT_OPS diff --git a/paddle/infrt/dialect/diagnostic_utils.cc b/paddle/infrt/dialect/diagnostic_utils.cc new file mode 100644 index 00000000000000..a28176e38fdc71 --- /dev/null +++ b/paddle/infrt/dialect/diagnostic_utils.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/diagnostic_utils.h" + +#include + +namespace infrt::dialect { + +struct MyScopedDiagnosicHandler::Impl { + Impl() : diag_stream_(diag_str_) {} + + // String stream to assemble the final error message. 
+ std::string diag_str_; + llvm::raw_string_ostream diag_stream_; + + // A SourceMgr to use for the base handler class. + llvm::SourceMgr source_mgr_; + + // Log detail information. + bool log_info_{}; +}; + +MyScopedDiagnosicHandler::MyScopedDiagnosicHandler(mlir::MLIRContext *ctx, + bool propagate) + : mlir::SourceMgrDiagnosticHandler( + impl_->source_mgr_, ctx, impl_->diag_stream_), + impl_(new Impl) { + setHandler([this](mlir::Diagnostic &diag) { return this->handler(&diag); }); +} + +mlir::LogicalResult MyScopedDiagnosicHandler::handler(mlir::Diagnostic *diag) { + if (diag->getSeverity() != mlir::DiagnosticSeverity::Error && + !impl_->log_info_) + return mlir::success(); + emitDiagnostic(*diag); + impl_->diag_stream_.flush(); + return mlir::failure(true); +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/diagnostic_utils.h b/paddle/infrt/dialect/diagnostic_utils.h new file mode 100644 index 00000000000000..3a8098cf751812 --- /dev/null +++ b/paddle/infrt/dialect/diagnostic_utils.h @@ -0,0 +1,39 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include + +namespace infrt::dialect { + +/** + * A scoped diagnostic handler to help debug MLIR process. + */ +class MyScopedDiagnosicHandler : public mlir::SourceMgrDiagnosticHandler { + public: + MyScopedDiagnosicHandler(mlir::MLIRContext* ctx, bool propagate); + + mlir::LogicalResult handler(mlir::Diagnostic* diag); + + ~MyScopedDiagnosicHandler(); + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/dialect.cc b/paddle/infrt/dialect/dialect.cc new file mode 100644 index 00000000000000..cbcd5d0f0fa785 --- /dev/null +++ b/paddle/infrt/dialect/dialect.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace infrt::hlir::dialect { + +class CinnDialect : public ::mlir::Dialect { + public: + explicit CinnDialect(::mlir::MLIRContext* ctx); + + //! 
We should register this function in dialect + static llvm::StringRef getDialectNamespace() { + return "infrt::hlir::dialect"; + } +}; + +} // namespace infrt::hlir::dialect diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc new file mode 100644 index 00000000000000..b28ad5ad4b5a59 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt_base.h" + +#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/test_kernels.h" + +namespace infrt::dialect { + +// ----INFRTDialect definition begin---- +void INFRTDialect::initialize() { + allowUnknownTypes(); + allowUnknownOperations(); + + addTypes(); + addTypes(); + addTypes(); + + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/basic_kernels.cpp.inc" + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/test_kernels.cpp.inc" + >(); +} + +mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return mlir::Type(); + // parse TensorType, for example: !infrt.tensor + if (keyword == "tensor") { + llvm::StringRef target; + llvm::StringRef layout; + llvm::StringRef precision; + + // parse "<" + if (parser.parseLess()) return mlir::Type(); + // parse target + if (parser.parseKeyword(&target)) return mlir::Type(); + auto targetType = infrt::dt::GetTargetType(target); + if (!targetType) { + parser.emitError(parser.getCurrentLocation(), "unknown target type: ") + << target; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse layout + if (parser.parseKeyword(&layout)) return mlir::Type(); + auto layoutType = infrt::dt::GetLayoutType(layout); + if (!layoutType) { + parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") + << layout; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse precision + if (parser.parseKeyword(&precision)) return mlir::Type(); + auto precisionType = infrt::dt::GetPrecisionType(precision); + if (!precisionType) { + parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") + << precision; + return mlir::Type(); + } + + // parse ">" + if (parser.parseGreater()) return mlir::Type(); + + return infrt::dt::TensorType::get(*targetType, *layoutType, *precisionType); + } + // parse TensorMapType, for example: !infrt.tensor_map + if (keyword == "tensor_map") { + return infrt::dt::TensorMapType::get(); + } + // parse StringType, for example: !infrt.string + if (keyword == "string") { + return infrt::dt::StringType::get(); + } + + parser.emitError(parser.getCurrentLocation(), "unknown infrt type: ") + << keyword; + return mlir::Type(); +} + +void INFRTDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter 
&printer) const { + // print TensorType, for example: !infrt.tensor + if (type.isa()) { + auto tensorType = type.cast(); + printer << "tensor<" << tensorType.target() << ", " << tensorType.layout() + << ", " << tensorType.precision() << ">"; + return; + } + // print TensorMapType, for example: !infrt.tensor_map + if (type.isa()) { + printer << "tensor_map"; + return; + } + // print StringType, for example: !infrt.string + if (type.isa()) { + printer << "string"; + return; + } + llvm_unreachable("unknown infrt type."); +} + +// ----INFRTDialect definition end---- + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h new file mode 100644 index 00000000000000..1398378957069d --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.h @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt_base.hpp.inc" + +namespace infrt::dialect { + +class INFRTDialect : public ::mlir::Dialect { + explicit INFRTDialect(::mlir::MLIRContext *context) + : ::mlir::Dialect(getDialectNamespace(), + context, + ::mlir::TypeID::get()) { + initialize(); + } + + // parse types registered to the dialect. + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + // print types registered to the dialect. + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const override; + + void initialize(); + friend class ::mlir::MLIRContext; + + public: + static ::llvm::StringRef getDialectNamespace() { return "infrt"; } +}; + +} // namespace infrt::dialect + +namespace mlir { + +template +static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getIntegerAttr(b.getI32Type(), constant); +} + +static mlir::ValueRange cvtValueToValueRange(const mlir::Value &operand) { + return mlir::ValueRange(operand); +} + +static mlir::ValueRange concatTwoValueRange(mlir::ValueRange operand_0, + mlir::ValueRange operand_1) { + mlir::SmallVector<::mlir::Value, 4> operands; + operands.append(operand_0.begin(), operand_0.end()); + operands.append(operand_1.begin(), operand_1.end()); + return operands; +} + +} // namespace mlir diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td new file mode 100644 index 00000000000000..61dcfe5bfb1c37 --- /dev/null +++ b/paddle/infrt/dialect/infrt_base.td @@ -0,0 +1,42 @@ +#ifndef INFRT_BASE +#define INFRT_BASE + +include "mlir/IR/OpBase.td" + +def INFRT_Dialect : Dialect { + let name = "infrt"; + + let description = [{ + The INFRT host dialect. 
+ }]; + + let cppNamespace = "::infrt::dialect"; +} + +// Type definitions +def StringType : + Type()">, "!infrt.string type">, + BuildableType<"$_builder.getType<::infrt::dt::StringType>()">; + +def TensorType : + Type()">, "!infrt.tensor type">; + +def TensorMapType : + Type()">, "!infrt.tensor_map type">, + BuildableType<"$_builder.getType<::infrt::dt::TensorMapType>()">; + +def BufferType : OpaqueType<"b", "buffer", "buffer">; + +class INFRT_createI32Attr : NativeCodeCall< + "mlir::createI32Attr($_builder, $_loc, " # value # ")">; + +def INFRT_cvtValueToValueRange : NativeCodeCall< + "mlir::cvtValueToValueRange($0)">; + +def INFRT_concatTwoValueRange : NativeCodeCall< + "mlir::concatTwoValueRange($0, $1)">; + +class IsBoolAttrEq : Constraint< + CPred<"($0.getValue() ==" # value # ")">, + "Bool attrbute value constraint">; +#endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc new file mode 100644 index 00000000000000..4bc2bf70942d29 --- /dev/null +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +#include + +#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/tensor_shape.h" + +namespace infrt { + +void RegisterCinnDialects(mlir::DialectRegistry& registry) { // NOLINT + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_infrt_dialects.h new file mode 100644 index 00000000000000..50caca018980d0 --- /dev/null +++ b/paddle/infrt/dialect/init_infrt_dialects.h @@ -0,0 +1,23 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/IR/Dialect.h" + +namespace infrt { + +void RegisterCinnDialects(mlir::DialectRegistry& registry); // NOLINT + +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc new file mode 100644 index 00000000000000..8df8727dbe2b09 --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/mlir_loader.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "paddle/infrt/dialect/diagnostic_utils.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace infrt::dialect { + +mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, + const std::string& mlir_source) { + context->allowUnregisteredDialects(); + RegisterCinnDialects(context->getDialectRegistry()); + context->getDialectRegistry().insert(); + + mlir::ScopedDiagnosticHandler scope_handler( + context, [](mlir::Diagnostic& diag) { + if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) + return mlir::success(); + LOG(INFO) << "diag: " << diag.str(); + return mlir::failure(true); + }); + + auto res = mlir::parseSourceString( + llvm::StringRef(mlir_source.data(), mlir_source.length()), context); + CHECK(*res) << "failed to parse MLIR string"; + return res; +} + +mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, + mlir::MLIRContext* context) { + context->allowUnregisteredDialects(); + RegisterCinnDialects(context->getDialectRegistry()); + context->getDialectRegistry().insert(); + + mlir::ScopedDiagnosticHandler scope_handler( + context, [](mlir::Diagnostic& diag) { + if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) + return mlir::success(); + LOG(INFO) << "diag: " << diag.str(); + return mlir::failure(true); + }); + + return mlir::parseSourceFile(std::string(file_name), context); +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h new file mode 100644 index 00000000000000..092da7d9ce03f6 --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
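+
+// Editorial usage sketch (not part of the original patch; the file path is a
+// placeholder and only the declarations below are assumed):
+//
+//   mlir::MLIRContext context;
+//   auto module =
+//       infrt::dialect::LoadMlirFile("/path/to/module.mlir", &context);
+//   module->verify();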
+ +#pragma once + +#include +#include +#include + +#include + +namespace infrt::dialect { + +mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, + const std::string& mlir_source); +mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, + mlir::MLIRContext* context); + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc new file mode 100644 index 00000000000000..1b622d585ad8ee --- /dev/null +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/mlir_loader.h" + +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace infrt::dialect { + +TEST(MlirLoader, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v0) : (f32) -> () + + infrt.return %value : f32 +} +)ROC"; + + auto module = LoadMlirSource(&context, source); + module->verify(); + + LOG(INFO) << "module name: " << module->getOperationName().data(); + for (auto func : module->getOps()) { + LOG(INFO) << "get func " << func.getName().str(); + int num_args = func.getNumArguments(); + for (int i = 0; i < num_args; i++) { + LOG(INFO) << "arg: " << func.getArgument(i).getArgNumber(); + } + } +} + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/mlir_tests/basic.mlir b/paddle/infrt/dialect/mlir_tests/basic.mlir new file mode 100644 index 00000000000000..84b9b0fbd71cbd --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/basic.mlir @@ -0,0 +1,40 @@ +// CHECK-LABEL: @basic_f32 +func @basic_f32() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK-NEXT: 3 + "infrt.print.f32"(%value) : (f32) -> () + + infrt.return %value : f32 +} + +/// ================================================================ +/// @caller call the other function @callee +func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { + %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + infrt.return %z1 : f32 +} + +// CHECK-LABEL: @caller.add.f32 +func @caller.add.f32() -> f32 { + %x = infrt.constant.f32 1.0 + %y = infrt.constant.f32 2.0 + %y1 = infrt.constant.f32 3.0 + %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + + // CHECK-NEXT: 6 + "infrt.print.f32"(%z) : (f32) -> () + infrt.return %z : f32 +} +/// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +// CHECK-LABEL: @string_test +func @string_test() { + %path = infrt.get_string("this is get_string op.") + // CHECK-LABEL: string = this is get_string op. 
+ infrt.print_string(%path) + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/benchmark.mlir b/paddle/infrt/dialect/mlir_tests/benchmark.mlir new file mode 100644 index 00000000000000..8b4530689df7e5 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/benchmark.mlir @@ -0,0 +1,23 @@ +// CHECK-LABEL: @benchmark +func @benchmark() { + // CHECK-LABEL: BM:add.f32:Count: 3 + // CHECK-LABEL: BM:add.f32:Duration(ns) + // CHECK-LABEL: BM:add.f32:Time Min(ns) + // CHECK-LABEL: BM:add.f32:Time 50%(ns) + // CHECK-LABEL: BM:add.f32:Time 95%(ns) + // CHECK-LABEL: BM:add.f32:Time 99%(ns) + // CHECK-LABEL: BM:add.f32:CPU Min(ns) + // CHECK-LABEL: BM:add.f32:CPU 50%(ns) + // CHECK-LABEL: BM:add.f32:CPU 95%(ns) + // CHECK-LABEL: BM:add.f32:CPU 99%(ns) + // CHECK-LABEL: BM:add.f32:CPU utilization(percent) + infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + { + %0 = infrt.constant.f32 1.0 + %1 = infrt.constant.f32 2.0 + %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "infrt.print.f32"(%res) : (f32) -> () + infrt.return %res : f32 + } + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir b/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir new file mode 100644 index 00000000000000..cca7445cd58d8f --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/dense_tensor.mlir @@ -0,0 +1,22 @@ +func @dense_shape0() { + %shape = ts.build_shape [1:i64, 57:i64] + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + + infrt.return +} + +func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor + + infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +} + + +func @main() { + %shape = ts.build_shape [1:i64, 57:i64] + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + + %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir new file mode 100644 index 00000000000000..1855a68dd91c34 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/paddle_ops.mlir @@ -0,0 +1,8 @@ +func @ops() { + %a = pd.Feed() : tensor + %b = pd.Feed() : tensor + + %c = "pd.Matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor + + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir new file mode 100644 index 00000000000000..c984fda3e62111 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -0,0 +1,24 @@ +// CHECK-LABEL: @main +func @main() -> tensor { + %a = "pd.Feed"() : () -> tensor + %b = "pd.Feed"() : () -> tensor + %bias = "pd.Feed"() : () -> tensor + + %b1 = "pd.Feed"() : () -> tensor + %b2 = "pd.Feed"() : () -> tensor + %bias1 = "pd.Feed"() : () -> tensor + %bias2 = "pd.Feed"() : () -> tensor + + %c = "pd.Matmul"(%a, %b) {transpose_y=false} : (tensor, tensor) -> tensor + %d = "pd.ElementwiseAdd"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %e = "pd.Relu6"(%d) {} : (tensor) -> tensor + + %c1 = "pd.Matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor + %d1 = "pd.ElementwiseAdd"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %e1 = "pd.Relu"(%d1) {} : (tensor) -> tensor + + %c2 = "pd.Matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, 
tensor) -> tensor + %d2 = "pd.ElementwiseAdd"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %e2 = "pd.Relu"(%d2) {} : (tensor) -> tensor + infrt.return %e2 : tensor +} \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir new file mode 100644 index 00000000000000..d41d4b2f9f6bc3 --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -0,0 +1,15 @@ +// CHECK-LABEL: @main +func @main() -> tensor { + %a = "pd.Feed"() : () -> tensor + %filter = "pd.Constant"(){value = dense<1.000000e+00> : tensor<3x64x3x3xf32>} : () -> tensor<3x64x3x3xf32> + %bias = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + + %scale = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %bias2 = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %mean = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + %var = "pd.Constant"(){value = dense<1.000000e+00> : tensor<64xf32>} : () -> tensor<64xf32> + + %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor + %d = "pd.batch_norm"(%c, %scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor + infrt.return %d : tensor +} \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/tensor_map.mlir b/paddle/infrt/dialect/mlir_tests/tensor_map.mlir new file mode 100644 index 00000000000000..111c01c9a108ba --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_map.mlir @@ -0,0 +1,31 @@ +// CHECK-LABEL: @predict +func @predict(%input:!infrt.tensor, %map: !infrt.tensor_map) -> (!infrt.tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.tensor + + %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + + // fc + "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + //dt.print_tensor (%out : !infrt.tensor) + + infrt.return %out : !infrt.tensor +} + +// CHECK-LABEL: @main +func @main() { + %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %path = infrt.get_string("/infrt/build/paddle/paddle_1.8_fc_model") + // CHECK-LABEL: loading params + %map = dt.load_params(%path) + + %out = infrt.call @predict(%input, %map): (!infrt.tensor, !infrt.tensor_map) -> (!infrt.tensor) + dt.print_tensor (%out : !infrt.tensor) + + infrt.return +} + diff --git a/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir b/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir new file mode 100644 index 00000000000000..504b5b36be038f --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_shape.mlir @@ -0,0 +1,5 @@ +func @build_tensor1() { + %a = ts.build_shape [1:i64, 57:i64, 92:i64] + ts.print_shape %a + infrt.return +} diff --git a/paddle/infrt/dialect/mlir_tests/tensor_type.mlir b/paddle/infrt/dialect/mlir_tests/tensor_type.mlir new file mode 100644 index 00000000000000..c331097ab1072c --- /dev/null +++ b/paddle/infrt/dialect/mlir_tests/tensor_type.mlir @@ -0,0 +1,9 @@ +// CHECK-LABEL: test_tensor_type +func 
@test_tensor_type() { + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%a : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/dialect/ops.td b/paddle/infrt/dialect/ops.td new file mode 100644 index 00000000000000..264134a447c63f --- /dev/null +++ b/paddle/infrt/dialect/ops.td @@ -0,0 +1,6 @@ +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt_base.td" + + +class INFRT_Op traits = []> : + Op; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc new file mode 100644 index 00000000000000..d90d25230d0c24 --- /dev/null +++ b/paddle/infrt/dialect/opt.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/mlir_loader.h" + +int main(int argc, char **argv) { + mlir::MLIRContext *context = infrt::Global::getMLIRContext(); + + auto ®istry = context->getDialectRegistry(); + infrt::RegisterCinnDialects(registry); + + mlir::registerCanonicalizerPass(); + + return mlir::failed( + mlir::MlirOptMain(argc, argv, "INFRT mlir pass driver", registry)); +} diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td new file mode 100644 index 00000000000000..af53df113dfb3e --- /dev/null +++ b/paddle/infrt/dialect/pd_op_base.td @@ -0,0 +1,77 @@ +// This file defines some basic elements of Paddle(alias pd) dialect. +// We learned much from TensorFlow mlir dialect https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td + +#ifndef PD_OP_BASE +#define PD_OP_BASE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def PD_Dialect : Dialect { + let name = "pd"; + + let description = [{ + The PaddlePaddle dialect. + + This dialect contains the PaddlePaddle operators. 
+ }]; + + let cppNamespace = "::mlir::pd"; +} + +class PD_Op traits = []> : + Op; + + +class PD_PaddleAttr : + Attr()">, + "PaddlePaddle " # description # " attribute">; + + +//===----------------------------------------------------------------------===// +// PaddlePaddle type definitions +//===----------------------------------------------------------------------===// + +def PD_PDDialectType : Type()">, "PaddlePaddle type">; + +class PD_PaddleType : + Type()">, + "Paddle " # description # " type">, + BuildableType<"getType()">; + +//===----------------------------------------------------------------------===// +// Integer types +def PD_Bool : AnyTypeOf<[I<1>], "bool">; +def PD_Int8 : AnyTypeOf<[I8], "8-bit integer">; +def PD_Int16 : AnyTypeOf<[I16], "16-bit integer">; +def PD_Int32 : AnyTypeOf<[I32], "32-bit integer">; +def PD_Int64 : AnyTypeOf<[I64], "64-bit integer">; + +def PD_UInt8 : AnyTypeOf<[UI<8>], "8-bit unsigned integer">; +def PD_UInt16 : AnyTypeOf<[UI<16>], "16-bit unsigned integer">; +def PD_UInt32 : AnyTypeOf<[UI<32>], "32-bit unsigned integer">; +def PD_UInt64 : AnyTypeOf<[UI<64>], "64-bit unsigned integer">; + +def PD_SInt : AnyTypeOf<[PD_Int8, PD_Int16, PD_Int32, PD_Int64], "signed integer">; +def PD_UInt : AnyTypeOf<[PD_UInt8, PD_UInt16, PD_UInt32, PD_UInt64], "unsigned integer">; +def PD_Int : AnyTypeOf<[PD_SInt, PD_UInt], "integer">; + +// Float types +def PD_Float16 : AnyTypeOf<[F16], "16-bit float">; +def PD_Float32 : AnyTypeOf<[F32], "32-bit float">; +def PD_Float64 : AnyTypeOf<[F64], "64-bit float">; + +def PD_Float : AnyTypeOf<[PD_Float16, PD_Float32, PD_Float64], "floating-point">; + + +// Tensor types + +def PD_ElementType : Type, + "pd.dtype">; + +def PD_Tensor : TensorOf<[PD_ElementType]>; + + +#endif // PD_OP_BASE diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc new file mode 100644 index 00000000000000..7ca07dd5fcbba4 --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
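The ODS records above group the dialect's scalar types into reusable constraints: PD_Int and PD_Float collect the integer and floating-point widths, PD_ElementType is their union with bool, and PD_Tensor accepts any tensor whose elements satisfy that predicate. The stand-alone sketch below mirrors the same grouping with an invented enum so the nesting is easy to follow; it is only an illustration, not code generated from these TableGen records.

#include <iostream>

// Hypothetical stand-in for the element types grouped by PD_Int / PD_Float /
// PD_ElementType above; all names here are invented for this sketch.
enum class PDType { Bool, I8, I16, I32, I64, U8, U16, U32, U64, F16, F32, F64 };

bool isPDInt(PDType t) {
  switch (t) {
    case PDType::I8: case PDType::I16: case PDType::I32: case PDType::I64:
    case PDType::U8: case PDType::U16: case PDType::U32: case PDType::U64:
      return true;
    default:
      return false;
  }
}

bool isPDFloat(PDType t) {
  return t == PDType::F16 || t == PDType::F32 || t == PDType::F64;
}

// PD_ElementType accepts bool, any integer, or any float; PD_Tensor is then
// "a tensor whose element type satisfies this predicate".
bool isPDElementType(PDType t) {
  return t == PDType::Bool || isPDInt(t) || isPDFloat(t);
}

int main() {
  std::cout << std::boolalpha
            << isPDElementType(PDType::F32) << " "   // true
            << isPDElementType(PDType::U16) << "\n"; // true
}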
+ +#include "paddle/infrt/dialect/pd_ops.h" + +#include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" +#include "paddle/infrt/dialect/infrt_base.h" + +namespace mlir { +namespace pd { + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" +#undef GET_OP_CLASSES + +PaddleDialect::PaddleDialect(MLIRContext *context) + : Dialect("pd", context, TypeID::get()) { + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT + >(); +#undef GET_OP_LIST +} + +mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, + mlir::Attribute value, + mlir::Type type, + mlir::Location loc) { + return builder.create(loc, value); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT +#undef GET_OP_CLASSES + +#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT + +void ConstantOp::build(OpBuilder &builder, + OperationState &state, + Attribute value) { + if (auto elem_attr = value.dyn_cast()) { + return ConstantOp::build(builder, state, elem_attr); + } else if (value.isa()) { + ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", DenseElementsAttr::get(type, value)); + state.addTypes(type); + return; + } + llvm_unreachable("unsupported attribute type for building pd.constant"); +} + +LogicalResult ConstantOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(attributes.get("value").getType()); + return success(); +} +::mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef<::mlir::Attribute> operands) { + return value(); +} + +LogicalResult ElementwiseAdd::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} +void ElementwiseAdd::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +::mlir::OpFoldResult ElementwiseAdd::fold( + llvm::ArrayRef operands) { + if (getElementTypeOrSelf(getType()).isa()) { + if (!operands[0] || !operands[1]) return {}; + DenseElementsAttr lhs = operands[0].dyn_cast(); + DenseElementsAttr rhs = operands[1].dyn_cast(); + if (!lhs || !rhs) return {}; + ShapedType type = getType().template cast(); + if (!type.hasStaticShape()) return {}; + Type etype = type.getElementType(); + if (!etype.isa()) return {}; + SmallVector values; + values.reserve(lhs.getNumElements()); + for (const auto zip : + llvm::zip(lhs.getValues(), rhs.getValues())) { + values.push_back( + std::plus()(std::get<0>(zip), std::get<1>(zip))); + } + return DenseElementsAttr::get(type, values); + } + return {}; +} + +LogicalResult ElementwiseDiv::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult ElementwiseMul::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult 
ElementwiseSub::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +LogicalResult MulOp::inferReturnTypes( + MLIRContext *context, + Optional location, + ValueRange operands, + DictionaryAttr attributes, + RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + inferredReturnTypes.push_back(operands[0].getType()); + return success(); +} + +void ReluOp::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +void FusedRepeatedFCRelu::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +void BatchNormOp::getCanonicalizationPatterns( + ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + results.insert(context); +} + +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h new file mode 100644 index 00000000000000..d09b6032257a22 --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/Dialect/Traits.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +namespace mlir { +namespace pd { + +class PaddleDialect : public Dialect { + public: + explicit PaddleDialect(MLIRContext* context); + + static StringRef getDialectNamespace() { return "pd"; } + + /// A hook used to materialize constant values with the given type. 
+ Operation* materializeConstant(OpBuilder& builder, + Attribute value, + Type type, + Location loc) override; + + Type parseType(DialectAsmParser& parser) const override { + return Dialect::parseType(parser); + } + void printType(Type type, DialectAsmPrinter& printer) const override { + Dialect::printType(type, printer); + } +}; + +} // namespace pd +} // namespace mlir diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td new file mode 100644 index 00000000000000..9e906ad0c02ccd --- /dev/null +++ b/paddle/infrt/dialect/pd_ops.td @@ -0,0 +1,182 @@ +#ifndef PD_OPS +#define PD_OPS + +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/LoopLikeInterface.td" +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/pd_op_base.td" + +def PD_FeedOp : PD_Op<"Feed", [NoSideEffect]> { + let summary = "Feed Op"; + + let description = [{ + Feed a tensor into the model. + }]; + + let arguments = (ins); + let results = (outs PD_Tensor:$out); + + let assemblyFormat = [{ + `(` `)` attr-dict `:` type($out) + }]; +} + +def PD_ConstantOp : PD_Op<"Constant", [NoSideEffect, ConstantLike, DeclareOpInterfaceMethods, AllTypesMatch<["value", "output"]>]> { + let summary = "constant Op"; + let description = [{}]; + + let arguments = (ins ElementsAttr:$value); + let results = (outs PD_Tensor:$output); + let hasFolder = 1; + + let builders = [ + OpBuilder<"OpBuilder &builder, OperationState &state, Attribute value">, + ]; +} + +def PD_AbsOp : PD_Op<"Abs", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the absolute value of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_SqrtOp : PD_Op<"sqrt", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the sqrt value of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_ReluOp : PD_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Relu of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); + let hasCanonicalizer = 1; +} + +def PD_Relu6Op : PD_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Computes the Relu6 of a tensor"; + + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x); + let results = (outs PD_Tensor:$y); +} + +def PD_ElementwiseAdd : PD_Op<"ElementwiseAdd", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseAdd Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); + let hasCanonicalizer = 1; + let hasFolder = 1; +} + +def PD_ElementwiseSub : PD_Op<"ElementwiseSub", [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseSub Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); +} + +def PD_ElementwiseMul : PD_Op<"ElementwiseMul", [NoSideEffect, Commutative, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseMul Op"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); +} + +def PD_ElementwiseDiv : PD_Op<"ElementwiseDiv", [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "ElementwiseDiv Op"; + let description = [{ + }]; 
+ + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, DefaultValuedAttr:$axis); + let results = (outs PD_Tensor:$out); +} + +def PD_MatmulOp : PD_Op<"Matmul", [NoSideEffect]> { + let summary = "Computes the matrix mulplication result of two tensors"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y, + DefaultValuedAttr:$transpose_x, + DefaultValuedAttr:$transpose_y, + DefaultValuedAttr:$alpha); + let results = (outs PD_Tensor:$out); + + //let hasCanonicalizer = 1; +} + +def PD_MulOp : PD_Op<"mul", [NoSideEffect, DeclareOpInterfaceMethods]> { + let summary = "paddle mul op"; + let description = [{}]; + + let arguments = (ins PD_Tensor:$x, PD_Tensor:$y); + let results = (outs PD_Tensor:$out); + + //let hasCanonicalizer = 1; +} + +def PD_Conv2dOp : PD_Op<"conv2d", [NoSideEffect]> { + let summary = "paddle conv2d operation"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$Input, PD_Tensor:$Filter, PD_Tensor:$Bias); + let results = (outs PD_Tensor:$Output); + + //let hasCanonicalizer = 1; +} + +def PD_BatchNormOp : PD_Op<"batch_norm", [NoSideEffect]> { + let summary = "paddle batch_norm operation"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$X, PD_Tensor:$Scale, PD_Tensor:$Bias, + PD_Tensor:$Mean, PD_Tensor:$Variance, + DefaultValuedAttr:$epsilon); + let results = (outs PD_Tensor:$Y); + + let hasCanonicalizer = 1; +} + +def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> { + let summary = "Computes the Fully Connected result of two tensors"; + let description = [{ + }]; + + let arguments = (ins PD_Tensor:$input, PD_Tensor:$w, PD_Tensor:$bias, DefaultValuedAttr:$in_num_col_dims); + let results = (outs PD_Tensor:$out); +} + +def PD_FusedRepeatedFCRelu : PD_Op<"RepeatedFCRelu", [SameVariadicOperandSize, NoSideEffect]> { + let summary = ""; + let description = [{ }]; + + let arguments = (ins PD_Tensor:$input, Variadic:$w, Variadic:$bias); + let results = (outs PD_Tensor:$out); + let hasCanonicalizer = 1; +} + +#endif // PD_OPS diff --git a/paddle/infrt/dialect/pd_types.cc b/paddle/infrt/dialect/pd_types.cc new file mode 100644 index 00000000000000..94856e362d3019 --- /dev/null +++ b/paddle/infrt/dialect/pd_types.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/pd_types.h" diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h new file mode 100644 index 00000000000000..6f9fe56338a9fd --- /dev/null +++ b/paddle/infrt/dialect/pd_types.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file defines the types used in PaddlePaddle MLIR dialect. +// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in +// tensorflow). + +#pragma once + +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" + +namespace mlir { +namespace PD { + +class PaddleType : public Type { + public: + using Type::Type; + + static bool classof(Type type); +}; + +namespace detail { + +template +class PaddleTypeImpl : public Type::TypeBase { + public: + using Base = typename Type::TypeBase; + using PDBase = PaddleTypeImpl; + using Base::Base; +}; + +} // namespace detail + +#define HANDLE_PD_TYPE(pdtype, enumerant, name) \ + class pdtype##Type : public detail::PaddleTypeImpl { \ + public: \ + using PDBase::PDBase; \ + }; + +} // namespace PD +} // namespace mlir diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc new file mode 100644 index 00000000000000..3c5a2b6a7bf904 --- /dev/null +++ b/paddle/infrt/dialect/print_ir.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
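pd_ops.cc above gives pd.ElementwiseAdd a folder: when both operands are constant dense float tensors with a static shape, the op is replaced by a new constant holding the element-wise sums, and in every other case folding is declined. The sketch below restates that rule over plain std::vector buffers; ConstOperand and FoldElementwiseAdd are names invented for the illustration and stand in for the DenseElementsAttr operands of the real folder.

#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

// Stand-in for a folder operand: a flat value buffer plus a flag saying
// whether the operand is actually a compile-time constant.
struct ConstOperand {
  bool is_constant = false;
  std::vector<float> values;
};

// Mirrors the folding rule: only fold when both operands are constants of the
// same (static) size; otherwise decline and leave the op in place.
std::optional<std::vector<float>> FoldElementwiseAdd(const ConstOperand& lhs,
                                                     const ConstOperand& rhs) {
  if (!lhs.is_constant || !rhs.is_constant) return std::nullopt;
  if (lhs.values.size() != rhs.values.size()) return std::nullopt;
  std::vector<float> out(lhs.values.size());
  for (std::size_t i = 0; i < out.size(); ++i)
    out[i] = lhs.values[i] + rhs.values[i];
  return out;
}

int main() {
  ConstOperand a{true, {1.f, 2.f, 3.f}};
  ConstOperand b{true, {10.f, 20.f, 30.f}};
  if (auto folded = FoldElementwiseAdd(a, b))
    for (float v : *folded) std::cout << v << " ";  // 11 22 33
  std::cout << "\n";
}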
+ +#include + +#include "llvm/ADT/Optional.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Region.h" +#include "mlir/IR/Verifier.h" +#include "mlir/Parser.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/Passes.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/init_infrt_dialects.h" + +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +llvm::raw_ostream &printIndent(int indent = 0) { + for (int i = 0; i < indent; ++i) llvm::outs() << " "; + return llvm::outs(); +} + +void printOperation(mlir::Operation *op, int indent); +void printRegion(mlir::Region ®ion, int indent); // NOLINT +void printBlock(mlir::Block &block, int indent); // NOLINT + +void printOperation(mlir::Operation *op, int indent) { + llvm::Optional module_op = llvm::None; + if (llvm::isa(op)) + module_op = llvm::dyn_cast(op); + llvm::Optional func_op = llvm::None; + if (llvm::isa(op)) func_op = llvm::dyn_cast(op); + + printIndent(indent) << "op: '" << op->getName(); + // This getName is inherited from Operation::getName + if (module_op) { + printIndent() << "@" << module_op->getName(); + } + // This getName is inherited from SymbolOpInterfaceTrait::getName, + // which return value of "sym_name" in ModuleOp or FuncOp attributes. + if (func_op) { + printIndent() << "@" << func_op->getName(); + } + printIndent() << "' with " << op->getNumOperands() << " operands" + << ", " << op->getNumResults() << " results" + << ", " << op->getAttrs().size() << " attributes" + << ", " << op->getNumRegions() << " regions" + << ", " << op->getNumSuccessors() << " successors\n"; + if (!op->getAttrs().empty()) { + printIndent(indent) << op->getAttrs().size() << " attributes:\n"; + for (mlir::NamedAttribute attr : op->getAttrs()) { + printIndent(indent + 1) << "- {" << attr.first << " : " << attr.second + << "}\n"; + } + } + + if (op->getNumRegions() > 0) { + printIndent(indent) << op->getNumRegions() << " nested regions:\n"; + for (mlir::Region ®ion : op->getRegions()) { + printRegion(region, indent + 1); + } + } +} + +void printRegion(mlir::Region ®ion, int indent) { // NOLINT + printIndent(indent) << "Region with " << region.getBlocks().size() + << " blocks:\n"; + for (mlir::Block &block : region.getBlocks()) { + printBlock(block, indent + 1); + } +} + +void printBlock(mlir::Block &block, int indent) { // NOLINT + printIndent(indent) << "Block with " << block.getNumArguments() + << " arguments" + << ", " << block.getNumSuccessors() << " successors" + << ", " << block.getOperations().size() + << " operations\n"; + + for (mlir::Operation &operation : block.getOperations()) { + printOperation(&operation, indent + 1); + } +} + +int main(int argc, char **argv) { + mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + mlir::registerPassManagerCLOptions(); + cl::ParseCommandLineOptions(argc, argv, "mlir demo"); + + mlir::MLIRContext *context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + auto ®istry = context->getDialectRegistry(); + infrt::RegisterCinnDialects(registry); + + // mlir will verify 
module automatically after parsing. + // https://github.com/llvm/llvm-project/blob/38d18d93534d290d045bbbfa86337e70f1139dc2/mlir/lib/Parser/Parser.cpp#L2051 + // mlir::OwningModuleRef module_ref = mlir::parseSourceString(mlir_source, + // context); + mlir::OwningModuleRef module_ref = + mlir::parseSourceFile(inputFilename, context); + std::cout << "----------print IR Structure begin----------" << std::endl; + printOperation(module_ref->getOperation(), 0); + std::cout << "----------print IR Structure end----------" << std::endl; + + module_ref->dump(); + return 0; +} diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/rewrite.td new file mode 100644 index 00000000000000..aa81dd72d059b4 --- /dev/null +++ b/paddle/infrt/dialect/rewrite.td @@ -0,0 +1,90 @@ +#ifndef INFRT_REWRITE +#define INFRT_REWRITE + +include "paddle/infrt/dialect/infrt_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "paddle/infrt/dialect/pd_ops.td" + +//===----------------------------------------------------------------------===// +// This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'. +// +// We have: +// (Matmul) z = x * y +// (Add) out = z + bias +// +// which corresponds to the following computation: +// (FusedFC) out = x * y + bias +// +// Todo: +// 1. Make the constrait more completely. +// 2. Consider the case of : out = bias + z +//===----------------------------------------------------------------------===// +def FuseMulAdd : Pat<(PD_ElementwiseAdd (PD_MatmulOp $x, $y, $transpose_x, $transpose_y, $alpha), $bias, $axis), + (PD_FusedFC $x, $y, $bias, (INFRT_createI32Attr<"1">)), + [(IsBoolAttrEq<"false"> $transpose_x),(IsBoolAttrEq<"false"> $transpose_y)]>; + + +//===----------------------------------------------------------------------===// +// This is to fuse the composition: 'FusedFC o Relu' into 'FusedRepeatedFCRelu'. +// +// We have: +// (FusedFC) z = fc(x, y, bias) +// (Relu) out = relu(z) +// +// which corresponds to the following computation: +// (FusedRepeatedFCRelu) out = RepeatedFCRelu(x, [y], [bias]) +// +//===----------------------------------------------------------------------===// +def FuseFCRelu : Pat<(PD_ReluOp (PD_FusedFC $x, $y, $bias, $_)), + (PD_FusedRepeatedFCRelu $x, (INFRT_cvtValueToValueRange $y), (INFRT_cvtValueToValueRange $bias))>; + +//===----------------------------------------------------------------------===// +// This is to fold 'FusedRepeatedFCRelu' op. 
+// +// We have: +// (FusedRepeatedFCRelu) z = RepeatedFCRelu(x, [y, ...], [bias, ...]) +// (FusedRepeatedFCRelu) out = RepeatedFCRelu(z, [y1, ...], [bias1, ...]) +// +// which corresponds to the following computation: +// (FusedRepeatedFCRelu) out = RepeatedFCRelu(x, [y, ..., y1, ...], [bias, ..., bias1, ....]) +// +//===----------------------------------------------------------------------===// +def FuseRepeatedFCRelu2 : Pat<(PD_FusedRepeatedFCRelu (PD_FusedRepeatedFCRelu $x, $y, $bias), $y_2, $bias_2), + (PD_FusedRepeatedFCRelu $x, (INFRT_concatTwoValueRange $y, $y_2), (INFRT_concatTwoValueRange $bias, $bias_2))>; + + +//===----------------------------------------------------------------------===// +// This is to fuse the composition: 'BatchNorm o Conv' into 'Conv' +// by deriving new 'w' and 'b' for 'Conv': +// +// We have: +// (Conv) z = w * x + b +// (BatchNorm) y = scale * (z - mean) / sqrt(var + eps) + bias +// +// which corresponds to the following computation: +// y = w_ * x + b_ +// where +// w_ = scale * w / sqrt(var + eps) +// b_ = B + scale * (b - mean) / sqrt(var + eps) +// +//===----------------------------------------------------------------------===// +def FuseBatchNormWithConvPattern: Pat< + (PD_BatchNormOp + (PD_Conv2dOp $input, $filter, $bias), + $scale, $bias_2, $mean, $var, $epsilon), + (PD_Conv2dOp + $input, + (PD_MulOp $filter, + (PD_ElementwiseDiv:$coefficientW + $scale, + (PD_SqrtOp (PD_ElementwiseAdd $var, (PD_ConstantOp $epsilon), (INFRT_createI32Attr<"1">))), + (INFRT_createI32Attr<"1">))), + (PD_ElementwiseAdd + $bias, + (PD_MulOp + (PD_ElementwiseSub $bias, $mean, (INFRT_createI32Attr<"1">)), + $coefficientW), + (INFRT_createI32Attr<"1">))) +>; + +#endif // INFRT_REWRITE diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc new file mode 100644 index 00000000000000..ef5a5525cb22f3 --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
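The FuseBatchNormWithConvPattern above rewrites batch_norm(conv2d(x)) into a single conv2d by scaling the filter and shifting the bias, with k = scale / sqrt(var + eps), w_ = w * k, and b_ = bn_bias + (b - mean) * k (the comment's B is the batch-norm bias operand). The small numeric check below runs one output channel as scalars and confirms the fused parameters reproduce the original conv-then-batchnorm result; the values are arbitrary and the code is only a sanity check of the algebra, not part of the pass.

#include <cassert>
#include <cmath>
#include <iostream>

// One output channel of the pattern, reduced to scalars:
//   conv:       z = w * x + b
//   batch norm: y = scale * (z - mean) / sqrt(var + eps) + bn_bias
// The fused conv uses w_ = w * k and b_ = bn_bias + (b - mean) * k,
// with k = scale / sqrt(var + eps).
int main() {
  const float w = 0.8f, b = 0.1f;             // conv parameters
  const float scale = 1.5f, bn_bias = -0.2f;  // batch-norm parameters
  const float mean = 0.3f, var = 2.0f, eps = 1e-5f;
  const float x = 4.0f;                       // sample input

  const float k = scale / std::sqrt(var + eps);
  const float w_fused = w * k;
  const float b_fused = bn_bias + (b - mean) * k;

  const float z = w * x + b;
  const float y_ref = scale * (z - mean) / std::sqrt(var + eps) + bn_bias;
  const float y_fused = w_fused * x + b_fused;

  assert(std::fabs(y_ref - y_fused) < 1e-5f);
  std::cout << y_ref << " == " << y_fused << "\n";
}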
+ +#include "paddle/infrt/dialect/tensor_shape.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace infrt::ts { +using namespace mlir; // NOLINT + +void TensorShapeDialect::initialize() { + allowUnknownTypes(); + addTypes(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/tensor_shape.cpp.inc" + >(); +} + +Type TensorShapeDialect::parseType(DialectAsmParser &parser) const { + StringRef keyword; + if (parser.parseKeyword(&keyword)) return Type(); + if (keyword == "shape") return ShapeType::get(getContext()); + if (keyword == "partial_shape") return PartialShapeType::get(getContext()); + + parser.emitError(parser.getNameLoc(), "unknown shape type: ") << keyword; + return Type(); +} + +void TensorShapeDialect::printType(::mlir::Type type, + ::mlir::DialectAsmPrinter &os) const { + if (type.isa()) { + os << "shape"; + return; + } + + if (type.isa()) { + os << "partial_shape"; + return; + } + llvm_unreachable("unexpected 'shape' type kind"); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT + +} // namespace infrt::ts diff --git a/paddle/infrt/dialect/tensor_shape.h b/paddle/infrt/dialect/tensor_shape.h new file mode 100644 index 00000000000000..bd3fa8853675af --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace infrt::ts { + +class ShapeType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +class PartialShapeType : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +using namespace mlir; // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensor_shape.hpp.inc" +#include "paddle/infrt/dialect/tensor_shape_dialect.hpp.inc" + +} // namespace infrt::ts diff --git a/paddle/infrt/dialect/tensor_shape.td b/paddle/infrt/dialect/tensor_shape.td new file mode 100644 index 00000000000000..d3714c8ed14d3f --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape.td @@ -0,0 +1,49 @@ +#ifdef INFRT_OPS +#else +#define INFRT_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/tensor_shape_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Base class for the operation in the TensorShape dialect +class TS_Op traits = []> : + Op { + let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }]; + let printer = " return infrt::dialect::printOpWithOperands(p, *this)" ";"; +} + +def TS_BuildShapeOp : TS_Op<"build_shape", [NoSideEffect]> { + let summary = "Build tensor shape operation"; + let description = [{ + An operation that builds a tensor shape of given ranks and extents. 
+ }]; + + let arguments = (ins I64ArrayAttr:$value); + let results = (outs TS_Shape:$output); + let assemblyFormat = "$value attr-dict"; +} + +def TS_GetNumElementsOp : TS_Op<"get_num_elements"> { + let summary = "Returns the number of elements in the shape"; + + let description = [{ + An operation that returns the number of elements in the given shape. + }]; + + let arguments = (ins TS_Shape); + let results = (outs I64); + let assemblyFormat = "operands attr-dict"; +} + +def TS_PrintShapeOp : TS_Op<"print_shape"> { + let summary = "Print tensor shape operation"; + let description = [{ + An operation that prints a tensor shape. + }]; + + let arguments = (ins TS_Shape:$shape); + let assemblyFormat = "operands attr-dict"; +} + +#endif diff --git a/paddle/infrt/dialect/tensor_shape_base.td b/paddle/infrt/dialect/tensor_shape_base.td new file mode 100644 index 00000000000000..ea1c1854d77ca5 --- /dev/null +++ b/paddle/infrt/dialect/tensor_shape_base.td @@ -0,0 +1,36 @@ +#ifdef TS_OPS_BASE +#else +#define TS_OPS_BASE + +// Tensor shape dialect. +def TensorShapeDialect : Dialect { + let name = "ts"; + + let description = [{ + The Tensor Shape dialect. + + This dialect contains operations for working with tensor shapes. + }]; + + let cppNamespace = "::infrt::ts"; +} + +// Type definition. +def TS_Shape : DialectType()">, "!ts.shape type">, +BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { + let typeDescription = [{ + `!ts.shape type` represents a static tensor shape. +}]; +} + +def TS_PartialShape : DialectType()">, "!ts.partial_shape type">, +BuildableType<"$_builder.getType<::infrt::ts::PartialShapeType>()"> { + let typeDescription = [{ + `!ts.partial_shape type` represents either a static tensor shape, unranked + tensor shape or a ranked tensor shape with unknown dimension sizes. +}]; +} + +#endif // TS_OPS_BASE diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc new file mode 100644 index 00000000000000..894d96f95ad5cb --- /dev/null +++ b/paddle/infrt/dialect/test_kernels.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/test_kernels.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" + +namespace infrt::dialect { + +//===----------------------------------------------------------------------===// +// BenchmarkOp +//===----------------------------------------------------------------------===// + +// Parse the BenchmarkOp in the following format +// infrt.benchmark "add.i32"(%c : i32, %d : f32) +// max_count = 100, duration_secs = 1 { +// ... +// } + +static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT + OperationState &result) { // NOLINT + StringAttr nameAttr; + if (parser.parseAttribute(nameAttr, "name", result.attributes)) + return failure(); + + // Parse the operands, e.g. 
(%c : i32, %d : f32) + if (parser.parseLParen()) return failure(); + + SmallVector operands; + SmallVector types; + llvm::SMLoc type_loc = parser.getCurrentLocation(); + + if (parser.parseOptionalRParen()) { + // Parse non-empty operands + do { + // Parse %c : i32, + OpAsmParser::OperandType operand; + Type type; + + if (parser.parseOperand(operand) || parser.parseColonType(type)) + return failure(); + + operands.push_back(operand); + types.push_back(type); + } while (succeeded(parser.parseOptionalComma())); + + if (parser.parseRParen()) return failure(); + } + + if (parser.resolveOperands(operands, types, type_loc, result.operands)) + return failure(); + + // Parse the keyword attribute, e.g. max_count = 100, duration_secs = 1 + do { + StringRef attr; + Attribute resultAttr; + if (parser.parseKeyword(&attr) || parser.parseEqual() || + parser.parseAttribute(resultAttr, + parser.getBuilder().getIntegerType(32), + attr, + result.attributes)) + return failure(); + } while (succeeded(parser.parseOptionalComma())); + + // Set the default attribute num_warmup_runs to 1 if unset + auto setDefaultAttrIfUnset = [&](const char *attr_name, int value) { + bool found = llvm::any_of(result.attributes, + [attr_name](const NamedAttribute &attr) { + return attr.first == attr_name; + }); + if (!found) { + IntegerAttr default_val = parser.getBuilder().getI32IntegerAttr(value); + result.addAttribute(attr_name, default_val); + } + }; + setDefaultAttrIfUnset("num_warmup_runs", 1); + + Region *target = result.addRegion(); + return parser.parseRegion(*target, + operands, + types, + /*enableNameShadowing=*/true); +} + +// Print the BenchmarkOp in the following format +// infrt.benchmark "add.i32"(%c : i32, %d : f32) +// max_count = 100, duration_secs = 1 { +// ... +// } +static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT + p << "infrt.benchmark "; + + // Print the name attribute, e.g "add.i32" + auto name_attr = op.getAttr("name"); + p << name_attr; + + // Print the operands and types, e.g. (%c : i32, %d : f32) + p << '('; + llvm::interleaveComma(llvm::zip(op.getOperands(), op.getOperandTypes()), + p, + [&](const auto &it) { + p << std::get<0>(it) << " : " << std::get<1>(it); + }); + p << ") "; + + bool need_comma = false; + // Print the attributes, e.g. max_count = 100, duration_secs = 1 + for (auto &name_attr : op.getAttrs()) { + auto id = name_attr.first; + if (id == "name") continue; + if (need_comma) p << ", "; + auto attr = name_attr.second; + p << id << " = "; + if (auto int_attr = attr.dyn_cast()) { + int_attr.getValue().print(p.getStream(), /*isSigned=*/false); + } else { + op.emitOpError("Unexpected attribute"); + } + need_comma = true; + } + p << ' '; + + // Print the region + // Reuse the argument names provided to the op for the bbarg names within + // the region. + p.shadowRegionArgs(op.region(), op.getOperands()); + p.printRegion(op.region(), /*printEntryBlockArgs=*/false); +} + +static LogicalResult verify(BenchmarkOp op) { + // Verify that the target benchmark region has exactly one return value. + auto ®ion = op.region(); + auto &last_op = region.front().back(); + if (last_op.getName().getStringRef() != "infrt.return") { + return op.emitOpError("missing return statement"); + } + if (last_op.getNumOperands() != 1) { + return op.emitOpError( + "incorrect number of return values. 
One return value is expected"); + } + + return success(); +} + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/test_kernels.cpp.inc" + +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/test_kernels.h new file mode 100644 index 00000000000000..29d4209cb7280e --- /dev/null +++ b/paddle/infrt/dialect/test_kernels.h @@ -0,0 +1,23 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "mlir/IR/OpDefinition.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +namespace infrt::dialect { +using namespace mlir; // NOLINT +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/test_kernels.hpp.inc" +} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/test_kernels.td new file mode 100644 index 00000000000000..6aa12f252d0144 --- /dev/null +++ b/paddle/infrt/dialect/test_kernels.td @@ -0,0 +1,65 @@ +// Operation definitions for testing. + +#ifdef TEST_OPS +#else +#define TEST_OPS + +include "paddle/infrt/dialect/infrt_base.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Base class for Test dialect ops. +class Test_Op traits = []> : + Op { + + // Each registered op in the Test namespace needs to provide all of a printer, + // parser and verifier. + let printer = [{ return infrt::dialect::print(p, *this); }]; + let verifier = [{ return infrt::dialect::verify(*this); }]; + let parser = [{ return infrt::dialect::parse$cppClass(parser, result); }]; +} + +def BenchmarkOp : Test_Op<"benchmark"> { + let summary = "benchmark operation"; + let description = [{ + The "infrt.benchmark" operation benchmarks the performance of an MLIR + region by executing the given MLIR region repeatedly up to the + `duratino_secs` seconds or `max_count` times. `num_warmup_runs` specifies + the number of warm up runs to run the given MLIR region before the + benchmark starts. + + The target MLIR region can take an arbitrary number of arguments and + should return exactly one value. The arguments for the MLIR region are + provided as the operands of the infrt.benchmark op. + + Example: + infrt.benchmark "add.i32"(%c : i32, %d : f32) max_count = 100, duration_secs = 1 { + // code for benchmarking + ... + } + + infrt.benchmark "add.i32"(%c : i32) + duration_secs = 1, + max_count = 100, + num_warmup_runs = 10 { + // The MLIR code to be benchmarked goes here. + // The following code benchmarks the infrt.add.i32 kernel. + %x = infrt.add.i32 %c, %c + // The benchmarked function needs to return exactly one value. 
+ infrt.return %x : i32 + } + }]; + + let regions = (region SizedRegion<1>:$region); + + let arguments = (ins + Variadic, + I32Attr:$duration_secs, + I32Attr:$max_count, + StrAttr:$name, + DefaultValuedAttr:$num_warmup_runs + ); + + let results = (outs); +} + +#endif // TEST_OPS diff --git a/paddle/infrt/dialect/types.cc b/paddle/infrt/dialect/types.cc new file mode 100644 index 00000000000000..6d6f6a20b46c90 --- /dev/null +++ b/paddle/infrt/dialect/types.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/types.h" + +namespace infrt::hlir::mlir {} // namespace infrt::hlir::mlir diff --git a/paddle/infrt/dialect/types.h b/paddle/infrt/dialect/types.h new file mode 100644 index 00000000000000..a9a2b61871cc09 --- /dev/null +++ b/paddle/infrt/dialect/types.h @@ -0,0 +1,16 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
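The benchmark op defined above runs its region num_warmup_runs times, then repeats it until either max_count executions or duration_secs seconds are reached, and reports a count plus latency statistics (the BM: lines checked in benchmark.mlir). The driver below is a rough stand-alone sketch of that contract; RunBenchmark and its reporting format are invented for the example and are not the infrt implementation.

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Warm up, rerun the body until max_count runs or duration_secs elapse, then
// report count and latency percentiles, loosely echoing the BM: output lines.
void RunBenchmark(const std::string& name, const std::function<void()>& body,
                  int num_warmup_runs, int max_count, double duration_secs) {
  using Clock = std::chrono::steady_clock;
  for (int i = 0; i < num_warmup_runs; ++i) body();

  std::vector<int64_t> ns;
  const auto deadline =
      Clock::now() + std::chrono::duration<double>(duration_secs);
  while (static_cast<int>(ns.size()) < max_count && Clock::now() < deadline) {
    const auto t0 = Clock::now();
    body();
    ns.push_back(std::chrono::duration_cast<std::chrono::nanoseconds>(
                     Clock::now() - t0).count());
  }
  if (ns.empty()) return;

  std::sort(ns.begin(), ns.end());
  auto pct = [&](double p) {
    return ns[static_cast<std::size_t>(p * (ns.size() - 1))];
  };
  std::cout << "BM:" << name << ":Count: " << ns.size() << "\n"
            << "BM:" << name << ":Time Min(ns): " << ns.front() << "\n"
            << "BM:" << name << ":Time 50%(ns): " << pct(0.5) << "\n"
            << "BM:" << name << ":Time 99%(ns): " << pct(0.99) << "\n";
}

int main() {
  float acc = 0.f;
  RunBenchmark("add.f32", [&] { acc += 1.0f + 2.0f; },
               /*num_warmup_runs=*/3, /*max_count=*/3, /*duration_secs=*/1.0);
}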
+ +#pragma once +#include diff --git a/paddle/infrt/external_kernels/CMakeLists.txt b/paddle/infrt/external_kernels/CMakeLists.txt new file mode 100644 index 00000000000000..faffc3909bc1e7 --- /dev/null +++ b/paddle/infrt/external_kernels/CMakeLists.txt @@ -0,0 +1,13 @@ +set(external_kernels_src "basic_kernels.cc") + +cc_library(external_kernels SHARED SRCS ${external_kernels_src}) +set_target_properties(external_kernels PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + +set(basic_mlir "${CMAKE_CURRENT_SOURCE_DIR}/basic.mlir") +set(external_kernels_lib "${CMAKE_CURRENT_BINARY_DIR}/libexternal_kernels.so") +message(STATUS "basic_mlir: ${basic_mlir}") +message(STATUS "external_kernels_lib: ${external_kernels_lib}") +add_test( + NAME run_and_check_external_kernels + COMMAND sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrt-exec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}" +) diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir new file mode 100644 index 00000000000000..843b12ced21a98 --- /dev/null +++ b/paddle/infrt/external_kernels/basic.mlir @@ -0,0 +1,21 @@ +// CHECK: basic +func @basic() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK: 1 + "external.print.f32"(%v0) : (f32) -> () + // CHECK: 2 + "external.print.f32"(%v1) : (f32) -> () + + // CHECK: 3 + "external.print.f32"(%v2) : (f32) -> () + + %v3 = "external.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + // CHECK: 6 + "external.print.f32"(%v3) : (f32) -> () + + infrt.return %v3 : f32 +} diff --git a/paddle/infrt/external_kernels/basic_kernels.cc b/paddle/infrt/external_kernels/basic_kernels.cc new file mode 100644 index 00000000000000..b59a8881fb0923 --- /dev/null +++ b/paddle/infrt/external_kernels/basic_kernels.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
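The CMake rules above build the kernels into libexternal_kernels.so and hand it to infrt-exec through --shared_libs, so the kernels are resolved at run time rather than linked in. The sketch below shows one plausible way a host could load such a library with dlopen and look up a registration entry point; it assumes the entry point is exported unmangled (for example declared extern "C") and named RegisterKernels as in the file that follows, which is an assumption about the loader, not a description of how infrt-exec actually does it.

#include <dlfcn.h>

#include <iostream>

// Opaque handle standing in for the real registry type; this sketch only
// resolves the symbol, it does not drive the registry. Link with -ldl.
struct KernelRegistry;
using RegisterFn = void (*)(KernelRegistry*);

int main() {
  void* handle = dlopen("./libexternal_kernels.so", RTLD_NOW);
  if (!handle) {
    std::cerr << dlerror() << "\n";
    return 1;
  }
  auto register_fn =
      reinterpret_cast<RegisterFn>(dlsym(handle, "RegisterKernels"));
  if (!register_fn) {
    std::cerr << "RegisterKernels not found: " << dlerror() << "\n";
    dlclose(handle);
    return 1;
  }
  std::cout << "resolved RegisterKernels\n";
  // A real host would now pass its KernelRegistry* to register_fn(...).
  dlclose(handle);
}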
+ +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +template +T add(T a, T b) { + return a + b; +} + +template +T sub(T a, T b) { + return a - b; +} + +template +T mul(T a, T b) { + return a * b; +} + +template +T div(T a, T b) { + return a / b; +} + +template +void print(T a) { + std::cout << a << std::endl; +} + +void RegisterKernels(infrt::host_context::KernelRegistry *registry) { + // int32 + registry->AddKernel("external.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("external.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("external.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("external.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("external.print.i32", INFRT_KERNEL(print)); + + // float + registry->AddKernel("external.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("external.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("external.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("external.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("external.print.f32", INFRT_KERNEL(print)); +} diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir new file mode 100644 index 00000000000000..bdac9ded2ef65d --- /dev/null +++ b/paddle/infrt/external_kernels/fc.mlir @@ -0,0 +1,43 @@ +// CHECK-LABEL: @fc +func @fc(%input : !infrt.tensor, + %w : !infrt.tensor, + %bias : !infrt.tensor) -> !infrt.tensor +{ + %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + // dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + + // fc1 + "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + + // fc2 + "external.matmul"(%out, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + + infrt.return %out : !infrt.tensor +} + +// CHECK-LABEL: @benchmark +func @benchmark() { + %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + + infrt.benchmark "add.f32"( + %input:!infrt.tensor, + %w:!infrt.tensor, + %bias:!infrt.tensor) + duration_secs = 100, max_count = 300000, num_warmup_runs = 3 + { + %res = infrt.call @fc(%input, %w, %bias) : (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> (!infrt.tensor) + infrt.return %res : !infrt.tensor + } + infrt.return +} diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir new file mode 100644 index 00000000000000..e7b8e9efba838b --- /dev/null +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -0,0 +1,50 @@ +// CHECK: paddle_func +func @paddle_func() -> () { + %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + + %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : 
!infrt.tensor) {value=2.0:f32} + + %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + + %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.tensor, !infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%input : !infrt.tensor) + // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] + dt.print_tensor (%w : !infrt.tensor) + dt.print_tensor (%bias : !infrt.tensor) + dt.print_tensor (%out : !infrt.tensor) + + // test external.matmul + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out1 : !infrt.tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out1 : !infrt.tensor) + + // test external.elementwise_add + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out2 : !infrt.tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out2 : !infrt.tensor) + + // test external.relu + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out3 : !infrt.tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out3 : !infrt.tensor) + + // test external.sigmoid + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%out4 : !infrt.tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!infrt.tensor, !infrt.tensor) -> () + dt.print_tensor (%out4 : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/gtest_main.cc b/paddle/infrt/gtest_main.cc new file mode 100644 index 00000000000000..26e2b5dcfc61ad --- /dev/null +++ b/paddle/infrt/gtest_main.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
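paddle.mlir above chains the external kernels as out = sigmoid(matmul(input, w) + bias), with the tensors filled by constants (input 1.0 over 3x5, w 2.0 over 5x4, bias 3.0), so every output element should be sigmoid(1*2*5 + 3) = sigmoid(13), roughly 0.999998. The plain C++ walk-through below reproduces that arithmetic on small matrices; the shapes and fill values follow the test, while Matmul and the rest of the code are invented for the illustration.

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// Naive dense matmul, enough to check the expected constant output.
Matrix Matmul(const Matrix& a, const Matrix& b) {
  Matrix out(a.size(), std::vector<float>(b[0].size(), 0.f));
  for (std::size_t i = 0; i < a.size(); ++i)
    for (std::size_t k = 0; k < b.size(); ++k)
      for (std::size_t j = 0; j < b[0].size(); ++j)
        out[i][j] += a[i][k] * b[k][j];
  return out;
}

int main() {
  Matrix input(3, std::vector<float>(5, 1.0f));  // filled with 1.0
  Matrix w(5, std::vector<float>(4, 2.0f));      // filled with 2.0
  Matrix out = Matmul(input, w);                 // every element is 10
  for (auto& row : out)
    for (float& v : row)                         // add bias 3.0, then sigmoid
      v = 1.0f / (1.0f + std::exp(-(v + 3.0f)));
  std::cout << out[0][0] << "\n";                // ~0.999998
}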
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+  gflags::ParseCommandLineFlags(&argc, &argv, false);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/infrt/host_context/CMakeLists.txt b/paddle/infrt/host_context/CMakeLists.txt
new file mode 100644
index 00000000000000..fdba9af4a59120
--- /dev/null
+++ b/paddle/infrt/host_context/CMakeLists.txt
@@ -0,0 +1,29 @@
+core_gather_headers()
+
+gather_srcs(infrt_src SRCS
+    kernel_frame.cc
+    kernel_registry.cc
+    value.cc
+    kernel_utils.cc
+    symbol_table.cc
+    op_executable.cc
+    core_runtime.cc
+    mlir_to_runtime_translate.cc
+    function.cc
+    mlir_function_executable.cc
+    mlir_program_executor.cc
+    )
+
+cc_test_tiny(test_infrt_host_context_value SRCS value_test.cc DEPS infrt ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_kernel_utils SRCS kernel_utils_test.cc DEPS infrt ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_kernel_registry SRCS kernel_registry_test.cc DEPS infrt ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_op_executable SRCS op_executable_test.cc DEPS infrt ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_core_runtime SRCS core_runtime_test.cc DEPS infrt ${MLIR_IR_LIBS})
+cc_test_tiny(test_infrt_mlir_to_runtime_translate SRCS mlir_to_runtime_translate_test.cc DEPS infrt ${MLIR_IR_LIBS})
+
+infrt_exec_check(test_infrt_mlir_exec_on_basic mlir_tests/basic.mlir)
+infrt_exec_check(test_infrt_mlir_exec_on_shape mlir_tests/shape.mlir)
+infrt_exec_check(test_infrt_mlir_exec_on_dense_tensor mlir_tests/dense_tensor.mlir)
+
+add_executable(infrt-exec mlir_exec.cc)
+target_link_libraries(infrt-exec infrt ${MLIR_IR_LIBS})
diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc
new file mode 100644
index 00000000000000..cdb8cc99ecb263
--- /dev/null
+++ b/paddle/infrt/host_context/core_runtime.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
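+
+// An illustrative usage sketch of the classes implemented below (it mirrors
+// core_runtime_test.cc in this patch; the snippet itself is not part of the
+// runtime):
+//
+//   KernelRegistry registry;
+//   registry.AddKernel("infrt.test.addi32", INFRT_KERNEL(add));
+//
+//   CoreRuntimeBuilder builder(&registry);
+//   builder.symbol_table()->Register("a", 1);
+//   builder.symbol_table()->Register("b", 2);
+//
+//   auto* op = builder.NewOpExecutable("infrt.test.addi32");
+//   op->AppendArgument("a");
+//   op->AppendArgument("b");
+//   op->SetResults({"c"});
+//
+//   builder.Execute();
+//   auto res = builder.GetResults({"c"});  // res[0] now holds the int 3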
+ +#include "paddle/infrt/host_context/core_runtime.h" + +#include + +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt::host_context { + +struct CoreRuntime::Impl { + KernelRegistry* kernel_registry{}; + SymbolTable symbol_table; + std::vector op_executables; + + mutable std::vector results; +}; + +SymbolTable* CoreRuntime::symbol_table() { return &impl_->symbol_table; } + +CoreRuntime::CoreRuntime(CoreRuntime::Impl* impl) : impl_(impl) { CHECK(impl); } + +void CoreRuntime::Execute() { + // std::cout << "CoreRuntime::Execute" << std::endl; + int op_offset = 0; + for (auto& op : impl_->op_executables) { + VLOG(3) << "running op " << op_offset++ << " " << op.name(); + op.Execute(); + } +} + +KernelRegistry* CoreRuntime::kernel_registry() const { + return impl_->kernel_registry; +} + +size_t CoreRuntime::num_ops() const { return impl_->op_executables.size(); } + +CoreRuntimeBuilder::CoreRuntimeBuilder(KernelRegistry* kernel_registry) + : CoreRuntime(new Impl) { + impl_->kernel_registry = + kernel_registry ? kernel_registry : GetCpuKernelRegistry(); +} + +OpExecutableBuilder* CoreRuntimeBuilder::NewOpExecutable( + const std::string& op_name) { + CHECK(impl_.get()); + impl_->op_executables.emplace_back( + op_name, symbol_table(), impl_->kernel_registry); + return &impl_->op_executables.back(); +} + +void CoreRuntimeBuilder::FeedInArgs( + llvm::ArrayRef> args) { + for (auto& item : args) { + symbol_table()->Register(item.first, item.second); + } +} + +void CoreRuntimeBuilder::SetKernelRegistry(KernelRegistry* x) { + CHECK(x); + impl_->kernel_registry = x; +} + +llvm::SmallVector CoreRuntime::GetResults( + llvm::ArrayRef arg_names) { + llvm::SmallVector results; + for (auto& name : arg_names) { + results.push_back(ValueRef(symbol_table()->GetValue(name))); + } + + return results; +} + +CoreRuntime::~CoreRuntime() {} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h new file mode 100644 index 00000000000000..802f8b17bb0105 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime.h @@ -0,0 +1,86 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/host_context/value.h" + +namespace infrt::host_context { + +class KernelRegistry; +class OpExecutable; +class OpExecutableBuilder; +class SymbolTable; + +/** + * CoreRuntime encapsulate the execution for a sequence of ops. + * Each function call will bind to a CoreRuntime instance, push the argument + * Values in to the argument-list, and get the + * result Values from the return-list. + */ +class CoreRuntime : public std::enable_shared_from_this { + public: + //! Execute a program. + void Execute(); + + //! Return the number of ops. 
+ size_t num_ops() const; + + //! Get the results of the execution. + llvm::SmallVector // + GetResults(llvm::ArrayRef arg_names); + + std::shared_ptr getptr() { + return std::shared_ptr(this); + } + + KernelRegistry* kernel_registry() const; + + ~CoreRuntime(); + + protected: + //! Get the symbol table. + SymbolTable* symbol_table(); + + class Impl; + explicit CoreRuntime(Impl* impl); + std::unique_ptr impl_; +}; + +/** + * The builder for CoreRuntime, help to construct a function. + */ +class CoreRuntimeBuilder : public CoreRuntime { + public: + explicit CoreRuntimeBuilder(KernelRegistry* kernel_registry); + + using CoreRuntime::symbol_table; + + void SetKernelRegistry(KernelRegistry* x); + + //! Feed the input arguments, each item is a pair of arg-name and arg-value. + void FeedInArgs(llvm::ArrayRef> args); + + llvm::ArrayRef attr_names() const; + + OpExecutableBuilder* NewOpExecutable(const std::string& op_name); +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/core_runtime_test.cc b/paddle/infrt/host_context/core_runtime_test.cc new file mode 100644 index 00000000000000..3c0dadaad42e73 --- /dev/null +++ b/paddle/infrt/host_context/core_runtime_test.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/host_context/core_runtime.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt { +namespace host_context { + +int add(int a, int b) { return a + b; } +int sub(int a, int b) { return a - b; } + +TEST(CoreRuntime, basic) { + KernelRegistry registry; + registry.AddKernel("infrt.test.addi32", INFRT_KERNEL(add)); + registry.AddKernel("infrt.test.subi32", INFRT_KERNEL(sub)); + + CoreRuntimeBuilder builder(®istry); + auto* table = builder.symbol_table(); + table->Register("a", 1); + table->Register("b", 2); + table->Register("d", 4); + + // c = a + b + auto* op0 = builder.NewOpExecutable("infrt.test.addi32"); + op0->AppendArgument("a"); + op0->AppendArgument("b"); + op0->SetResults({"c"}); + + // e = c - d + auto* op1 = builder.NewOpExecutable("infrt.test.subi32"); + op1->AppendArgument("c"); + op1->AppendArgument("d"); + op1->SetResults({"e"}); + + builder.Execute(); + + ASSERT_EQ(table->GetValue("d")->get(), 4); + ASSERT_EQ(table->GetValue("c")->get(), 3); + ASSERT_EQ(table->GetValue("e")->get(), -1); +} + +TEST(CoreRuntime, function) { + // The function: + // func(int a, int b) { + // int c = a + b + // return c + // } + KernelRegistry registry; + registry.AddKernel("infrt.test.addi32", INFRT_KERNEL(add)); + registry.AddKernel("infrt.test.subi32", INFRT_KERNEL(sub)); + + CoreRuntimeBuilder builder(®istry); + auto* table = builder.symbol_table(); + + std::vector> feeds{ + {std::make_pair("a", ValueRef(new Value(1))), // + std::make_pair("b", ValueRef(new Value(2)))}}; + builder.FeedInArgs(llvm::ArrayRef>( + feeds.data(), feeds.size())); + + ASSERT_EQ(table->Get("a"), 1); + ASSERT_EQ(table->Get("b"), 2); + ASSERT_EQ(table->size(), 2UL); + + auto* op = builder.NewOpExecutable("infrt.test.addi32"); + op->AppendArgument("a"); + op->AppendArgument("b"); + op->SetResults({"c"}); + + builder.Execute(); + + auto res = builder.GetResults({"c"}); + ASSERT_EQ(res.size(), 1UL); + ASSERT_EQ(res[0].get(), 3); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/function.cc b/paddle/infrt/host_context/function.cc new file mode 100644 index 00000000000000..8b111f2645a80c --- /dev/null +++ b/paddle/infrt/host_context/function.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/function.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/function.h b/paddle/infrt/host_context/function.h new file mode 100644 index 00000000000000..030e3b6cfbc09b --- /dev/null +++ b/paddle/infrt/host_context/function.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include + +namespace infrt { +namespace host_context { + +struct Value; +struct ValueRef; + +/** + * Base class of all executable Function. + * + * This is used by `infrt.call` op, to execute a function. + */ +class Function { + public: + Function(Function&& other) + : name_(other.name_), + num_arguments_(other.num_arguments_), + num_results_(other.num_results_) {} + + Function() = delete; + + std::string name() const { return name_; } + + size_t num_arguments() const { return num_arguments_; } + size_t num_results() const { return num_results_; } + + virtual void Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region = false) const {} + + virtual ~Function() = default; + + protected: + Function(std::string name, size_t num_arguments, size_t num_results) + : name_(name), num_arguments_(num_arguments), num_results_(num_results) {} + + private: + std::string name_; + size_t num_arguments_{}; + size_t num_results_{}; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc new file mode 100644 index 00000000000000..1acb35e898308a --- /dev/null +++ b/paddle/infrt/host_context/kernel_frame.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_frame.h" + +#include + +namespace infrt { +namespace host_context { + +std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) { + os << "KernelFrame: " << frame.GetNumArgs() << " args, " + << frame.GetNumResults() << " res, " << frame.GetNumResults() << " attrs"; + return os; +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h new file mode 100644 index 00000000000000..20cb17dc7fbe24 --- /dev/null +++ b/paddle/infrt/host_context/kernel_frame.h @@ -0,0 +1,166 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include + +#include "llvm/ADT/SmallVector.h" +#include "paddle/infrt/host_context/value.h" + +namespace infrt::host_context { + +/** + * KernelFrame captures the states(input arguments, attributes, results) + * associated with a kernel invocation. + */ +class KernelFrame { + public: + int GetNumArgs() const { return num_arguments_; } + int GetNumResults() const { return num_results_; } + int GetNumAttributes() const { + return value_or_attrs_.size() - num_arguments_ - + (num_results_ == -1 ? 0 : num_results_); + } + + template + T& GetArgAt(int index) { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]->get(); + } + template + const T& GetArgAt(int index) const { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]->get(); + } + + Value* GetArgAt(int index) { + CHECK_LT(index, GetNumArgs()); + return value_or_attrs_[index]; + } + + // Get all arguments. + llvm::ArrayRef GetArguments() const { + return GetValues(0, num_arguments_); + } + + Value* GetAttributeAt(int idx) { + CHECK_NE(num_results_, -1) + << "Must call SetNumResults before GetAttributeAt"; + CHECK_LT(idx, + static_cast(value_or_attrs_.size() - num_arguments_ - + num_results_)); + return value_or_attrs_[num_arguments_ + num_results_ + idx]; + } + + void AddAttribute(Value* v) { + CHECK_NE(num_results_, -1) + << "Must call SetNumResults before calling AddAttribute"; + value_or_attrs_.emplace_back(v); + } + + template + void EmplaceResult(Args&&... args) { + EmplaceResult(0, std::forward(args)...); + } + + template + void EmplaceResult(int index, Args&&... 
args) { + SetResultAt(index, T(std::forward(args)...)); + } + + template + void SetResultAt(int index, T&& value) { + CHECK_LT(index, num_results_) << "Invalid result index"; + CHECK(value_or_attrs_[num_arguments_ + index]); + value_or_attrs_[num_arguments_ + index]->set(std::move(value)); + } + + llvm::ArrayRef GetResults() const { + return GetValues(num_arguments_, num_results_); + } + llvm::MutableArrayRef GetResults() { + return GetMutableValues(num_arguments_, num_results_); + } + + llvm::ArrayRef GetValues(size_t from, size_t length) const { + CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + if (length == 0) return {}; + + return llvm::makeArrayRef(&value_or_attrs_[from], length); + } + + llvm::MutableArrayRef GetMutableValues(size_t from, size_t length) { + CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + if (length == 0) return {}; + return llvm::makeMutableArrayRef(&value_or_attrs_[from], length); + } + + protected: + int num_arguments_{}; + int num_results_{-1}; + + llvm::SmallVector value_or_attrs_; +}; + +std::ostream& operator<<(std::ostream& os, const KernelFrame& frame); + +class KernelFrameBuilder : public KernelFrame { + public: + void AddArgument(Value* value) { + CHECK(value); + CHECK_EQ(num_results_, -1) + << "Should call AddArgument before calling SetNumResults"; + value_or_attrs_.push_back(value); + ++num_arguments_; + } + + void SetResults(llvm::ArrayRef values) { + CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); + CHECK_EQ(num_results_, -1); + for (Value* x : values) { + value_or_attrs_.push_back(x); + } + num_results_ = values.size(); + } + + void SetNumResults(size_t n) { + CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); + CHECK_EQ(num_results_, -1); + num_results_ = n; + for (size_t i = 0; i < n; i++) { + value_or_attrs_.emplace_back(new Value); + } + } + + void SetResultAt(int result_id, Value* value) { + CHECK_EQ(static_cast(value_or_attrs_.size()), + num_arguments_ + num_results_) + << "Call SetNumResults first"; + CHECK_LT(result_id + num_arguments_, + static_cast(value_or_attrs_.size())); + CHECK(value); + value_or_attrs_[num_arguments_ + result_id]->set(value); + } + + void Reset() { + value_or_attrs_.clear(); + num_arguments_ = 0; + num_results_ = -1; + } +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc new file mode 100644 index 00000000000000..f343dfc71b040e --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
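+
+// Sketch of what a registered kernel looks like when written directly against
+// KernelFrame; the INFRT_KERNEL macro in kernel_utils.h generates an
+// equivalent wrapper from a plain C++ function. `AddI32` is a hypothetical
+// name used only for illustration:
+//
+//   void AddI32(KernelFrame* frame) {
+//     int a = frame->GetArgAt<int>(0);
+//     int b = frame->GetArgAt<int>(1);
+//     frame->EmplaceResult<int>(a + b);
+//   }
+//
+//   registry->AddKernel("infrt.test.add.i32", AddI32);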
+ +#include "paddle/infrt/host_context/kernel_registry.h" + +#include + +#include "glog/logging.h" +#include "llvm/ADT/SmallVector.h" + +namespace infrt { +namespace host_context { + +struct KernelRegistry::Impl { + std::unordered_map data; + std::unordered_map> attr_names; +}; + +KernelRegistry::KernelRegistry() : impl_(std::make_unique()) {} + +void KernelRegistry::AddKernel(const std::string &key, + KernelImplementation fn) { + CHECK(!impl_->data.count(key)) << "kernel [" << key + << "] is registered twice"; + impl_->data.emplace(key, fn); +} + +void KernelRegistry::AddKernelAttrNameList( + const std::string &key, const std::vector &names) { + CHECK(!impl_->attr_names.count(key)) + << "kernel [" << key << "] is registered twice in attribute names"; + impl_->attr_names.emplace( + key, llvm::SmallVector(names.begin(), names.end())); +} + +KernelImplementation KernelRegistry::GetKernel(const std::string &key) const { + auto it = impl_->data.find(key); + return it != impl_->data.end() ? it->second : KernelImplementation{}; +} + +std::vector KernelRegistry::GetKernelList() const { + std::vector res(impl_->data.size()); + for (auto i : impl_->data) { + res.push_back(i.first); + } + return res; +} + +KernelRegistry::~KernelRegistry() {} + +size_t KernelRegistry::size() const { return impl_->data.size(); } + +KernelRegistry *GetCpuKernelRegistry() { + static auto registry = std::make_unique(); + return registry.get(); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h new file mode 100644 index 00000000000000..d65969999f6ed0 --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry.h @@ -0,0 +1,67 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace infrt { +namespace host_context { + +class KernelFrame; + +using KernelImplementation = void (*)(KernelFrame *frame); + +/** + * Hold the kernels registered in the system. + */ +class KernelRegistry { + public: + KernelRegistry(); + + void AddKernel(const std::string &key, KernelImplementation fn); + void AddKernelAttrNameList(const std::string &key, + const std::vector &names); + + KernelImplementation GetKernel(const std::string &key) const; + std::vector GetKernelList() const; + + size_t size() const; + + ~KernelRegistry(); + + private: + class Impl; + + std::unique_ptr impl_; +}; + +//! The global CPU kernel registry. +KernelRegistry *GetCpuKernelRegistry(); + +} // namespace host_context +} // namespace infrt + +/** + * compile function RegisterKernels in C way to avoid C++ name mangling. 
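+ *
+ * A dynamically loaded kernel library is expected to provide this symbol, e.g.
+ * (an illustrative sketch; the external_kernels example in this patch does the
+ * same thing):
+ *
+ *   void RegisterKernels(infrt::host_context::KernelRegistry *registry) {
+ *     registry->AddKernel("external.add.i32", INFRT_KERNEL(add));
+ *   }
+ *
+ * The mlir_exec driver loads such libraries through llvm::sys::DynamicLibrary
+ * and invokes the hook found via SearchForAddressOfSymbol("RegisterKernels").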
+ */ +#ifdef __cplusplus +extern "C" { +#endif +void RegisterKernels(infrt::host_context::KernelRegistry *registry); +#ifdef __cplusplus +} +#endif diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc new file mode 100644 index 00000000000000..f36ec2a1cac7de --- /dev/null +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_registry.h" + +#include + +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt::host_context { + +int add_i32(int a, int b) { return a + b; } + +TEST(KernelRegistry, basic) { + KernelRegistry registry; + std::string key = "infrt.test.add.i32"; + registry.AddKernel(key, INFRT_KERNEL(add_i32)); + + auto* kernel_impl = registry.GetKernel(key); + ASSERT_TRUE(kernel_impl); + + ValueRef a(1); + ValueRef b(2); + KernelFrameBuilder fbuilder; + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + kernel_impl(&fbuilder); + + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results[0]->get(), 3); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/kernel_utils.cc b/paddle/infrt/host_context/kernel_utils.cc new file mode 100644 index 00000000000000..cf9476da032beb --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils.h b/paddle/infrt/host_context/kernel_utils.h new file mode 100644 index 00000000000000..33812912ba029c --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils.h @@ -0,0 +1,352 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/value.h" + +namespace infrt { +namespace host_context { + +template +class Argument { + public: + explicit Argument(ValueRef value) : value_(value) {} + + ValueRef& value() { return value_; } + const ValueRef& value() const { return value_; } + + T& get() const { return value_.get(); } + + private: + ValueRef value_; +}; + +/** + * RemainingArguments collects all remaining arguments in an ArrayRef. + */ +class RemainingArguments { + public: + explicit RemainingArguments(llvm::ArrayRef remaining_arguments) + : remaining_arguments_(remaining_arguments) {} + + llvm::ArrayRef values() const { return remaining_arguments_; } + size_t size() const { return remaining_arguments_.size(); } + const Value* operator[](size_t i) const { return remaining_arguments_[i]; } + + private: + llvm::ArrayRef remaining_arguments_; +}; + +/** + * RemainingResults collects all remaining results in a MutableArrayRef. + */ +class RemainingResults { + public: + explicit RemainingResults(llvm::MutableArrayRef remaining_results) + : remaining_results_(remaining_results) {} + llvm::MutableArrayRef values() { return remaining_results_; } + size_t size() const { return remaining_results_.size(); } + + template + const ValueRef& AllocateAt(int index) { + // eagerly create a ValueRef + if (remaining_results_[index].get()) return remaining_results_[index]; + remaining_results_[index] = ValueRef(new Value); + return remaining_results_[index]; + } + ValueRef& operator[](size_t i) const { return remaining_results_[i]; } + + private: + llvm::MutableArrayRef remaining_results_; +}; + +template +class Result { + public: + explicit Result(ValueRef* result) : result_(result) {} + + template + void Emplace(Args&&... args) { + ValueRef v; + Set(T(std::forward(args)...)); + } + + void Set(Argument argument) { + CHECK(!result_->IsValid()); + *result_ = argument.value(); + } + + private: + ValueRef* result_{}; +}; + +template +class Attribute { + public: + explicit Attribute(const Value* value) : value_(value) {} + + const T& get() const { return value_->get(); } + + private: + const Value* value_; +}; + +template +class ArgumentView { + using UnderlyingT = typename ViewT::UnderlyingT; + + public: + explicit ArgumentView(Value* value) + : value_(value), arg_(&value->template get()) {} + + Value* value() const { return value_; } + ViewT& get() const { return arg_; } + ViewT* operator->() const { return &get(); } + ViewT& operator*() const { return get(); } + + private: + Value* value_{}; + mutable ViewT arg_; +}; + +template +struct KernelImpl; + +template +struct TypeTag {}; + +#define INFRT_KERNEL(...) \ + ::infrt::host_context::KernelImpl::Invoke + +template +struct KernelImpl { + static void Invoke(KernelFrame* frame) { + KernelCallHelper>::template Invoke<0, 0, 0>(frame); + } + + // Helper that introspects the arguments to derive the signature and cast + // parts of the KernelFrame to their type before passing them to impl_fn. 
+ template + struct KernelCallHelper; + + // Casts the return value of the kernel, if non-void. + // bool _ is an unnecessary parameter to make compiler allow templace specific + // in non-namespace scope. + template + struct KernelReturnHelper { + static void Invoke(KernelFrame* frame, const Args&... args) { + HandleReturn(frame, impl_fn(args...)); + } + }; + + template + struct KernelReturnHelper { + static void Invoke(KernelFrame* frame, const Args&... args) { + impl_fn(args...); + } + }; + + // Specialization to cast a single input argument(Head). + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + Argument arg(frame->GetArgAt(in_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + ArgumentView arg(frame->GetArgAt(in_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Specialization to cast a single result argument (Head). + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(out_idx != -1, + "Do not place Results after RemainingResults"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + Result arg(&frame->GetResults()[out_idx]); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Specialization to cast a single attribute. + template + struct KernelCallHelper, Tail...> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(const_idx != -1, + "Do not place Attributes after RemainingAttributes"); + Attribute arg(frame->GetAttributeAt(const_idx)); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Treat other pointer as an Argument. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + auto* arg = &frame->GetArgAt(in_idx); + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // Treat any other type as an Argument. + template + struct KernelCallHelper { + using ArgT = std::decay_t; + + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... 
pargs) { + static_assert(in_idx != -1, + "Do not place Arguments after RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes."); + + auto* value = frame->GetArgAt(in_idx); + auto&& arg = value->get(); + + KernelCallHelper< + Tail...>::template Invoke(frame, + pargs..., + arg); + } + }; + + // RemainingArguments provides an ArrayRef containing all + // remaining arguments. Useful for variadic + // kernels. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(in_idx != -1, + "Do not use more than one RemainingArguments"); + static_assert(out_idx == 0, "Arguments should appear before results."); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + RemainingArguments remaining_arguments( + frame->GetArguments().drop_front(in_idx)); + + KernelCallHelper::template Invoke<-1, out_idx, const_idx>( + frame, pargs..., remaining_arguments); + } + }; + + // RemainingResults provides an MutableArrayRef containing all + // remaining results. + template + struct KernelCallHelper { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + static_assert(out_idx != -1, "Do not use more than one RemainingResults"); + static_assert(const_idx == 0, + "Arguments and results should appear before attributes"); + llvm::MutableArrayRef returned_results = + frame->GetResults().drop_front(out_idx); + + llvm::SmallVector result_values; + for (size_t i = 0; i < returned_results.size(); i++) + result_values.emplace_back(returned_results[i]); + + RemainingResults remaining_results(result_values); + KernelCallHelper::template Invoke( + frame, pargs..., remaining_results); + } + }; + + // No arguments left. + template + struct KernelCallHelper> { + template + static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { + KernelReturnHelper::Invoke(frame, pargs...); + } + }; + + // Handle pair result + template + static void HandleReturn(KernelFrame* frame, std::pair&& t) { + CHECK_EQ(frame->GetNumResults(), 2); + StoreResultAt(frame, 0, std::move(t.first)); + StoreResultAt(frame, 1, std::move(t.second)); + } + + // Store the function result back to the output Value in KernelFrame. + template + static void HandleReturn(KernelFrame* frame, T&& t) { + assert(frame->GetNumResults() == 1 && "Extra results passed to kernel."); + StoreResultAt(frame, 0, std::forward(t)); + } + + // Store result as an Value output in KernelFrame. + template + static void StoreResultAt(KernelFrame* frame, int index, T&& t) { + frame->EmplaceResult>(index, std::forward(t)); + } +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc new file mode 100644 index 00000000000000..1904eb106a2937 --- /dev/null +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/kernel_utils.h" + +#include + +namespace infrt::host_context { + +int add_i32(int a, int b) { return a + b; } +float add_f32(float a, float b) { return a + b; } +std::pair add_pair(int a, float b) { return {a, b}; } + +TEST(KernelImpl, i32) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1)); + ValueRef b(new Value(2)); + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + INFRT_KERNEL(add_i32)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3); +} + +TEST(KernelImpl, f32) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1.f)); + ValueRef b(new Value(2.f)); + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(1); + + INFRT_KERNEL(add_f32)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3.f); +} + +TEST(KernelImpl, pair) { + KernelFrameBuilder fbuilder; + ValueRef a(new Value(1)); + ValueRef b(new Value(3.f)); + + fbuilder.AddArgument(a.get()); + fbuilder.AddArgument(b.get()); + fbuilder.SetNumResults(2); + + INFRT_KERNEL(add_pair)(&fbuilder); + auto results = fbuilder.GetResults(); + ASSERT_EQ(results.size(), 2UL); + ASSERT_EQ(results[0]->get(), 1); + ASSERT_EQ(results[1]->get(), 3.f); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc new file mode 100644 index 00000000000000..b0d70af5ef9f2a --- /dev/null +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
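+
+// Command-line driver that executes an MLIR program on the host runtime.
+// Usage sketch (file and library names are illustrative):
+//
+//   ./infrt-exec -i basic.mlir
+//   ./infrt-exec -i fc.mlir -shared_libs=libexternal_kernels.so
+//
+// Libraries passed via -shared_libs are loaded with llvm::sys::DynamicLibrary,
+// and their exported RegisterKernels(KernelRegistry*) hook is called to add
+// extra kernels before the module is executed.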
+ +#include + +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +int main(int argc, char** argv) { + using namespace llvm; // NOLINT + using namespace infrt; // NOLINT + cl::opt input_file("i", + cl::desc("Specify input filename"), + cl::value_desc("input file name")); + cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = dialect::LoadMlirFile(input_file.c_str(), context); + + host_context::KernelRegistry registry; + + kernel::RegisterBasicKernels(®istry); + kernel::RegisterTestKernels(®istry); + kernel::RegisterTensorShapeKernels(®istry); + kernel::RegisterTensorKernels(®istry); + kernel::RegisterControlFlowKernels(®istry); + + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + return 1; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast(reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + + host_context::TestMlir(module.get(), ®istry); + + std::cout << std::endl; + return 0; +} diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc new file mode 100644 index 00000000000000..5f8dacf8e448ac --- /dev/null +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
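+
+// MlirFunctionExecutable backs both the `infrt.call` op and the top-level
+// function invocation. The op executables for the function body are built
+// lazily: the first Execute() call triggers BuildExecutables(), and later
+// calls reuse the program cached in core_runtime_builder_.
+//
+// MLIR-side usage, as it appears in fc.mlir of this patch:
+//
+//   %res = infrt.call @fc(%input, %w, %bias)
+//          : (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> (!infrt.tensor)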
+ +#include "paddle/infrt/host_context/mlir_function_executable.h" + +#include + +#include // NOLINT + +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/host_context/core_runtime.h" + +namespace infrt { +namespace host_context { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +MlirFunctionExecutable::MlirFunctionExecutable( + mlir::FuncOp func_op, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table) + : Function(func_op.getName().str(), + func_op.getNumArguments(), + func_op.getNumResults()), + MlirToRuntimeTranslator(&core_runtime_builder_), + region_(&func_op.getRegion()), + core_runtime_builder_(kernel_registry), + function_table_(function_table) {} + +MlirFunctionExecutable::MlirFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table) + : Function("", func_type.getNumInputs(), func_type.getNumResults()), + MlirToRuntimeTranslator(&core_runtime_builder_), + region_(region), + core_runtime_builder_(kernel_registry), + function_table_(function_table) {} + +void MlirFunctionExecutable::BuildExecutables( + llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region) { + CHECK_EQ(arguments.size(), num_arguments()); + // We use the function call's arguments as op_executable's operands to avoid + // copy. + for (size_t i = 0; i < num_arguments(); i++) { + AddValue(region_->getArgument(i), arguments[i]); + } + + // build the program + auto& blocks = region_->getBlocks(); + CHECK_EQ(blocks.size(), 1UL) + << "function with more than one block is not supported yet"; + + llvm::SmallVector runtime_results; + for (auto& op : blocks.front()) { + if (EmitConstantOp(&op)) continue; + if (EmitBuildShapeOp(&op)) continue; + + llvm::SmallVector mlir_results; + if (EmitReturnOp(&op, &mlir_results)) { + if (!is_region) { + for (auto v : mlir_results) { + runtime_results.push_back(GetValue(v)); + } + } + continue; + } + + if (EmitCallOp(&op, &function_table_)) continue; + + if (EmitGeneralOp(&op)) continue; + LOG(FATAL) << "Not supported op: " << DumpToString(op); + } + + // after the block is built, we can get the result values of the whole + // function call in the runtime_results. + + mlir::SmallVector results_copied; + if (!is_region) { + for (ValueRef& x : results) { + results_copied.push_back(x.get()); + } + } + + // set a lambda function to help copy the results from the runtime results in + // the local function to outer program. + CHECK_EQ(results_copied.size(), runtime_results.size()); + this->copy_res_fn_ = [results_copied, runtime_results] { + VLOG(4) << "copy results to result"; + for (size_t i = 0; i < results_copied.size(); i++) { + VLOG(4) << ".. 
copy " << runtime_results[i] << " to " + << results_copied[i]; + CopyTo(*runtime_results[i], results_copied[i]); + } + }; +} + +void MlirFunctionExecutable::Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region) const { + CHECK_EQ(arguments.size(), num_arguments()); + CHECK_EQ(results.size(), num_results()); + + if (core_runtime_builder_.num_ops() == 0) { + Reference(this).BuildExecutables(arguments, results, is_region); + } + + Reference(&core_runtime_builder_).Execute(); + + copy_res_fn_(); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h new file mode 100644 index 00000000000000..ba5fa154d6fcc3 --- /dev/null +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -0,0 +1,78 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +/** + * Executable function for a given MLIR function definition, mainly used in two + * scenerios: + * 1. infrt.call op + * 2. main function call + * + * A MlirFunctionExecutable might have one or more arguments and results. + */ +class MlirFunctionExecutable : public Function, public MlirToRuntimeTranslator { + public: + using function_defs_t = std::unordered_map; + + MlirFunctionExecutable(mlir::FuncOp func_op, + KernelRegistry* kernel_registry, + function_defs_t& function_table); // NOLINT + + MlirFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + KernelRegistry* kernel_registry, + MlirToRuntimeTranslator::function_defs_t& function_table); // NOLINT + + /** + * Execute the function with the given arguments and results. + * NOTE the \param arguments and \param results should not be altered. + */ + void Execute(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region = false) const; + + private: + /** + * Build the runtime executables once the function call arguments and results + * are passed in. + * This will trigger in the first execution. + */ + void BuildExecutables(llvm::ArrayRef arguments, + llvm::MutableArrayRef results, + bool is_region); + + private: + mlir::Region* region_{}; + CoreRuntimeBuilder core_runtime_builder_; + MlirToRuntimeTranslator::function_defs_t& function_table_; + std::function copy_res_fn_; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_program_executor.cc b/paddle/infrt/host_context/mlir_program_executor.cc new file mode 100644 index 00000000000000..c5009bcc97c5cb --- /dev/null +++ b/paddle/infrt/host_context/mlir_program_executor.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/mlir_program_executor.h" + +namespace infrt { +namespace host_context {} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h new file mode 100644 index 00000000000000..b2af4d2d79db54 --- /dev/null +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/host_context/op_executable.h" + +namespace infrt { +namespace host_context { + +/** + * This get a MLIR program as input, it compiles it into runtime program, and + * one can retrieve the function and execute + * it by passing the input arguments. + */ +class MlirProgramExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder runtime_builder; + mlir::ModuleOp module; + function_defs_t function_defs; + + MlirProgramExecutor(mlir::ModuleOp module, KernelRegistry* registry) + : MlirToRuntimeTranslator(module, &runtime_builder), + runtime_builder(registry), + module(module) {} + + // Build functions and generate executables. 
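+  //
+  // Typical flow (illustrative): build once, then look up and run a function,
+  // e.g.
+  //
+  //   MlirProgramExecutor executor(module, &registry);
+  //   executor.BuildFunctions();
+  //   executor.LookupFunc("main")->Execute(args, results);
+  //
+  // where "main", `args` and `results` are placeholders for the caller's data.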
+ void BuildFunctions() { EmitFunctions(); } + + void EmitFunction(mlir::FuncOp op) override { + LOG(INFO) << "Emit function: " << op.getName().str(); + function_defs[op.getName().str()] = op; + + func_executables_.emplace( + op.getName().str(), + new MlirFunctionExecutable( + op, runtime_builder.kernel_registry(), function_defs)); + } + + MlirFunctionExecutable* LookupFunc(const std::string& name) { + auto it = func_executables_.find(name); + if (it != func_executables_.end()) { + return it->second.get(); + } + return nullptr; + } + + private: + std::unordered_map> + func_executables_; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir new file mode 100644 index 00000000000000..263d5884134b14 --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -0,0 +1,30 @@ +// CHECK-LABEL: basic +func @basic() -> f32 { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + + // CHECK: 1 + "infrt.print.f32"(%v0) : (f32) -> () + // CHECK: 2 + "infrt.print.f32"(%v1) : (f32) -> () + + // CHECK: 3 + "infrt.print.f32"(%v2) : (f32) -> () + + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + // CHECK: 6 + "infrt.print.f32"(%v3) : (f32) -> () + + infrt.return %v3 : f32 +} + +// CHECK-LABEL: basic1 +// Check the mlir executor can work with more than one function in a file. +func @basic1() -> () { + %v0 = infrt.constant.f32 1.0 + "infrt.print.f32"(%v0) : (f32) -> () + // CHECK: 1 + infrt.return +} \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir new file mode 100644 index 00000000000000..83afa1db8a91c0 --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -0,0 +1,9 @@ +// CHECK-LABEL: build_tensor1 +func @build_tensor1() { + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + dt.print_tensor (%a : !infrt.tensor) + + infrt.return +} diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir new file mode 100644 index 00000000000000..a3130857b0ef7d --- /dev/null +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -0,0 +1,7 @@ +// CHECK-LABEL: build_tensor1 +func @build_tensor1() { + %a = ts.build_shape [1:i64, 57:i64, 92:i64] + // CHECK: shape[1,57,92] + ts.print_shape %a + infrt.return +} \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc new file mode 100644 index 00000000000000..25324b1291582b --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "boost/optional.hpp" +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/op_executable.h" +#include "paddle/infrt/host_context/value.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::host_context { + +template +std::string DumpToString(T& op) { // NOLINT + std::string buffer; + llvm::raw_string_ostream os(buffer); + op.print(os); + os.flush(); + return buffer; +} + +struct MlirToRuntimeTranslator::Impl { + mlir::ModuleOp module; + // The runtime for a function call. + CoreRuntimeBuilder* runtime{}; + // The current working op, the translator process the ops one by one, each + // time it updates `cur_op` here to current op + // working on. + OpExecutableBuilder* cur_op{}; + + // record the current function name. + std::string cur_func_name; + + // Name to function definitions. + std::unordered_map func_defs; + + // Map from an operation to its results. + std::unordered_map> op_results; + llvm::DenseMap value_map; +}; + +bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { + if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) + return false; + VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() + << "]"; + + auto attr = op->getAttr("value"); + if (attr.isa()) { + if (attr.getType().isF32()) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getValueAsDouble()))}; + } else if (attr.getType().isF64()) { + impl_->op_results[op] = {ValueRef(static_cast( + attr.cast().getValueAsDouble()))}; + } else { + LOG(FATAL) << "Not supported attribute type"; + } + return true; + } + + if (attr.isa()) { + if (attr.getType().isInteger(32)) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getSInt()))}; + } else if (attr.getType().isInteger(64)) { + impl_->op_results[op] = {ValueRef( + static_cast(attr.cast().getSInt()))}; + } else if (attr.getType().isInteger(1)) { + impl_->op_results[op] = { + ValueRef(static_cast(attr.cast().getInt()))}; + } else { + LOG(FATAL) << "Not supported attribute type"; + } + return true; + } + + LOG(FATAL) << "Not supported constant attribute type"; + return true; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isInteger(32)) { + return val.getInt(); + } + } + return boost::none; +} +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isInteger(64)) { + return val.getInt(); + } + } + return boost::none; +} + +// TODO(Superjomn) Make double and float parsing share some thing. 
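+//
+// Note: each EmitAttribute<T> specialization returns boost::none unless the
+// mlir attribute has exactly the type T; EmitGeneralOp further below simply
+// tries the specializations in sequence and wraps the first successful match
+// into a runtime Value.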
+template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isF32()) return val.getValueAsDouble(); + } + return boost::none; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + if (attr->isa()) { + auto val = attr->cast(); + if (val.getType().isF64()) return val.getValueAsDouble(); + } + return boost::none; +} + +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + return attr->cast().getValue().str(); +} + +#define PROCESS_ARRAY_INT(type__, bits__) \ + template <> \ + boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ + const mlir::Attribute* attr) { \ + if (!attr->isa()) return boost::none; \ + auto array = attr->cast(); \ + CHECK(!array.empty()); \ + \ + if (!array[0].getType().isInteger(bits__)) { \ + return boost::none; \ + } \ + \ + std::vector res; \ + for (auto& v : array) { \ + res.push_back(v.cast().getInt()); \ + } \ + return res; \ + } + +PROCESS_ARRAY_INT(int16_t, 16); +PROCESS_ARRAY_INT(int32_t, 32); +PROCESS_ARRAY_INT(int64_t, 64); + +template <> +boost::optional> MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + auto array = attr->cast(); + CHECK(!array.empty()); + + if (!array[0].getType().isF32()) return boost::none; + + std::vector res; + for (auto& v : array) { + res.push_back(v.cast().getValueAsDouble()); + } + return res; +} + +template <> +boost::optional> MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute* attr) { + if (!attr->isa()) return boost::none; + auto array = attr->cast(); + CHECK(!array.empty()); + + if (!array[0].getType().isF64()) return boost::none; + + std::vector res; + for (auto& v : array) { + res.push_back(v.cast().getValueAsDouble()); + } + return res; +} + +static bool IsReturn(mlir::Operation* op) { + return op->getName().getStringRef() == "infrt.return"; +} + +bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { + CHECK(impl_->runtime); + impl_->cur_op = + impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); + + VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } + + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); + impl_->cur_op->AppendArgument(arg_value); + + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; + } + + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + 
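+  // Every MLIR result gets a fresh runtime Value registered in the value map
+  // via AddValue(), so downstream ops can later resolve it as an operand.
+  // The collected values are wired into the op's kernel frame below.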
impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + + // process attributes + auto attrs = op->getAttrs(); + + for (size_t i = 0; i < attrs.size(); i++) { + auto& attr = attrs[i]; + if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(*v)); + } else if (auto v = EmitAttribute(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute>(&attr.second)) { + impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else { + LOG(FATAL) << "Not supported attribute type"; + } + } + + // process regions, we treat regions as attribute. + auto num_regions = op->getNumRegions(); + if (num_regions > 0) { + CHECK_EQ(num_regions, 1UL) + << "op with more than one region is not supported yet."; + auto& region = op->getRegions().front(); + auto num_blocks = region.getBlocks().size(); + CHECK_EQ(num_blocks, 1UL) + << "region with more than one block is not supported yet."; + + // process arguments + llvm::SmallVector inputs; + auto& block = region.getBlocks().front(); + for (auto arg : block.getArguments()) inputs.push_back(arg.getType()); + + // process results + // NOTE: if an op contains a region, we simply ignore the region's return + // values, + // or its return values will conflict with op's return values. + llvm::SmallVector results; + + auto func_type = + mlir::FunctionType::get(inputs, results, region.getContext()); + auto* function = impl_->cur_op->CreateFunctionExecutable( + ®ion, func_type, &impl_->func_defs); + impl_->cur_op->AppendAttribute(new Value(function)); + } + + return true; +} + +bool MlirToRuntimeTranslator::EmitReturnOp( + mlir::Operation* op, llvm::SmallVectorImpl* results) { + CHECK(results); + if (op->getName().getStringRef() == "infrt.return") { + for (size_t i = 0; i < op->getNumOperands(); i++) { + results->push_back(op->getOperand(i)); + } + + return true; + } + return false; +} + +bool MlirToRuntimeTranslator::EmitFunctions() { + for (auto func_op : impl_->module.getOps()) { + EmitFunction(func_op); + } + return true; +} + +void MlirToRuntimeTranslator::EmitFunction(mlir::FuncOp op) { + impl_->func_defs[op.getName().str()] = op; +} + +Value* MlirToRuntimeTranslator::GetOpResult(mlir::Operation* op) { + auto it = impl_->op_results.find(op); + return it == impl_->op_results.end() ? nullptr : it->second.front().get(); +} + +Value* MlirToRuntimeTranslator::GetValue(mlir::Value value) { + auto it = impl_->value_map.find(value); + return it == impl_->value_map.end() ? 
nullptr : it->second.get(); +} + +Value* MlirToRuntimeTranslator::AddValue(mlir::Value value) { + auto res = impl_->value_map.try_emplace(value, ValueRef(new Value)); + CHECK(res.second) << "Duplicate add mlir value [" << DumpToString(value) + << "]"; + return res.first->second.get(); +} + +MlirToRuntimeTranslator::~MlirToRuntimeTranslator() {} + +void MlirToRuntimeTranslator::UpdateCurFuncName(const std::string& name) { + impl_->cur_func_name = std::string(name); +} + +MlirToRuntimeTranslator::MlirToRuntimeTranslator(mlir::ModuleOp module, + CoreRuntimeBuilder* runtime) + : impl_(new Impl) { + CHECK(runtime); + impl_->module = module; + impl_->runtime = runtime; +} + +bool MlirToRuntimeTranslator::EmitBuildShapeOp(mlir::Operation* op) { + if (op->getName().getStringRef() != "ts.build_shape") return false; + + auto value = op->getAttr("value"); + + CHECK(value.isa()); + auto values = value.cast().getValue(); + std::vector dims; + for (auto& attr_v : values) { + dims.push_back(attr_v.cast().getInt()); + } + impl_->op_results[op] = { + ValueRef(new Value(tensor::TensorShape(llvm::ArrayRef(dims))))}; + + return true; +} + +bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, + function_defs_t* function_table) { + CHECK(op); + CHECK(function_table); + if (op->getName().getStringRef() != "infrt.call") return false; + + impl_->cur_op = + impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); + + auto callee = op->getAttr("callee"); + auto callee_name = callee.dyn_cast(); + + // process arguments + for (size_t i = 0; i < op->getNumOperands(); i++) { + auto operand = op->getOperand(i); + auto* arg_value = GetValue(operand); + + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); + impl_->cur_op->AppendArgument(arg_value); + } + + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + } + impl_->cur_op->SetResults(res_values); + + // process attribute + auto& table = function_table ? *function_table : impl_->func_defs; + { + // lookup the callee function + auto it = table.find(callee_name.getValue().str()); + CHECK(it != table.end()) << "can't find function [" + << callee_name.getValue().str() << "]"; + auto* function = + impl_->cur_op->CreateFunctionExecutable(it->second, &impl_->func_defs); + impl_->cur_op->AppendAttribute(new Value(function)); + } + + VLOG(3) << "Emit call " << callee_name.getValue().str() << " " + << impl_->cur_op->frame(); + return true; +} + +MlirToRuntimeTranslator::MlirToRuntimeTranslator(CoreRuntimeBuilder* runtime) + : impl_(new Impl) { + CHECK(runtime); + impl_->runtime = runtime; +} + +Value* MlirToRuntimeTranslator::AddValue(mlir::Value mlir_value, Value* value) { + auto it = impl_->value_map.try_emplace(mlir_value, ValueRef(value)); + CHECK(it.second) << "duplicate add value " << DumpToString(mlir_value); + return value; +} + +void MlirToRuntimeTranslate(mlir::ModuleOp module, + CoreRuntimeBuilder* runtime) { + MlirToRuntimeTranslator(module, runtime).Run(); +} + +/** + * Execute the mlir program in test mode -- print some debug information to + * stdout. 
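+ *
+ * Only functions that take no arguments are executed directly; functions
+ * with arguments are only recorded and can later be invoked through
+ * infrt.call ops.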
+ */ +class MlirProgramTestExecutor : public MlirToRuntimeTranslator { + public: + CoreRuntimeBuilder core_runtime; + + MlirProgramTestExecutor(mlir::ModuleOp module, KernelRegistry* registry) + : MlirToRuntimeTranslator(module, &core_runtime), + core_runtime(registry), + registry(registry) { + CHECK(registry); + } + + void Run() { + EmitFunctions(); + + CHECK(registry); + for (auto func_op : impl_->module.getOps()) { + VLOG(3) << "Running function " << func_op.getName().str(); + EmitAndRunFuncWithoutArguments(func_op); + } + } + + protected: + std::unordered_map func_def_table; + + void EmitFunction(mlir::FuncOp op) override { + CHECK(!impl_->func_defs.count(op.getName().str())) + << "Duplicate function defition found for function [" + << op.getName().str(); + impl_->func_defs.emplace(op.getName().str(), op); + } + + private: + void EmitAndRunFuncWithoutArguments(mlir::FuncOp func) { + // print the function name for llvm FileChecker macro, CHECK-LABEL + std::cout << '@' << func.getName().str() << std::endl; + if (func.getNumArguments() == + 0) { // an entry function, execute it immediately + VLOG(3) << "executing function " << func.getName().str(); + // Emit and execute each function + CoreRuntimeBuilder runtime(registry); + impl_->runtime = &runtime; + + auto& blocks = func.getBlocks(); + CHECK_EQ(blocks.size(), 1UL) + << "function with more than one block is not supported yet"; + + for (auto& op : blocks.front()) { + if (EmitConstantOp(&op)) continue; + if (EmitBuildShapeOp(&op)) continue; + llvm::SmallVector results; + if (EmitReturnOp(&op, &results)) continue; + if (EmitCallOp(&op, &impl_->func_defs)) continue; + if (EmitGeneralOp(&op)) continue; + LOG(FATAL) << "Not supported op: " << DumpToString(op); + } + + runtime.Execute(); + + } else { + VLOG(2) << "get an callable function: " << func.getName().str(); + } + } + + private: + KernelRegistry* registry{}; +}; + +void TestMlir(mlir::ModuleOp module, KernelRegistry* registry) { + MlirProgramTestExecutor execute(module, registry); + execute.Run(); +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h new file mode 100644 index 00000000000000..598e81bfd96d8a --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -0,0 +1,107 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include // NOLINT +#include //NOLINT +#include // NOLINT + +namespace mlir { +class FuncOp; +class ModuleOp; +class Operation; +class Attribute; +class Value; +} // namespace mlir + +namespace infrt::host_context { + +class CoreRuntimeBuilder; +class Value; +class ValueRef; +class KernelRegistry; + +/** + * MlirToRuntimeTranslator helps to translate a MLIR program to a CoreRuntime. + * This is the base class of all the modules those parse a MLIR program and + * finally generate a CoreRuntime. 
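+ *
+ * A minimal subclass (illustrative sketch only, not part of the API) simply
+ * overrides EmitFunction to decide what to do with each mlir::FuncOp found
+ * in the module, e.g.
+ *
+ *   class MyTranslator : public MlirToRuntimeTranslator {  // hypothetical
+ *    protected:
+ *     void EmitFunction(mlir::FuncOp op) override { ... }
+ *   };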
+ */ +class MlirToRuntimeTranslator { + public: + //! Holds all the function definitions. + using function_defs_t = std::unordered_map; + + explicit MlirToRuntimeTranslator(CoreRuntimeBuilder* runtime); + MlirToRuntimeTranslator(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); + + void Run() { EmitFunctions(); } + + virtual ~MlirToRuntimeTranslator(); + + protected: + //! Emit a "infrt.constant.*" operation, return true if succeed. + bool EmitConstantOp(mlir::Operation* op); + //! Emit a "infrt.return" operation. + bool EmitReturnOp(mlir::Operation* op, + llvm::SmallVectorImpl* results); + //! Emit a "ts.build_shape" operation. + bool EmitBuildShapeOp(mlir::Operation* op); + //! Emit an operation other than the special cases above. + bool EmitGeneralOp(mlir::Operation* op); + //! Emit all the functions. + bool EmitFunctions(); + + //! Emit a single function, this is an API that should be implemented by + //! inherients. + virtual void EmitFunction(mlir::FuncOp op); + + bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); + + template + boost::optional EmitAttribute(const mlir::Attribute* attr); + + Value* GetOpResult(mlir::Operation* op); + + Value* GetValue(mlir::Value value); + + Value* AddValue(mlir::Value value); + + Value* AddValue(mlir::Value mlir_value, Value* value); + + void UpdateCurFuncName(const std::string& name); + + protected: + struct Impl; + std::unique_ptr impl_; +}; + +/** + * Build a CoreRuntime from a MLIR module. + */ +void MlirToRuntimeTranslate(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); + +/** + * Execute a MLIR program, that is execute all the functions without input + * arguments. + * This is mainly used by testcase. + * @param module a MLIR module. + * @param registry the kernel registry containing all the valid kernels. + */ +void TestMlir(mlir::ModuleOp module, KernelRegistry* registry); + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc new file mode 100644 index 00000000000000..9b85be977ab6c1 --- /dev/null +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
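+
+// Smoke tests for the MLIR-to-runtime translation: small textual modules are
+// loaded with dialect::LoadMlirSource, translated, and executed against the
+// basic kernel registry via TestMlir().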
+ +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/mlir_program_executor.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +namespace infrt::host_context { + +TEST(MlirToRuntimeTranslate, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> () { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v1) : (f32) -> () + + infrt.return +} +)ROC"; + + auto module = dialect::LoadMlirSource(&context, source); + module->verify(); + + KernelRegistry registry; + kernel::RegisterFloatBasicKernels(®istry); + kernel::RegisterIntBasicKernels(®istry); + + TestMlir(module.get(), ®istry); +} + +TEST(TestMlir, basic) { + mlir::MLIRContext context; + + auto source = R"ROC( +func @main() -> () { + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + + "infrt.print.f32"(%v1) : (f32) -> () + + infrt.return +} +)ROC"; + + auto module = dialect::LoadMlirSource(&context, source); + module->verify(); + + KernelRegistry registry; + kernel::RegisterFloatBasicKernels(®istry); + kernel::RegisterIntBasicKernels(®istry); + + TestMlir(module.get(), ®istry); +} + +TEST(TestMlir, shadow_copy_tensor_profile) { + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + + auto head = R"ROC( +func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { +)ROC"; + + auto tpl0 = + "%a{0} = dt.shallow_copy_tensor %a : !infrt.tensor -> " + "!infrt.tensor"; + auto tpl1 = + "%b{0} = dt.shallow_copy_tensor %b : !infrt.tensor -> " + "!infrt.tensor"; + + auto end = R"ROC( +infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +} + )ROC"; + + std::stringstream ss; + ss << head; + for (int i = 0; i < 2000; i++) { + ss << llvm::formatv(tpl0, i).str() << "\n"; + ss << llvm::formatv(tpl1, i).str() << "\n"; + } + ss << end; + + auto content = ss.str(); + + // LOG(INFO) << "content: " << content << std::endl; + + auto module = dialect::LoadMlirSource(context, content); + module->verify(); + + host_context::KernelRegistry registry; + + kernel::RegisterBasicKernels(®istry); + kernel::RegisterTestKernels(®istry); + kernel::RegisterTensorShapeKernels(®istry); + kernel::RegisterTensorKernels(®istry); + kernel::RegisterControlFlowKernels(®istry); + + MlirProgramExecutor executor(*module, ®istry); + executor.BuildFunctions(); + + auto* func = executor.LookupFunc("predict"); + ASSERT_TRUE(func); + + std::vector in_args; + std::vector out_args( + {ValueRef(new Value(tensor::DenseHostTensor())), + ValueRef(new Value(tensor::DenseHostTensor()))}); + + auto create_tensor = [] { + tensor::DenseHostTensor a(tensor::TensorShape{{200, 3000}}, + DType(DType::Kind::F32)); + auto* data = reinterpret_cast(a.raw_data()); + for (int i = 0; i < a.shape().GetNumElements(); i++) { + data[i] = i; + } + return a; + }; + + std::vector 
inputs({ValueRef(new Value(create_tensor())), + ValueRef(new Value(create_tensor()))}); + in_args.assign({inputs[0].get(), inputs[1].get()}); + + for (int i = 0; i < 500; i++) { + func->Execute( + llvm::ArrayRef(in_args.data(), in_args.size()), + llvm::MutableArrayRef(out_args.data(), out_args.size())); + } +} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc new file mode 100644 index 00000000000000..6b10ed473719e2 --- /dev/null +++ b/paddle/infrt/host_context/op_executable.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/op_executable.h" + +#include + +#include "paddle/infrt/host_context/kernel_frame.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt::host_context { + +struct OpExecutable::Impl { + Impl(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry) + : name(op_name), + symbol_table(symbol_table), + kernel_registry(kernel_registry ? kernel_registry + : GetCpuKernelRegistry()) { + CHECK(kernel_registry); + } + + inline bool to_execute() const { + return !run_once || run_once && !has_executed; + } + inline void MarkRun() { has_executed = true; } + + std::string name; + SymbolTable* symbol_table{}; + KernelFrameBuilder frame; + KernelRegistry* kernel_registry{}; + + std::unique_ptr mlir_function_executable; + + KernelImplementation kernel_impl{}; + + //! Tell whether this Op should be executed only once. + bool run_once{}; + //! Tell whether this op has been executed. + bool has_executed{}; +}; + +OpExecutable::OpExecutable(OpExecutable::Impl* impl) : impl_(impl) {} + +const std::string& OpExecutable::name() const { return impl_->name; } + +OpExecutableBuilder::OpExecutableBuilder(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry) + : OpExecutable(new Impl(op_name, symbol_table, kernel_registry)) { + CHECK(impl_); + // CPU kernel registry is the default KernelRegistry. + impl_->kernel_impl = impl_->kernel_registry->GetKernel( + std::string(op_name.data(), op_name.size())); + // TODO(Superjomn) support other device other than CPU. 
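+  // The kernel implementation is resolved once here at construction time; if
+  // no kernel is registered under `op_name`, the CHECK below aborts.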
+ CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name; + + if (op_name == "dt.get_param") { + impl_->run_once = true; + } +} + +void OpExecutableBuilder::AppendArgument(const std::string& name) { + if (!impl_->symbol_table->GetValue(name)) { + impl_->symbol_table->Register(name); + } + impl_->frame.AddArgument(impl_->symbol_table->GetValue(name)); +} + +void OpExecutableBuilder::AppendArgument(Value* value) { + impl_->frame.AddArgument(value); +} + +KernelFrame& OpExecutable::frame() { return impl_->frame; } +const KernelFrame& OpExecutable::frame() const { return impl_->frame; } + +void OpExecutableBuilder::SetResults(llvm::ArrayRef result_names) { + llvm::SmallVector results; + for (size_t result_id = 0; result_id < result_names.size(); result_id++) { + Value* value = impl_->symbol_table->Register(result_names[result_id]); + results.push_back(value); + } + impl_->frame.SetResults(results); +} + +void OpExecutableBuilder::SetResults(llvm::ArrayRef results) { + impl_->frame.SetResults(results); +} + +void OpExecutableBuilder::AppendAttribute(Value* value) { + impl_->frame.AddAttribute(value); +} + +OpExecutableBuilder::OpExecutableBuilder(OpExecutableBuilder&& other) + : OpExecutable(other.impl_.release()) {} + +MlirFunctionExecutable* OpExecutableBuilder::CreateFunctionExecutable( + mlir::FuncOp op, MlirToRuntimeTranslator::function_defs_t* function_defs) { + CHECK(!impl_->mlir_function_executable); + impl_->mlir_function_executable.reset( + new MlirFunctionExecutable(op, impl_->kernel_registry, *function_defs)); + return impl_->mlir_function_executable.get(); +} + +MlirFunctionExecutable* OpExecutableBuilder::CreateFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + function_defs_t* function_defs) { + CHECK(!impl_->mlir_function_executable); + impl_->mlir_function_executable.reset(new MlirFunctionExecutable( + region, func_type, impl_->kernel_registry, *function_defs)); + return impl_->mlir_function_executable.get(); +} + +void OpExecutable::Execute() { +#ifndef NDEBUG + VLOG(3) << "execute " << name() + << " --- frame args: " << impl_->frame.GetNumArgs() << " results " + << impl_->frame.GetNumResults() << " attributes " + << impl_->frame.GetNumAttributes(); + for (int i = 0; i < impl_->frame.GetNumArgs(); i++) { + VLOG(3) << "function arg: " << impl_->frame.GetArgAt(i); + } + for (int i = 0; i < impl_->frame.GetNumResults(); i++) { + VLOG(3) << "function result: " << impl_->frame.GetResults()[i]; + } +#endif + + if (impl_->to_execute()) { + impl_->kernel_impl(&impl_->frame); + impl_->MarkRun(); + } +} + +OpExecutable::~OpExecutable() {} + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h new file mode 100644 index 00000000000000..e2248225a5cafa --- /dev/null +++ b/paddle/infrt/host_context/op_executable.h @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +#include +#include +#include + +#include "mlir/IR/Function.h" +#include "mlir/IR/Region.h" + +namespace mlir { +class FuncOp; +} // namespace mlir + +namespace infrt::host_context { + +class SymbolTable; +class KernelRegistry; +class KernelFrame; +class Value; +class CoreRuntimeBuilder; +class MlirFunctionExecutable; + +/** + * OpExecutable is a runtime executable instance for an operation. It captures + * all the information(Tensors, attributes + * and so on) needed for execution. + * With the SymbolTable and op definition, it create and hold a KernelFrame once + * and execute any times. + */ +class OpExecutable { + public: + KernelFrame& frame(); + const KernelFrame& frame() const; + + void Execute(); + + const std::string& name() const; + + ~OpExecutable(); + + protected: + class Impl; + explicit OpExecutable(Impl* impl); + + std::unique_ptr impl_; +}; + +/** + * Builder to help contruct an OpExecutable. + */ +class OpExecutableBuilder : public OpExecutable { + public: + using function_defs_t = std::unordered_map; + + OpExecutableBuilder(const std::string& op_name, + SymbolTable* symbol_table, + KernelRegistry* kernel_registry = nullptr); + OpExecutableBuilder(OpExecutableBuilder&& other); + + void AppendArgument(const std::string& name); + void AppendArgument(Value* value); + + void SetResults(llvm::ArrayRef result_names); + void SetResults(llvm::ArrayRef results); + + void AppendAttribute(Value* value); + + MlirFunctionExecutable* CreateFunctionExecutable( + mlir::FuncOp op, function_defs_t* function_defs); + + MlirFunctionExecutable* CreateFunctionExecutable( + mlir::Region* region, + mlir::FunctionType func_type, + function_defs_t* function_defs); +}; + +} // namespace infrt::host_context diff --git a/paddle/infrt/host_context/op_executable_test.cc b/paddle/infrt/host_context/op_executable_test.cc new file mode 100644 index 00000000000000..f981cca4426c1b --- /dev/null +++ b/paddle/infrt/host_context/op_executable_test.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/op_executable.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/symbol_table.h" + +namespace infrt { +namespace host_context { + +int add(int a, int b) { return a + b; } + +TEST(OpExecutable, basic) { + // register kernel + KernelRegistry registry; + registry.AddKernel("infrt.test.add.i32", INFRT_KERNEL(add)); + + SymbolTable table; + table.Register("a", 1); + table.Register("b", 2); + + OpExecutableBuilder executable("infrt.test.add.i32", &table, ®istry); + executable.AppendArgument("a"); + executable.AppendArgument("b"); + executable.SetResults({"c"}); + + executable.Execute(); + + // check the kernel frame has the result. 
+ auto results = executable.frame().GetResults(); + ASSERT_EQ(results.size(), 1UL); + ASSERT_EQ(results.front()->get(), 3); + + // check symbol table contains the same result instance. + LOG(INFO) << "type: " << table.GetValue("c")->type_info(); + int c = table.GetValue("c")->get(); + ASSERT_EQ(c, 3); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/symbol_table.cc b/paddle/infrt/host_context/symbol_table.cc new file mode 100644 index 00000000000000..318dc0cc55624b --- /dev/null +++ b/paddle/infrt/host_context/symbol_table.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/host_context/symbol_table.h" + +#include + +namespace infrt { +namespace host_context { + +struct SymbolTable::Impl { + std::unordered_map data; +}; + +SymbolTable::SymbolTable() : impl_(new Impl) {} + +Value* SymbolTable::Register(const std::string& key) { + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; + auto newitem = ValueRef(new Value); + impl_->data.emplace(key, newitem); + return newitem.get(); +} + +Value* SymbolTable::Register(const std::string& key, ValueRef value) { + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; + impl_->data.emplace(key, value); + return value.get(); +} + +Value* SymbolTable::GetValue(const std::string& key) const { + auto it = impl_->data.find(std::string(key)); + return it != impl_->data.end() ? it->second.get() : nullptr; +} + +// @{ +#define REGISTER_TYPE__(T) \ + template <> \ + T SymbolTable::Get(const std::string& key) { \ + auto it = impl_->data.find(std::string(key)); \ + CHECK(it != impl_->data.end()) << "No value called " << key; \ + return it->second->get(); \ + } +REGISTER_TYPE__(int32_t); +REGISTER_TYPE__(float); +REGISTER_TYPE__(double); +REGISTER_TYPE__(int64_t); +#undef REGISTER_TYPE__ +// @} + +SymbolTable::~SymbolTable() {} + +size_t SymbolTable::size() const { return impl_->data.size(); } + +// @{ +#define REGISTER_TYPE__(T) \ + template <> \ + Value* SymbolTable::Register(const std::string& key, T&& v) { \ + CHECK(!impl_->data.count(key)) << "Duplicate register [" << key << "]"; \ + auto newitem = ValueRef(v); \ + impl_->data.emplace(key, newitem); \ + return newitem.get(); \ + } +REGISTER_TYPE__(int) +REGISTER_TYPE__(float) +REGISTER_TYPE__(double) +REGISTER_TYPE__(bool) +#undef REGISTER_TYPE__ +// @} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/symbol_table.h b/paddle/infrt/host_context/symbol_table.h new file mode 100644 index 00000000000000..805215a78ce0d0 --- /dev/null +++ b/paddle/infrt/host_context/symbol_table.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include + +#include "paddle/infrt/host_context/value.h" + +namespace infrt { +namespace host_context { + +/** + * SymbolTable holds all the states of the kernel graph in the runtime. + */ +class SymbolTable { + public: + SymbolTable(); + + /** + * Register a state called \p key. + */ + Value* Register(const std::string& key); + + Value* Register(const std::string& key, ValueRef value); + + /** + * Register a state and set value. + */ + template + Value* Register(const std::string& key, T&& v); + + size_t size() const; + + /** + * Get a state called \p key. + */ + Value* GetValue(const std::string& key) const; + + template + T Get(const std::string& key); + + ~SymbolTable(); + + private: + class Impl; + + std::unique_ptr impl_; +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc new file mode 100644 index 00000000000000..8c3ccba3d0ba55 --- /dev/null +++ b/paddle/infrt/host_context/value.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
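+
+// value.cc provides the out-of-line ValueRef constructors and the CopyTo
+// helper, which dispatches on the concrete type currently held by the
+// source Value.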
+ +#include "paddle/infrt/host_context/value.h" + +#include "paddle/infrt/tensor/dense_tensor_view.h" + +namespace infrt { +namespace host_context { + +ValueRef::ValueRef(int32_t val) : Shared(new Value(val)) {} +ValueRef::ValueRef(int64_t val) : Shared(new Value(val)) {} +ValueRef::ValueRef(float val) : Shared(new Value(val)) {} +ValueRef::ValueRef(double val) : Shared(new Value(val)) {} +ValueRef::ValueRef(bool val) : Shared(new Value(val)) {} + +const char* Value::type_info() const { return __type_info__; } + +void CopyTo(const Value& from, Value* to) { + CHECK(from.valid()) << "from value is not valid, can't be copied"; + CHECK(to) << "to is not valid"; + visit( + [&](auto&& arg) { + using T = std::decay_t; + if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else if (std::is_same>::value) + to->data = arg; + else if (std::is_same>::value) + to->data = arg; + else if (std::is_same::value) + to->data = arg; + else + LOG(FATAL) << "Not supported Value copy: " << typeid(T).name(); + }, + from.data); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h new file mode 100644 index 00000000000000..4a2b92a7e69c59 --- /dev/null +++ b/paddle/infrt/host_context/value.h @@ -0,0 +1,156 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/common/object.h" +#include "paddle/infrt/common/shared.h" +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/support/variant.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" +#include "paddle/infrt/tensor/dense_tensor_view.h" +#include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt { +namespace host_context { + +struct MlirFunctionExecutable; + +using ValueVariantType = Variant, + std::vector, + std::vector, + std::vector, + std::vector>; + +//! Copy content from \param from to \param to. +void CopyTo(const Value& from, Value* to); + +/** + * Represents any data type for value in host context. 
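+ *
+ * The payload is stored in a Variant; a typical (illustrative) round trip:
+ *
+ *   Value v(int32_t{42});
+ *   int32_t x = v.get<int32_t>();   // x == 42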
+ */ +class Value : public common::Object { + public: + using variant_type = ValueVariantType; + + explicit Value() {} // NOLINT + explicit Value(int32_t x) : data(x) {} + explicit Value(int64_t x) : data(x) {} + explicit Value(float x) : data(x) {} + explicit Value(double x) : data(x) {} + explicit Value(bool x) : data(x) {} + explicit Value(std::string x) : data(x) {} + explicit Value(tensor::TensorMap&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(std::vector&& x) : data(x) {} + explicit Value(tensor::TensorShape&& x) : data(std::move(x)) {} + explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} + explicit Value(MlirFunctionExecutable* x) : data(x) {} + + template + const T& get() const { + return data.get(); + } + template + T& get() { + return data.get(); + } + + template + void set(T&& v) { + data = std::move(v); + } + + void set(Value* v) { data = std::move(v->data); } + + bool valid() const { return true; } + + const char* type_info() const override; + + friend void CopyTo(const Value& from, Value* to); + + private: + ValueVariantType data; + static constexpr const char* __type_info__ = "host_context_value"; +}; + +/** + * Represents a counted reference of a Value. + */ +class ValueRef : common::Shared { + public: + ValueRef() = default; + explicit ValueRef(Value* n) : common::Shared(n) {} + explicit ValueRef(int32_t val); + explicit ValueRef(int64_t val); + explicit ValueRef(float val); + explicit ValueRef(double val); + explicit ValueRef(bool val); + + using common::Shared::get; + using common::Shared::Reset; + using common::Shared::operator->; + using common::Shared::operator*; + //! Get a readonly data. + template + const T& get() const { + CHECK(p_); + return p_->get(); + } + + template + T& get() { + CHECK(p_); + return p_->get(); + } + + //! Assign a data. + template + void Assign(const T& x) { + if (!p_) { + p_ = common::make_shared(); + } + *p_ = x; + } + + template + void Assign(Args... args) { + p_ = common::make_shared(std::forward(args)...); + } + + inline bool IsValid() { return p_; } +}; + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/value_test.cc b/paddle/infrt/host_context/value_test.cc new file mode 100644 index 00000000000000..48d49478ce0efb --- /dev/null +++ b/paddle/infrt/host_context/value_test.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/host_context/value.h" + +#include + +namespace infrt { +namespace host_context { + +TEST(ValueRef, test) { + ValueRef x(12); + ASSERT_EQ(x.get(), 12); + + ValueRef y(1.2f); + ASSERT_EQ(y.get(), 1.2f); + + ValueRef z(true); + ASSERT_EQ(z.get(), true); +} + +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt new file mode 100644 index 00000000000000..da858aad28f818 --- /dev/null +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -0,0 +1,9 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + basic_kernels.cc + test_kernels.cc + tensor_shape_kernels.cc + tensor_kernels.cc + control_flow_kernels.cc + ) diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc new file mode 100644 index 00000000000000..d7f2c3865157dd --- /dev/null +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/basic_kernels.h" + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +using infrt::host_context::Attribute; + +namespace infrt::kernel { + +template +T add(T a, T b) { + return a + b; +} + +template +T sub(T a, T b) { + return a - b; +} + +template +T mul(T a, T b) { + return a * b; +} + +template +T div(T a, T b) { + return a / b; +} + +template +void print(T a) { + std::cout << a << std::endl; +} + +static std::string GetString(Attribute value) { + return value.get(); +} + +static void PrintString(const std::string &str) { + llvm::outs() << "string = " << str << '\n'; + llvm::outs().flush(); +} + +void RegisterBasicKernels(host_context::KernelRegistry *registry) { + RegisterIntBasicKernels(registry); + RegisterFloatBasicKernels(registry); + registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); +} + +void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); +} + +void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/basic_kernels.h b/paddle/infrt/kernel/basic_kernels.h new file mode 100644 index 00000000000000..9e98885cf6ebfb --- 
/dev/null +++ b/paddle/infrt/kernel/basic_kernels.h @@ -0,0 +1,34 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt::host_context { + +struct KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +/** + * Register all the basic kernels to \p registry. + */ +void RegisterBasicKernels(host_context::KernelRegistry* registry); + +void RegisterIntBasicKernels(host_context::KernelRegistry* registry); +void RegisterFloatBasicKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc new file mode 100644 index 00000000000000..6cc94dbcce0775 --- /dev/null +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/control_flow_kernels.h" + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" + +namespace infrt { +namespace kernel { + +static void INFRTCall( + host_context::RemainingArguments args, + host_context::RemainingResults results, + host_context::Attribute fn) { + VLOG(3) << "running call kernel ..."; + CHECK_EQ(fn.get()->num_arguments(), args.size()); + CHECK_EQ(fn.get()->num_results(), results.size()); + + for (auto& v : results.values()) { + CHECK(v.get()); + } + fn.get()->Execute(args.values(), results.values()); +} + +void RegisterControlFlowKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/control_flow_kernels.h b/paddle/infrt/kernel/control_flow_kernels.h new file mode 100644 index 00000000000000..5fa6b985f0b171 --- /dev/null +++ b/paddle/infrt/kernel/control_flow_kernels.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/kernel_utils.h" + +namespace infrt { + +namespace host_context { +struct KernelRegistry; +} // namespace host_context + +namespace kernel { + +void RegisterControlFlowKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc new file mode 100644 index 00000000000000..2fa477aa4dbda6 --- /dev/null +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensor_kernels.h" + +#include +#include + +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" +#include "paddle/infrt/tensor/dense_tensor_view.h" +#include "paddle/infrt/tensor/tensor_map.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::kernel { +using namespace host_context; // NOLINT +using namespace tensor; // NOLINT + +/// ===== Kernel begin ==== + +template +DenseHostTensor CreateUninitTensor(Attribute> shape) { + const auto &shape_data = shape.get(); + auto array = llvm::ArrayRef(shape_data.data(), shape_data.size()); + auto type = GetDType(); + return DenseHostTensor(TensorShape(array), type); +} + +void PrintTensor(const DenseHostTensor &tensor) { + std::cout << tensor << std::endl; +} + +template +void FillTensorWithConstant(DenseHostTensor *tensor, Attribute v) { + MutableDTArrayView(tensor).Fill(v.get()); +} + +TensorMap LoadParams(const std::string &path) { + return *(infrt::tensor::LoadParams(path)); +} + +DenseHostTensor GetParam(TensorMap map, Attribute nameAttr) { + auto &name = nameAttr.get(); + return *(map[name]); +} + +DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } + +/// ===== Kernel end ==== + +void RegisterTensorKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("dt.create_uninit_tensor.f32", + INFRT_KERNEL(CreateUninitTensor)); + registry->AddKernelAttrNameList("dt.create_uninit_tensor.f32", {"shape"}); + registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); + registry->AddKernel("dt.fill_tensor_with_constant.f32", + INFRT_KERNEL(FillTensorWithConstant)); + registry->AddKernel("dt.fill_tensor_with_constant.f64", + INFRT_KERNEL(FillTensorWithConstant)); + registry->AddKernel("dt.load_params", 
INFRT_KERNEL(LoadParams)); + registry->AddKernel("dt.get_param", INFRT_KERNEL(GetParam)); + registry->AddKernel("dt.shallow_copy_tensor", + INFRT_KERNEL(ShallowCopyTensor)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_kernels.h b/paddle/infrt/kernel/tensor_kernels.h new file mode 100644 index 00000000000000..8f2180ba80a4f8 --- /dev/null +++ b/paddle/infrt/kernel/tensor_kernels.h @@ -0,0 +1,25 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace infrt::host_context { +struct KernelRegistry; +} // namespace infrt::host_context + +namespace infrt::kernel { + +void RegisterTensorKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_shape_kernels.cc b/paddle/infrt/kernel/tensor_shape_kernels.cc new file mode 100644 index 00000000000000..a04b492819298b --- /dev/null +++ b/paddle/infrt/kernel/tensor_shape_kernels.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/tensor_shape_kernels.h" + +#include +#include +#include + +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/tensor/tensor_shape.h" + +namespace infrt::kernel { + +void PrintShape(const tensor::TensorShape& shape) { + llvm::raw_os_ostream oos(std::cout); + oos << shape << '\n'; +} + +void RegisterTensorShapeKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("ts.print_shape", INFRT_KERNEL(PrintShape)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/tensor_shape_kernels.h b/paddle/infrt/kernel/tensor_shape_kernels.h new file mode 100644 index 00000000000000..e87c6c37e88a08 --- /dev/null +++ b/paddle/infrt/kernel/tensor_shape_kernels.h @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace infrt::host_context { + +class KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +void RegisterTensorShapeKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc new file mode 100644 index 00000000000000..d5f64d09b602fd --- /dev/null +++ b/paddle/infrt/kernel/test_kernels.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/test_kernels.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/host_context/mlir_function_executable.h" +#include "paddle/infrt/tensor/dense_host_tensor.h" + +using infrt::host_context::Attribute; +using infrt::host_context::MlirFunctionExecutable; +using infrt::host_context::RemainingArguments; + +namespace infrt::kernel { +namespace { +class BenchmarkStats { + public: + BenchmarkStats(std::string name, + int num_warmup_runs, + int max_count, + std::chrono::microseconds benchmark_duration) + : name_{name}, + num_warmup_runs_{num_warmup_runs}, + max_count_{max_count}, + benchmark_duration_{benchmark_duration} {} + + void StartRun() { + ++cur_count_; + // Start recording CPU time. + cur_start_walltime_ = std::chrono::steady_clock::now(); + cur_start_cpu_ = std::clock(); + } + + void StopRun() { + // Do not collect the runtime statistics if we are still in the warm up + // period. + if (cur_count_ <= num_warmup_runs_) return; + + // Stop the CPU timer. + std::clock_t cur_stop_cpu_ = std::clock(); + + // Stop the wall clock timer. + auto cur_stop_walltime_ = std::chrono::steady_clock::now(); + + // Collect the wall clock duration. + auto duration_walltime_ = cur_stop_walltime_ - cur_start_walltime_; + run_times_walltime_.push_back(duration_walltime_); + + // Collect the CPU duration in microseconds. + // First cast to integer that represents microseconds with truncation, as + // does std::chrono::duration_cast. Then cast to std::chrono::microseconds. + std::clock_t duration_cpu_raw = cur_stop_cpu_ - cur_start_cpu_; + auto duration_cpu_ = static_cast( + static_cast(1e9 * duration_cpu_raw / CLOCKS_PER_SEC)); + + run_times_cpu_.push_back(duration_cpu_); + + total_duration_walltime_ += duration_walltime_; + total_duration_cpu_ += duration_cpu_; + } + // Return if we should we run more rounds. + bool MoreRun() const { + return cur_count_ < max_count_ + num_warmup_runs_ && + total_duration_walltime_ < benchmark_duration_; + } + + // Summarize the benchmark results. 
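+  // Results are printed as "BM:<name>:" prefixed lines (run count, total
+  // duration, min/max and 50/95/99th percentile wall-clock and CPU times,
+  // plus CPU utilization), which makes them easy to grep from lit output.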
+ void Summarize() { + std::sort(run_times_walltime_.begin(), run_times_walltime_.end()); + std::sort(run_times_cpu_.begin(), run_times_cpu_.end()); + + auto percentile = []( + double p, const std::vector &run_times) { + assert(p >= 0.0 && p <= 1.0); + return run_times[run_times.size() * p]; + }; + + // BM: prefix is added to make grepping results from lit output easier. + std::string prefix; + llvm::raw_string_ostream(prefix) << "BM:" << name_ << ':'; + auto cpu_utilization = + total_duration_cpu_.count() * 100.0 / total_duration_walltime_.count(); + + llvm::outs() << prefix << "Count: " << run_times_walltime_.size() << '\n'; + llvm::outs() << prefix + << "Duration(ns): " << total_duration_walltime_.count() + << '\n'; + llvm::outs() << prefix + << "Time Min(ns): " << run_times_walltime_.front().count() + << '\n'; + llvm::outs() << prefix + << "Time Max(ns): " << run_times_walltime_.back().count() + << '\n'; + llvm::outs() << prefix << "Time 50%(ns): " + << percentile(0.5, run_times_walltime_).count() << '\n'; + llvm::outs() << prefix << "Time 95%(ns): " + << percentile(0.95, run_times_walltime_).count() << '\n'; + llvm::outs() << prefix << "Time 99%(ns): " + << percentile(0.99, run_times_walltime_).count() << '\n'; + // Log CPU time statistics. + llvm::outs() << prefix + << "CPU Duration(ns): " << total_duration_cpu_.count() << '\n'; + llvm::outs() << prefix << "CPU Min(ns): " << run_times_cpu_.front().count() + << '\n'; + llvm::outs() << prefix << "CPU Max(ns): " << run_times_cpu_.back().count() + << '\n'; + llvm::outs() << prefix + << "CPU 50%(ns): " << percentile(0.5, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix + << "CPU 95%(ns): " << percentile(0.95, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix + << "CPU 99%(ns): " << percentile(0.99, run_times_cpu_).count() + << '\n'; + llvm::outs() << prefix << "CPU utilization(percent): " << cpu_utilization + << "\n"; + llvm::outs().flush(); + } + + private: + const std::string name_; + const int num_warmup_runs_; + const int max_count_; + int cur_count_ = 0; + const std::chrono::nanoseconds benchmark_duration_; + std::chrono::nanoseconds total_duration_walltime_{}; + std::chrono::nanoseconds total_duration_cpu_{}; + std::chrono::time_point cur_start_walltime_{}; + std::clock_t cur_start_cpu_; + std::vector run_times_walltime_; + // CPU run times in microseconds. + std::vector run_times_cpu_; +}; + +} // anonymous namespace + +// This op benchmarks the input function by running the function in a loop +// up to a max count or max time as specified in the function's attributes. +// +// Attributes: +// duration_secs: Benchmark duration in seconds. +// max_count: Max run count of input function. +// name: The name used to tag the benchmark results. +// num_warmup_runs: Number of warm up runs before benchmarking starts. +// fn: The input function to be benchmarked. +static void benchmark(RemainingArguments args, + host_context::RemainingResults results, + Attribute duration_secs, + Attribute max_count, + Attribute name, + Attribute num_warmup_runs, + Attribute fn) { + BenchmarkStats bm_stats{name.get(), + num_warmup_runs.get(), + max_count.get(), + std::chrono::seconds(duration_secs.get())}; + + while (bm_stats.MoreRun()) { + bm_stats.StartRun(); + fn.get()->Execute(args.values(), results.values(), true); + bm_stats.StopRun(); + } + bm_stats.Summarize(); +} + +// Just copy the input to the result. 
+tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { + return src; +} + +void RegisterTestKernels(host_context::KernelRegistry *registry) { + registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("infrt.test.shadow_copy_tensor", + INFRT_KERNEL(ShadowCopyTensor)); +} + +} // namespace infrt::kernel diff --git a/paddle/infrt/kernel/test_kernels.h b/paddle/infrt/kernel/test_kernels.h new file mode 100644 index 00000000000000..f42884dfaf2c90 --- /dev/null +++ b/paddle/infrt/kernel/test_kernels.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt::host_context { + +struct KernelRegistry; + +} // namespace infrt::host_context + +namespace infrt::kernel { + +/** + * Register all the test kernels to registry. + */ +void RegisterTestKernels(host_context::KernelRegistry* registry); + +} // namespace infrt::kernel diff --git a/paddle/infrt/paddle/CMakeLists.txt b/paddle/infrt/paddle/CMakeLists.txt new file mode 100644 index 00000000000000..172d78ecde3b80 --- /dev/null +++ b/paddle/infrt/paddle/CMakeLists.txt @@ -0,0 +1,24 @@ +proto_library(paddle_framework_proto SRCS framework.proto) + +add_subdirectory(cpp) +add_subdirectory(pb) + +core_gather_headers() + +gather_srcs(infrt_src SRCS + model_parser.cc + scope.cc + tensor.cc + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/cpp/CMakeLists.txt b/paddle/infrt/paddle/cpp/CMakeLists.txt new file mode 100644 index 00000000000000..0feaabd2fa7c93 --- /dev/null +++ b/paddle/infrt/paddle/cpp/CMakeLists.txt @@ -0,0 +1,16 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/cpp/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/cpp/desc_api.h b/paddle/infrt/paddle/cpp/desc_api.h new file mode 100644 index 00000000000000..ccd79c048ab145 --- /dev/null +++ b/paddle/infrt/paddle/cpp/desc_api.h @@ -0,0 +1,229 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +namespace infrt::paddle::cpp { + +/* + * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc + * classes should implement this. + */ +class VarDescAPI { + public: + enum class Type { + // Pod Types + BOOL = 0, + INT16, + INT32, + INT64, + FP16, + FP32, + FP64, + // Tensor is used in C++. + SIZE_T, + UINT8, + INT8, + + // Other types that may need additional descriptions + LOD_TENSOR, + SELECTED_ROWS, + FEED_MINIBATCH, + FETCH_LIST, + STEP_SCOPES, + LOD_RANK_TABLE, + LOD_TENSOR_ARRAY, + PLACE_LIST, + READER, + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW, + TUPLE + }; + + using VarDataType = Type; + + virtual ~VarDescAPI() = default; + + // Get var's name + virtual std::string Name() const = 0; + // Set var's name + virtual void SetName(std::string name) = 0; + // Get var's type + virtual Type GetType() const = 0; + // Set var's type + virtual void SetType(Type type) = 0; + // Tell whether var is persistable or not + virtual bool Persistable() const = 0; + // Set var to be persistable or not + virtual void SetPersistable(bool persistable) = 0; + // Get var's shape + virtual std::vector GetShape() const = 0; + // Set var's shape + virtual void SetShape(const std::vector& dims) = 0; +}; + +/* + * NOTE Some interfaces are weried, we remain them unchanged to keep compatible + * with framework::OpDesc in Fluid framework. + */ +class OpDescAPI { + public: + // The AttrType is used to make the proto::AttrType portable. + enum class AttrType { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, + UNK, + }; + + virtual ~OpDescAPI() = default; + + /// Get operator's type. + virtual std::string Type() const = 0; + /// Set operator's type. + virtual void SetType(const std::string& type) = 0; + /// Get arguments given the parameter. + virtual std::vector Input(const std::string& param) const = 0; + /// Get parameters. + virtual std::vector InputArgumentNames() const = 0; + /// Get arguments given the parameter. + virtual std::vector Output(const std::string& param) const = 0; + /// Get parameters. + virtual std::vector OutputArgumentNames() const = 0; + /// Set a input given the parameter and arguments. + virtual void SetInput(const std::string& param, + const std::vector& args) = 0; + virtual void SetOutput(const std::string& param, + const std::vector& args) = 0; + /// Tell whether this desc has an attribute. + virtual bool HasAttr(const std::string& name) const = 0; + + /// Get the type of an attribute. + virtual AttrType GetAttrType(const std::string& name) const = 0; + + virtual std::vector AttrNames() const = 0; + + /// Set an attribute. + template + void SetAttr(const std::string& name, const T& v); + + /// Get an attribute. 
+ template + T GetAttr(const std::string& name) const; + + std::string Repr() const { + std::stringstream ss; + ss << Type(); + ss << "("; + for (auto& arg : InputArgumentNames()) { + ss << arg << ":"; + for (auto val : Input(arg)) { + ss << val << " "; + } + } + ss << ") -> ("; + for (auto& arg : OutputArgumentNames()) { + ss << arg << ":"; + for (auto val : Output(arg)) { + ss << val << " "; + } + } + ss << ")"; + return ss.str(); + } +}; + +class BlockDescAPI { + public: + virtual ~BlockDescAPI() = default; + + virtual int32_t Idx() const = 0; + + virtual void SetIdx(int32_t idx) = 0; + + virtual int32_t ParentIdx() const = 0; + + virtual void SetParentIdx(int32_t idx) = 0; + + virtual size_t VarsSize() const = 0; + + virtual void ClearVars() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetVar(int32_t idx); + + template + T* AddVar(); + + virtual size_t OpsSize() const = 0; + + virtual void ClearOps() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetOp(int32_t idx); + + template + T* AddOp(); + + virtual int32_t ForwardBlockIdx() const = 0; + + virtual void SetForwardBlockIdx(int32_t idx) = 0; +}; + +class ProgramDescAPI { + public: + virtual ~ProgramDescAPI() = default; + + virtual size_t BlocksSize() const = 0; + + virtual void ClearBlocks() = 0; + + // NOTE: This ugly method is used to compatible interfaces between cpp and + // pb/nb backends + // TODO(sangoly): refine this + template + T* GetBlock(int32_t idx); + + template + T* AddBlock(); + + virtual bool HasVersion() const = 0; + + virtual int64_t Version() const = 0; + + virtual void SetVersion(int64_t version) = 0; +}; + +} // namespace infrt::paddle::cpp diff --git a/paddle/infrt/paddle/framework.proto b/paddle/infrt/paddle/framework.proto new file mode 100644 index 00000000000000..634ec9665d08e0 --- /dev/null +++ b/paddle/infrt/paddle/framework.proto @@ -0,0 +1,213 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +package paddle.framework.proto; + +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serailization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; + BLOCKS = 10; + LONGS = 11; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. 
+message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + repeated int32 blocks_idx = 14; + repeated int64 longs = 15; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. + message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +message VarType { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + // Tensor is used in C++. + SIZE_T = 19; + UINT8 = 20; + INT8 = 21; + + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; + TUPLE = 18; + } + + required Type type = 1; + + message TensorDesc { + // Should only be PODType. 
Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + } + optional TensorDesc selected_rows = 2; + + message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorDesc lod_tensor = 3; + + message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorArrayDesc tensor_array = 4; + + message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + optional ReaderDesc reader = 5; + + message Tuple { repeated Type element_type = 1; } + optional Tuple tuple = 7; +} + +message VarDesc { + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; + // True if the variable is an input data and + // have to check the feed data shape and dtype + optional bool need_check_feed = 4 [ default = false ]; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; + optional int32 forward_block_idx = 5 [ default = -1 ]; +} + +// CompatibleInfo is used to determine if a feature is compatible and +// provides the information. +message CompatibleInfo { + enum Type { + COMPATIBLE = 0; + DEFINITELY_NOT = 1; + POSSIBLE = 2; + BUG_FIX = 3; + PRECISION_CHANGE = 4; + } + required string version = 1; + required Type type = 2; +} + +// In some cases, Paddle Fluid may perform operator definition iterations, +// and the operator uses OpCompatibleMap for compatibility testing. +message OpCompatibleMap { + message OpCompatiblePair { + required string op_name = 1; + required CompatibleInfo compatible_info = 2; + } + repeated OpCompatiblePair pair = 1; + optional string default_required_version = 2; +} + +// Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +// for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? +message ProgramDesc { + reserved 2; // For backward compatibility. + repeated BlockDesc blocks = 1; + optional Version version = 4; + optional OpCompatibleMap op_compatible_map = 3; +} \ No newline at end of file diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc new file mode 100644 index 00000000000000..285280e69435b0 --- /dev/null +++ b/paddle/infrt/paddle/model_parser.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
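The messages above are plain proto2 definitions, so the generated C++ accessors can be used directly. A minimal sketch (not applied by this patch) that builds a one-attribute "scale" OpDesc; the op type, argument names and attribute value are illustrative only:

#include "paddle/infrt/paddle/framework.pb.h"

namespace fp = ::paddle::framework::proto;

fp::OpDesc MakeScaleOpDesc() {
  fp::OpDesc op;
  op.set_type("scale");                     // required string type = 3
  fp::OpDesc_Var* x = op.add_inputs();      // repeated Var inputs = 1
  x->set_parameter("X");
  x->add_arguments("x0");                   // illustrative argument name
  fp::OpDesc_Var* out = op.add_outputs();   // repeated Var outputs = 2
  out->set_parameter("Out");
  out->add_arguments("out0");
  fp::OpDesc_Attr* attr = op.add_attrs();   // repeated Attr attrs = 4
  attr->set_name("scale");
  attr->set_type(fp::FLOAT);                // AttrType enum defined above
  attr->set_f(2.0f);
  return op;
}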
+ +#include "paddle/infrt/paddle/model_parser.h" + +#include +#include + +#include "paddle/infrt/common/common.h" +#include "paddle/infrt/common/string.h" +#include "paddle/infrt/common/target.h" +#include "paddle/infrt/common/type.h" + +namespace infrt::paddle { + +int SizeOfType(framework_proto::VarType::Type type) { + using Type = framework_proto::VarType::Type; + switch (static_cast(type)) { +#define DO(desc, type) \ + case Type::VarType_Type_##desc: \ + return sizeof(type); + DO(BOOL, bool); + DO(FP16, float); + DO(FP32, float); + DO(INT8, int8_t); + DO(INT16, int16_t); + DO(INT32, int); + DO(INT64, int64_t); +#undef DO + default: + LOG(FATAL) << "unknown data type " << type; + } + return -1; +} + +void TensorFromStream(std::istream &is, + _Tensor_ *tensor, + const common::Target &target) { + using Type = framework_proto::VarType::Type; + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; + // read tensor desc + framework_proto::VarType::TensorDesc desc; + { + // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + CHECK(desc.ParseFromArray(buf.get(), size)) << "Cannot parse tensor desc"; + } + + // read tensor + std::vector dims_vec; + std::copy( + desc.dims().begin(), desc.dims().end(), std::back_inserter(dims_vec)); + Shape dims(dims_vec); + tensor->Resize(dims); + void *buf; + size_t size = tensor->shape().numel() * SizeOfType(desc.data_type()); + // alllocate memory + if (target.arch == Target::Arch::X86) { + switch (static_cast(desc.data_type())) { +#define SET_TENSOR(desc, type, precision) \ + case Type::VarType_Type_##desc: \ + buf = tensor->mutable_data(target); \ + tensor->set_type(precision); \ + break + + SET_TENSOR(FP32, float, Float(32)); + SET_TENSOR(INT8, int8_t, Int(8)); + SET_TENSOR(INT16, int16_t, Int(16)); + SET_TENSOR(INT32, int32_t, Int(32)); + SET_TENSOR(INT64, int64_t, Int(64)); +#undef SET_TENSOR + default: + LOG(FATAL) << "unknown type " << desc.data_type(); + } + // tensor->set_persistable(true); + is.read(static_cast(buf), size); + } else if (target.arch == Target::Arch::NVGPU) { +#ifdef INFRT_WITH_CUDA + if (desc.data_type() != Type::VarType_Type_FP32) + LOG(FATAL) << "[CUDA] The type is not fp32!!"; + auto *data = tensor->mutable_data(target); + tensor->set_type(infrt::common::Float(32)); + std::vector temp(tensor->shape().numel()); + // LOG(INFO) <<"[CUDA] The tensor's size is "<< tensor->shape().numel(); + is.read(reinterpret_cast(temp.data()), size); + CUDA_CALL(cudaMemcpy(reinterpret_cast(data), + temp.data(), + tensor->shape().numel() * sizeof(float), + cudaMemcpyHostToDevice)); +#else + LOG(FATAL) << "To use CUDA backends, you need to set WITH_CUDA ON!"; +#endif + } else { + INFRT_NOT_IMPLEMENTED + } +} + +void LoadLoDTensor(std::istream &is, _Variable *var, const Target &target) { + auto &tensor = var->get(); + uint32_t version{}; + is.read(reinterpret_cast(&version), sizeof(version)); + VLOG(3) << "model version " << version; + + // Load LoD information + uint64_t lod_level{}; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(uint64_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + // lod[i] = tmp; + } + + TensorFromStream(is, tensor.operator->(), target); +} + +void 
ReadBinaryFile(const std::string &filename, std::string *contents) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + CHECK(fin.is_open()) << "Cannot open file: " << filename; + fin.seekg(0, std::ios::end); + auto size = fin.tellg(); + contents->clear(); + contents->resize(size); + fin.seekg(0, std::ios::beg); + fin.read(&(contents->at(0)), contents->size()); + fin.close(); +} + +std::unique_ptr LoadProgram( + const std::string &path, bool program_from_memory) { + std::unique_ptr main_program( + new framework_proto::ProgramDesc); + if (!program_from_memory) { + std::string desc_str; + ReadBinaryFile(path, &desc_str); + main_program->ParseFromString(desc_str); + } else { + main_program->ParseFromString(path); + } + return main_program; +} + +void LoadParams(const std::string &path) {} + +// Load directly to CPU, and latter transfer to other devices. +void LoadParam(const std::string &path, _Variable *out, const Target &target) { + std::ifstream fin(path, std::ios::binary); + CHECK(fin.is_open()) << "failed to open file " << path; + LoadLoDTensor(fin, out, target); +} + +} // namespace infrt::paddle diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h new file mode 100644 index 00000000000000..73125fadedb82b --- /dev/null +++ b/paddle/infrt/paddle/model_parser.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "paddle/infrt/paddle/framework.pb.h" +#include "paddle/infrt/paddle/pb/block_desc.h" +#include "paddle/infrt/paddle/pb/op_desc.h" +#include "paddle/infrt/paddle/pb/program_desc.h" +#include "paddle/infrt/paddle/scope.h" +#include "paddle/infrt/paddle/tensor.h" + +namespace infrt::paddle { +namespace framework_proto = ::paddle::framework::proto; + +// Read a __model__ file. +std::unique_ptr LoadProgram( + const std::string& path, bool program_from_memory = false); + +void LoadLoDTensor(std::istream& is, + _Variable* var, + const common::Target& target); + +// Read a single file containing all the parameters. +void LoadParams(const std::string& path); + +// Load a single parameter to an output tensor. 
+void LoadParam(const std::string& path, + _Variable* out, + const common::Target& target); + +// LoDTensor to ostream +void TensorToStream(std::ostream& os, const _Tensor_& tensor); +void TensorFromStream( + std::istream& is, + _Tensor_* tensor, + const common::Target& target = common::DefaultHostTarget()); +void ReadBinaryFile(const std::string& filename, std::string* contents); + +} // namespace infrt::paddle diff --git a/paddle/infrt/paddle/pb/CMakeLists.txt b/paddle/infrt/paddle/pb/CMakeLists.txt new file mode 100644 index 00000000000000..fac38afa62db28 --- /dev/null +++ b/paddle/infrt/paddle/pb/CMakeLists.txt @@ -0,0 +1,20 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + var_desc.cc + op_desc.cc + block_desc.cc + program_desc.cc + ) + +foreach(cpp ${SRCS}) + set(infrt_src + "${infrt_src};infrt/paddle/pb/${cpp}" + CACHE INTERNAL "") +endforeach() + +file(GLOB includes LIST_DIRECTORIES false RELATIVE ${CMAKE_SOURCE_DIR} *.h) + +foreach(header ${includes}) + set(core_includes "${core_includes};${header}" CACHE INTERNAL "") +endforeach() diff --git a/paddle/infrt/paddle/pb/block_desc.cc b/paddle/infrt/paddle/pb/block_desc.cc new file mode 100644 index 00000000000000..11186bc68af164 --- /dev/null +++ b/paddle/infrt/paddle/pb/block_desc.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/paddle/pb/block_desc.h" + +namespace infrt::paddle::pb { + +template <> +framework_proto::VarDesc* BlockDesc::GetVar( + int32_t idx) { + CHECK_LT(idx, static_cast(VarsSize())) << "idx >= vars.size()"; + return desc_->mutable_vars(idx); +} + +template <> +framework_proto::VarDesc* BlockDesc::AddVar() { + return desc_->add_vars(); +} + +template <> +framework_proto::OpDesc* BlockDesc::GetOp( + int32_t idx) { + CHECK_LT(idx, static_cast(OpsSize())) << "idx >= ops.size()"; + return desc_->mutable_ops(idx); +} + +template <> +framework_proto::OpDesc* BlockDesc::AddOp() { + return desc_->add_ops(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/block_desc.h b/paddle/infrt/paddle/pb/block_desc.h new file mode 100644 index 00000000000000..9c1b7f9adf172f --- /dev/null +++ b/paddle/infrt/paddle/pb/block_desc.h @@ -0,0 +1,77 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
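The specializations in block_desc.cc above are the only template members pb::BlockDesc provides; everything else goes through the ordinary proto accessors. A minimal sketch (not applied by this patch) that appends a fetch variable and a fetch op to an existing block via the wrapper declared in the header below; the "fetch" names are illustrative:

#include "paddle/infrt/paddle/framework.pb.h"
#include "paddle/infrt/paddle/pb/block_desc.h"

namespace fp = ::paddle::framework::proto;

void AppendFetch(fp::BlockDesc* raw_block) {
  infrt::paddle::pb::BlockDesc block(raw_block);
  fp::VarDesc* fetch_var = block.AddVar<fp::VarDesc>();   // appends to vars
  fetch_var->set_name("fetch");
  fetch_var->mutable_type()->set_type(fp::VarType::FETCH_LIST);
  fp::OpDesc* fetch_op = block.AddOp<fp::OpDesc>();        // appends to ops
  fetch_op->set_type("fetch");
  // Inputs, outputs and attrs are filled with the same proto accessors as in
  // the OpDesc sketch after framework.proto.
}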
+ +#pragma once +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { + +namespace framework_proto = ::paddle::framework::proto; + +class BlockDesc : public cpp::BlockDescAPI { + public: + BlockDesc() = delete; + + explicit BlockDesc(framework_proto::BlockDesc* desc) : desc_(desc) { + CHECK(desc_); + } + + framework_proto::BlockDesc* Proto() { return desc_; } + + const framework_proto::BlockDesc& ReadonlyProto() const { return *desc_; } + + int32_t Idx() const override { return desc_->idx(); } + + void SetIdx(int32_t idx) override { desc_->set_idx(idx); } + + int32_t ParentIdx() const override { return desc_->parent_idx(); } + + void SetParentIdx(int32_t idx) override { desc_->set_parent_idx(idx); } + + size_t VarsSize() const override { return desc_->vars_size(); } + + void ClearVars() override { desc_->clear_vars(); } + + template + T* GetVar(int32_t idx); + + template + T* AddVar(); + + size_t OpsSize() const override { return desc_->ops_size(); } + + void ClearOps() override { desc_->clear_ops(); } + + template + T* GetOp(int32_t idx); + + template + T* AddOp(); + + int32_t ForwardBlockIdx() const override { + return desc_->forward_block_idx(); + } + + void SetForwardBlockIdx(int32_t idx) override { + desc_->set_forward_block_idx(idx); + } + + private: + framework_proto::BlockDesc* desc_; // not_own +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/op_desc.cc b/paddle/infrt/paddle/pb/op_desc.cc new file mode 100644 index 00000000000000..c7b1e66f506425 --- /dev/null +++ b/paddle/infrt/paddle/pb/op_desc.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
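The op_desc.cc that follows supplies the SetAttr/GetAttr specializations for the scalar and vector attribute kinds listed in cpp::OpDescAPI::AttrType. A minimal round-trip sketch (not applied by this patch); the attribute names are illustrative:

#include <vector>

#include "paddle/infrt/paddle/framework.pb.h"
#include "paddle/infrt/paddle/pb/op_desc.h"

void RoundTripAttrs(::paddle::framework::proto::OpDesc* raw_op) {
  infrt::paddle::pb::OpDesc op(raw_op);
  op.SetAttr<float>("scale", 2.0f);               // stored as FLOAT / f
  op.SetAttr<std::vector<int>>("axes", {0, 1});   // stored as INTS / ints
  float scale = op.GetAttr<float>("scale");
  std::vector<int> axes = op.GetAttr<std::vector<int>>("axes");
  (void)scale;
  (void)axes;
}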
+ +#include "paddle/infrt/paddle/pb/op_desc.h" + +namespace infrt::paddle::pb { + +google::protobuf::internal::RepeatedPtrIterator +FindAttr(framework_proto::OpDesc *desc, const std::string &name) { + auto &xs = *desc->mutable_attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + if (it == xs.end()) { + auto *attr = xs.Add(); + attr->set_name(name); + it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + } + return it; +} + +#define SET_IMPL_ONE(T, ty__, pb_f__) \ + template <> \ + void OpDesc::SetAttr(const std::string &name, const T &v) { \ + auto it = FindAttr(desc_, name); \ + it->set_type(framework_proto::ty__); \ + it->set_##pb_f__(v); \ + } +SET_IMPL_ONE(int, INT, i); +SET_IMPL_ONE(float, FLOAT, f); +SET_IMPL_ONE(bool, BOOLEAN, b); +SET_IMPL_ONE(int64_t, LONG, l); + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::INTS); + it->clear_ints(); + for (auto &i : v) { + it->add_ints(i); + } +} + +template <> +void OpDesc::SetAttr(const std::string &name, + const std::string &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::STRING); + it->set_s(v.c_str()); +} + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::FLOATS); + it->clear_floats(); + for (auto &i : v) { + it->add_floats(i); + } +} + +template <> +void OpDesc::SetAttr>( + const std::string &name, const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::STRINGS); + it->clear_strings(); + for (auto &i : v) { + it->add_strings(i); + } +} + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v) { + auto it = FindAttr(desc_, name); + it->set_type(framework_proto::LONGS); + it->clear_longs(); + for (auto &i : v) { + it->add_longs(i); + } +} +google::protobuf::internal::RepeatedPtrIterator< + const framework_proto::OpDesc_Attr> +GetFindAttr(const framework_proto::OpDesc &desc, const std::string &name) { + auto &xs = desc.attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + return it; +} + +#define GET_ATTR_IMPL(T, pb_f__) \ + template <> \ + T OpDesc::GetAttr(const std::string &name) const { \ + auto it = GetFindAttr(*desc_, name); \ + return it->pb_f__(); \ + } + +#define GET_ATTRS_IMPL(T, pb_f__) \ + template <> \ + T OpDesc::GetAttr(const std::string &name) const { \ + auto it = GetFindAttr(*desc_, name); \ + T res; \ + for (const auto &v : it->pb_f__()) { \ + res.push_back(v); \ + } \ + return res; \ + } +GET_ATTR_IMPL(int32_t, i); +GET_ATTR_IMPL(int16_t, block_idx); +GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(bool, b); +GET_ATTR_IMPL(int64_t, l); +GET_ATTRS_IMPL(std::vector, ints); +GET_ATTRS_IMPL(std::vector, floats); +GET_ATTRS_IMPL(std::vector, strings); +GET_ATTR_IMPL(std::string, s); +GET_ATTRS_IMPL(std::vector, longs); + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/op_desc.h b/paddle/infrt/paddle/pb/op_desc.h new file mode 100644 index 00000000000000..81d57d9f322527 --- /dev/null +++ b/paddle/infrt/paddle/pb/op_desc.h @@ -0,0 +1,198 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" +#include "paddle/infrt/support/variant.h" + +namespace infrt::paddle::pb { + +namespace framework_proto = ::paddle::framework::proto; + +using Attribute = + Variant, std::vector>; +using VariableNameMap = std::map>; + +/* + * The lite::OpDesc, an light-weight implementation of wrapper of proto::OpDesc. + * Unlike the original one in framework::OpDesc, we remove the local members + * except the desc_, to avoid the inconsistent state, which is normal in the + * original interface and results in bugs. + */ +class OpDesc : public cpp::OpDescAPI { + public: + OpDesc() = delete; + + explicit OpDesc(framework_proto::OpDesc *desc) : desc_(desc) { CHECK(desc_); } + + framework_proto::OpDesc *Proto() { return desc_; } + const framework_proto::OpDesc &ReadonlyProto() const { return *desc_; } + + std::string Type() const override { return desc_->type(); } + + void SetType(const std::string &type) override { desc_->set_type(type); } + + // Get the arguments of parameter called `param` + std::vector Input(const std::string ¶m) const override { + return GetArguments(desc_->inputs(), param); + } + + std::vector InputArgumentNames() const override { + return GetArgumentNames(desc_->inputs()); + } + + void SetInput(const std::string ¶m, + const std::vector &args) override { + SetArgument(desc_->mutable_inputs(), param, args); + } + + std::vector Output(const std::string ¶m) const override { + return GetArguments(desc_->outputs(), param); + } + + std::vector OutputArgumentNames() const override { + return GetArgumentNames(desc_->outputs()); + } + + void SetOutput(const std::string ¶m, + const std::vector &args) override { + SetArgument(desc_->mutable_outputs(), param, args); + } + + bool HasAttr(const std::string &name) const override { + const auto &xs = desc_->attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + return it != xs.end(); + } + + AttrType GetAttrType(const std::string &name) const override { + const auto &xs = desc_->attrs(); + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Attr &x) { + return x.name() == name; + }); + CHECK(it != xs.end()); +#define DEF_ONE(type__) \ + case framework_proto::AttrType::type__: \ + return AttrType::type__; + + switch (it->type()) { + DEF_ONE(INT); + DEF_ONE(FLOAT); + DEF_ONE(STRING); + DEF_ONE(INTS); + DEF_ONE(FLOATS); + DEF_ONE(STRINGS); + DEF_ONE(BOOLEAN); + DEF_ONE(BOOLEANS); + DEF_ONE(BLOCK); + DEF_ONE(LONG); + DEF_ONE(BLOCKS); + DEF_ONE(LONGS); + default: + LOG(FATAL) << "Unknown attribute type"; + return static_cast(-1); + } +#undef DEF_ONE + } + + std::vector AttrNames() const override { + std::vector res; + const auto &xs = desc_->attrs(); + std::transform( + xs.begin(), + xs.end(), + std::back_inserter(res), + [](const framework_proto::OpDesc_Attr &x) { 
return x.name(); }); + return res; + } + + template + void SetAttr(const std::string &name, const T &v); + + template + T GetAttr(const std::string &name) const; + + private: + std::vector GetArguments( + const google::protobuf::RepeatedPtrField &xs, + const std::string ¶m) const { + std::vector res; + auto it = std::find_if( + xs.begin(), xs.end(), [&](const framework_proto::OpDesc_Var &it) { + return it.parameter() == param; + }); + CHECK(it != xs.end()); + + const auto &ys = it->arguments(); + std::transform(ys.begin(), + ys.end(), + std::back_inserter(res), + [](const std::string &x) { return x; }); + return res; + } + + void SetArgument( + google::protobuf::RepeatedPtrField *xs, + const std::string ¶m, + const std::vector &args) { + auto it = std::find_if( + xs->begin(), xs->end(), [&](const framework_proto::OpDesc_Var &it) { + return it.parameter() == param; + }); + if (it == xs->end()) { + auto *new_arg = xs->Add(); + new_arg->set_parameter(param); + for (const auto &arg : args) { + *new_arg->mutable_arguments()->Add() = arg; + } + } else { + it->mutable_arguments()->Clear(); + for (const auto &arg : args) { + *it->mutable_arguments()->Add() = arg; + } + } + } + + std::vector GetArgumentNames( + const google::protobuf::RepeatedPtrField &xs) + const { + std::vector res; + std::transform( + xs.begin(), + xs.end(), + std::back_inserter(res), + [](const framework_proto::OpDesc_Var &x) { return x.parameter(); }); + return res; + } + + private: + framework_proto::OpDesc *desc_; +}; + +template <> +void OpDesc::SetAttr(const std::string &name, + const std::string &v); + +template <> +void OpDesc::SetAttr>(const std::string &name, + const std::vector &v); + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/program_desc.cc b/paddle/infrt/paddle/pb/program_desc.cc new file mode 100644 index 00000000000000..ed8a7e36e0129c --- /dev/null +++ b/paddle/infrt/paddle/pb/program_desc.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/paddle/pb/program_desc.h" + +#include +#include + +namespace infrt::paddle::pb { + +template <> +framework_proto::BlockDesc* ProgramDesc::GetBlock( + int32_t idx) { + CHECK_LT(idx, static_cast(BlocksSize())) << "idx >= blocks.size()"; + return desc_->mutable_blocks(idx); +} + +template <> +framework_proto::BlockDesc* +ProgramDesc::AddBlock() { + return desc_->add_blocks(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/program_desc.h b/paddle/infrt/paddle/pb/program_desc.h new file mode 100644 index 00000000000000..4adad650c974df --- /dev/null +++ b/paddle/infrt/paddle/pb/program_desc.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { +namespace framework_proto = ::paddle::framework::proto; + +class ProgramDesc : public cpp::ProgramDescAPI { + public: + ProgramDesc() = delete; + + explicit ProgramDesc(framework_proto::ProgramDesc *desc) : desc_(desc) { + CHECK(desc_); + } + + framework_proto::ProgramDesc *Proto() { return desc_; } + + const framework_proto::ProgramDesc &ReadonlyProto() const { return *desc_; } + + size_t BlocksSize() const override { return desc_->blocks_size(); } + + void ClearBlocks() override { desc_->clear_blocks(); } + + template + T *GetBlock(int32_t idx); + + template + T *AddBlock(); + + bool HasVersion() const override { return desc_->has_version(); } + + int64_t Version() const override { return desc_->version().version(); } + + void SetVersion(int64_t version) override { + desc_->mutable_version()->set_version(version); + } + + private: + framework_proto::ProgramDesc *desc_; // not_own +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/var_desc.cc b/paddle/infrt/paddle/pb/var_desc.cc new file mode 100644 index 00000000000000..cf80df4f1b845b --- /dev/null +++ b/paddle/infrt/paddle/pb/var_desc.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
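Together with model_parser's LoadProgram (added earlier in this patch), the pb wrappers let a deserialized __model__ be walked block by block. A minimal sketch (not applied by this patch); it assumes LoadProgram returns std::unique_ptr<framework_proto::ProgramDesc>, matching its definition in model_parser.cc:

#include <iostream>
#include <string>

#include "paddle/infrt/paddle/model_parser.h"
#include "paddle/infrt/paddle/pb/block_desc.h"
#include "paddle/infrt/paddle/pb/op_desc.h"
#include "paddle/infrt/paddle/pb/program_desc.h"

namespace fp = ::paddle::framework::proto;
namespace pb = infrt::paddle::pb;

void DumpOps(const std::string& model_path) {
  auto program_proto = infrt::paddle::LoadProgram(model_path);  // parses __model__
  pb::ProgramDesc program(program_proto.get());
  for (size_t b = 0; b < program.BlocksSize(); ++b) {
    pb::BlockDesc block(program.GetBlock<fp::BlockDesc>(static_cast<int>(b)));
    for (size_t i = 0; i < block.OpsSize(); ++i) {
      pb::OpDesc op(block.GetOp<fp::OpDesc>(static_cast<int>(i)));
      std::cout << op.Repr() << '\n';   // Repr() comes from cpp::OpDescAPI
    }
  }
}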
+ +#include "paddle/infrt/paddle/pb/var_desc.h" + +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { + +cpp::VarDescAPI::Type VarDesc::GetType() const { + auto type = desc_->type().type(); + +#define GET_TYPE_CASE_ITEM(type__) \ + case framework_proto::VarType::type__: \ + return cpp::VarDescAPI::Type::type__; + + switch (type) { + GET_TYPE_CASE_ITEM(LOD_TENSOR); + GET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + GET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + GET_TYPE_CASE_ITEM(SELECTED_ROWS); + GET_TYPE_CASE_ITEM(FEED_MINIBATCH); + GET_TYPE_CASE_ITEM(FETCH_LIST); + GET_TYPE_CASE_ITEM(STEP_SCOPES); + GET_TYPE_CASE_ITEM(PLACE_LIST); + GET_TYPE_CASE_ITEM(READER); + default: + LOG(FATAL) << "Unknown var type"; + return VarDescAPI::Type(); + } +#undef GET_TYPE_CASE_ITEM +} + +void VarDesc::SetType(VarDescAPI::Type type) { +#define SET_TYPE_CASE_ITEM(type__) \ + case VarDescAPI::Type::type__: \ + desc_->mutable_type()->set_type(framework_proto::VarType::type__); \ + break; + + switch (type) { + SET_TYPE_CASE_ITEM(LOD_TENSOR); + SET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + SET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + SET_TYPE_CASE_ITEM(SELECTED_ROWS); + SET_TYPE_CASE_ITEM(FEED_MINIBATCH); + SET_TYPE_CASE_ITEM(FETCH_LIST); + SET_TYPE_CASE_ITEM(STEP_SCOPES); + SET_TYPE_CASE_ITEM(PLACE_LIST); + SET_TYPE_CASE_ITEM(READER); + default: + LOG(FATAL) << "Unknown var type"; + } +#undef SET_TYPE_CASE_ITEM +} + +void VarDesc::SetShape(const std::vector &dims) { + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); +} + +void VarDesc::SetTensorDescNum(size_t num) { + switch (desc_->type().type()) { + case framework_proto::VarType::READER: { + auto *lod_tensors_ptr = + desc_->mutable_type()->mutable_reader()->mutable_lod_tensor(); + lod_tensors_ptr->Clear(); + for (size_t i = 0; i < num; ++i) { + lod_tensors_ptr->Add(); + } + return; + } break; + default: + LOG(FATAL) << "Setting 'sub_tensor_number' is not supported by the type " + "of var %s." + << this->Name(); + } +} + +size_t VarDesc::GetTensorDescNum() const { + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + return desc_->type().reader().lod_tensor_size(); + break; + default: + LOG(FATAL) << "Getting 'sub_tensor_number' is not supported by the type " + "of var %s." + << this->Name(); + } + return 0; +} + +void VarDesc::SetShapes( + const std::vector> &multiple_dims) { + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). 
The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } + std::vector tensors = + mutable_tensor_descs(); + for (size_t i = 0; i < multiple_dims.size(); ++i) { + VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); + } +} + +std::vector VarDesc::GetShape() const { + return RepeatedToVector(tensor_desc().dims()); +} + +std::vector> VarDesc::GetShapes() const { + std::vector descs = tensor_descs(); + std::vector> res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(RepeatedToVector(tensor_desc.dims())); + } + return res; +} + +void VarDesc::SetDataType(VarDescAPI::VarDataType data_type) { +#define SET_DATA_TYPE_CASE_ITEM(type__) \ + case cpp::VarDescAPI::Type::type__: \ + mutable_tensor_desc()->set_data_type(framework_proto::VarType::type__); \ + break; + + switch (data_type) { + SET_DATA_TYPE_CASE_ITEM(BOOL); + SET_DATA_TYPE_CASE_ITEM(SIZE_T); + SET_DATA_TYPE_CASE_ITEM(UINT8); + SET_DATA_TYPE_CASE_ITEM(INT8); + SET_DATA_TYPE_CASE_ITEM(INT16); + SET_DATA_TYPE_CASE_ITEM(INT32); + SET_DATA_TYPE_CASE_ITEM(INT64); + SET_DATA_TYPE_CASE_ITEM(FP16); + SET_DATA_TYPE_CASE_ITEM(FP32); + SET_DATA_TYPE_CASE_ITEM(FP64); + default: + LOG(FATAL) << "Unknown var type: " << static_cast(data_type); + } +#undef SET_DATA_TYPE_CASE_ITEM +} + +void VarDesc::SetDataTypes( + const std::vector &multiple_data_type) { + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } + std::vector tensor_descs = + mutable_tensor_descs(); + for (size_t i = 0; i < multiple_data_type.size(); ++i) { + tensor_descs[i]->set_data_type(multiple_data_type[i]); + } +} + +// proto::VarType::Type VarDesc::GetDataType() const { +// return tensor_desc().data_type(); +// } +cpp::VarDescAPI::VarDataType VarDesc::GetDataType() const { + CHECK(desc_->has_type()) << "The var's type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + if (desc_->type().type() != framework_proto::VarType::LOD_TENSOR) { + return VarDescAPI::Type(); + } + auto type = tensor_desc().data_type(); +#define GET_DATA_TYPE_CASE_ITEM(type__) \ + case framework_proto::VarType::Type::VarType_Type_##type__: \ + return VarDescAPI::Type::type__ + + switch (type) { + GET_DATA_TYPE_CASE_ITEM(BOOL); + GET_DATA_TYPE_CASE_ITEM(SIZE_T); + GET_DATA_TYPE_CASE_ITEM(UINT8); + GET_DATA_TYPE_CASE_ITEM(INT8); + GET_DATA_TYPE_CASE_ITEM(INT16); + GET_DATA_TYPE_CASE_ITEM(INT32); + GET_DATA_TYPE_CASE_ITEM(INT64); + GET_DATA_TYPE_CASE_ITEM(FP16); + GET_DATA_TYPE_CASE_ITEM(FP32); + GET_DATA_TYPE_CASE_ITEM(FP64); + default: + LOG(FATAL) << "Unknown var type: " << static_cast(type); + return VarDescAPI::Type(); + } +#undef GET_DATA_TYPE_CASE_ITEM +} + +std::vector VarDesc::GetDataTypes() const { + std::vector descs = tensor_descs(); + std::vector res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(tensor_desc.data_type()); + } + return res; +} + +void VarDesc::SetLoDLevel(int32_t lod_level) { + switch (desc_->type().type()) { + case framework_proto::VarType::LOD_TENSOR: + desc_->mutable_type()->mutable_lod_tensor()->set_lod_level(lod_level); + break; + case framework_proto::VarType::LOD_TENSOR_ARRAY: + desc_->mutable_type()->mutable_tensor_array()->set_lod_level(lod_level); + break; 
+ default: + LOG(FATAL) + << "Setting 'lod_level' is not supported by the type of var %s." + << this->Name(); + } +} + +void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } + switch (desc_->type().type()) { + case framework_proto::VarType::READER: { + size_t i = 0; + for (auto &lod_tensor : + *desc_->mutable_type()->mutable_reader()->mutable_lod_tensor()) { + lod_tensor.set_lod_level(multiple_lod_level[i++]); + } + } break; + default: + LOG(FATAL) + << "Setting 'lod_levels' is not supported by the type of var %s." + << this->Name(); + } +} + +int32_t VarDesc::GetLoDLevel() const { + switch (desc_->type().type()) { + case framework_proto::VarType::LOD_TENSOR: + return desc_->type().lod_tensor().lod_level(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->type().tensor_array().lod_level(); + default: + LOG(FATAL) + << "Getting 'lod_level' is not supported by the type of var %s." + << this->Name(); + } + return 0; +} + +std::vector VarDesc::GetLoDLevels() const { + std::vector res; + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + res.reserve(desc_->type().reader().lod_tensor_size()); + for (auto &lod_tensor : desc_->type().reader().lod_tensor()) { + res.push_back(lod_tensor.lod_level()); + } + return res; + break; + default: + LOG(FATAL) + << "Getting 'lod_levels' is not supported by the type of var %s." + << this->Name(); + } + return std::vector(); +} + +const framework_proto::VarType::TensorDesc &VarDesc::tensor_desc() const { + CHECK(desc_->has_type()) << "The var's type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + switch (desc_->type().type()) { + case framework_proto::VarType::SELECTED_ROWS: + return desc_->type().selected_rows(); + case framework_proto::VarType::LOD_TENSOR: + return desc_->type().lod_tensor().tensor(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->type().tensor_array().tensor(); + default: + LOG(FATAL) + << "Getting 'tensor_desc' is not supported by the type of var %s." + << this->Name(); + } + return framework_proto::VarDesc().type().lod_tensor().tensor(); +} + +std::vector VarDesc::tensor_descs() + const { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + for (const auto &lod_tensor : desc_->type().reader().lod_tensor()) { + res.push_back(lod_tensor.tensor()); + } + return res; + default: + LOG(FATAL) + << "Getting 'tensor_descs' is not supported by the type of var " + "%s." 
+ << this->Name(); + } + return std::vector(); +} + +framework_proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + switch (desc_->type().type()) { + case framework_proto::VarType::SELECTED_ROWS: + return desc_->mutable_type()->mutable_selected_rows(); + case framework_proto::VarType::LOD_TENSOR: + return desc_->mutable_type()->mutable_lod_tensor()->mutable_tensor(); + case framework_proto::VarType::LOD_TENSOR_ARRAY: + return desc_->mutable_type()->mutable_tensor_array()->mutable_tensor(); + default: + LOG(FATAL) << "Getting 'mutable_tensor_desc' is not supported by the " + "type of var " + "%s." + << this->Name(); + } + return nullptr; +} + +std::vector +VarDesc::mutable_tensor_descs() { + CHECK(desc_->has_type()) << "The var type hasn't been set."; + CHECK(desc_->type().has_type()) << "The var type hasn't been set."; + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_->type().type()) { + case framework_proto::VarType::READER: + for (auto &lod_tensor : + *desc_->mutable_type()->mutable_reader()->mutable_lod_tensor()) { + res.push_back(lod_tensor.mutable_tensor()); + } + return res; + default: + LOG(FATAL) + << "Getting 'tensor_descs' is not supported by the type of var " + "%s." + << this->Name(); + } + return std::vector(); +} + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/pb/var_desc.h b/paddle/infrt/paddle/pb/var_desc.h new file mode 100644 index 00000000000000..4cff5fdee0375d --- /dev/null +++ b/paddle/infrt/paddle/pb/var_desc.h @@ -0,0 +1,124 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "paddle/infrt/paddle/cpp/desc_api.h" +#include "paddle/infrt/paddle/framework.pb.h" + +namespace infrt::paddle::pb { +namespace framework_proto = ::paddle::framework::proto; + +// convert between std::vector and protobuf repeated. +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy( + repeated_field.begin(), repeated_field.end(), std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (const auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + +// Specialize vector. 
+template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (auto elem : vec) { + *repeated_field->Add() = elem; + } +} + +class VarDesc : public cpp::VarDescAPI { + public: + VarDesc() = delete; + + explicit VarDesc(framework_proto::VarDesc *desc) : desc_(desc) { + CHECK(desc_); + } + + ::paddle::framework::proto::VarDesc *Proto() { return desc_; } + const framework_proto::VarDesc &ReadonlyProto() const { return *desc_; } + + std::string Name() const override { return desc_->name(); } + + void SetName(std::string name) override { desc_->set_name(name); } + + void SetTensorDescNum(size_t num); + + size_t GetTensorDescNum() const; + + void SetShape(const std::vector &dims); + + void SetShapes(const std::vector> &multiple_dims); + + std::vector GetShape() const; + + std::vector> GetShapes() const; + + void SetDataType(VarDescAPI::VarDataType data_type); + + void SetDataTypes( + const std::vector &multiple_data_type); + + VarDescAPI::VarDataType GetDataType() const; + + std::vector GetDataTypes() const; + + void SetLoDLevel(int32_t lod_level); + + void SetLoDLevels(const std::vector &multiple_lod_level); + + int32_t GetLoDLevel() const; + + std::vector GetLoDLevels() const; + + VarDescAPI::Type GetType() const override; + + void SetType(VarDescAPI::Type type) override; + + bool Persistable() const override { return desc_->persistable(); } + + void SetPersistable(bool persistable) override { + desc_->set_persistable(persistable); + } + + private: + const framework_proto::VarType::TensorDesc &tensor_desc() const; + std::vector tensor_descs() const; + framework_proto::VarType::TensorDesc *mutable_tensor_desc(); + std::vector mutable_tensor_descs(); + + framework_proto::VarDesc *desc_; +}; + +} // namespace infrt::paddle::pb diff --git a/paddle/infrt/paddle/scope.cc b/paddle/infrt/paddle/scope.cc new file mode 100644 index 00000000000000..d7bab9f749591d --- /dev/null +++ b/paddle/infrt/paddle/scope.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
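pb::VarDesc (declared above) resolves the tensor descriptor through the variable's type field, so SetType() must be called before SetShape() or SetDataType(); otherwise mutable_tensor_desc() falls into its LOG(FATAL) branch. A minimal sketch (not applied by this patch); the variable name is illustrative and the dims follow the proto's repeated int64 dims:

#include "paddle/infrt/paddle/framework.pb.h"
#include "paddle/infrt/paddle/pb/var_desc.h"

void DescribeWeight(::paddle::framework::proto::VarDesc* raw_var) {
  using infrt::paddle::cpp::VarDescAPI;
  infrt::paddle::pb::VarDesc var(raw_var);
  var.SetName("fc_0.w_0");                    // illustrative parameter name
  var.SetType(VarDescAPI::Type::LOD_TENSOR);  // must precede shape/dtype
  var.SetDataType(VarDescAPI::Type::FP32);    // VarDataType is an alias of Type
  var.SetShape({784, 10});
  var.SetPersistable(true);
}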
+
+#include "paddle/infrt/paddle/scope.h"
+
+#include "paddle/infrt/common/common.h"
+
+namespace infrt {
+namespace paddle {
+
+_Variable* Scope::FindVar(const std::string& name) const {
+  auto it = data_.find(name);
+  if (it != data_.end()) return it->second.get();
+  return nullptr;
+}
+
+Tensor Scope::GetTensor(const std::string& name) const {
+  CheckVarNameValid(name);
+  auto* var = FindVar(name);
+  CHECK(var) << "No variable called [" << name << "] found";
+  return var->get<Tensor>();
+}
+
+std::vector<std::string> Scope::var_names() const {
+  std::vector<std::string> names;
+  for (auto& item : data_) {
+    names.push_back(item.first);
+  }
+  return names;
+}
+
+}  // namespace paddle
+}  // namespace infrt
diff --git a/paddle/infrt/paddle/scope.h b/paddle/infrt/paddle/scope.h
new file mode 100644
index 00000000000000..4ebf846374c6fe
--- /dev/null
+++ b/paddle/infrt/paddle/scope.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/infrt/common/macros.h"
+#include "paddle/infrt/paddle/tensor.h"
+#include "paddle/infrt/support/variant.h"
+
+namespace infrt {
+namespace paddle {
+
+using _Variable = Variant<Tensor>;
+
+struct _Tensor_;
+
+class Scope {
+ public:
+  static std::shared_ptr<Scope> Create() { return std::make_shared<Scope>(); }
+
+  //! Get or create a variable.
+  template <typename T>
+  _Variable* Var(const std::string& name);
+
+  //! Find a variable, get null if not exists.
+  _Variable* FindVar(const std::string& name) const;
+
+  Tensor GetTensor(const std::string& name) const;
+
+  //! Get variable names.
+  std::vector<std::string> var_names() const;
+
+  Scope() = default;
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<_Variable>> data_;
+
+  INFRT_DISALLOW_COPY_AND_ASSIGN(Scope);
+};
+
+template <typename T>
+_Variable* Scope::Var(const std::string& name) {
+  VLOG(4) << "Scope insert Var [" << name << "]";
+  _Variable* x = FindVar(name);
+  if (x) return x;
+  auto* data = new _Variable(T());
+  data_[name].reset(data);
+  return data;
+}
+
+}  // namespace paddle
+}  // namespace infrt
diff --git a/paddle/infrt/paddle/tensor.cc b/paddle/infrt/paddle/tensor.cc
new file mode 100644
index 00000000000000..072701ee9077dd
--- /dev/null
+++ b/paddle/infrt/paddle/tensor.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
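+
+// The Tensor types are defined entirely in tensor.h; this file only provides
+// a translation unit for them. A minimal usage sketch, assuming a host target
+// (the names "t" and "buf" below are illustrative only):
+//
+//   Tensor t;
+//   t->Resize(Shape({2, 3}));
+//   float* buf = t->mutable_data<float>(common::DefaultHostTarget());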
+
+#include "paddle/infrt/paddle/tensor.h"
+
+namespace infrt {
+namespace paddle {}  // namespace paddle
+}  // namespace infrt
diff --git a/paddle/infrt/paddle/tensor.h b/paddle/infrt/paddle/tensor.h
new file mode 100644
index 00000000000000..5c4458bb62d736
--- /dev/null
+++ b/paddle/infrt/paddle/tensor.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "paddle/infrt/common/buffer.h"
+#include "paddle/infrt/common/common.h"
+#include "paddle/infrt/common/object.h"
+
+namespace infrt {
+namespace paddle {
+using common::Target;
+
+struct Shape {
+  using dim_t = int;
+
+  Shape() = default;
+  explicit Shape(const std::vector<dim_t>& data) : data_(data) {}
+
+  void SetData(const std::vector<dim_t>& data) { data_ = data; }
+
+  const std::vector<dim_t>& data() const INFRT_RESULT_SHOULD_USE {
+    return data_;
+  }
+  std::vector<dim_t>& data() INFRT_RESULT_SHOULD_USE { return data_; }
+  size_t size() const INFRT_RESULT_SHOULD_USE { return data_.size(); }
+  uint32_t numel() const INFRT_RESULT_SHOULD_USE {
+    return std::accumulate(
+        data_.begin(), data_.end(), 1, [](dim_t a, dim_t b) { return a * b; });
+  }
+
+ private:
+  std::vector<dim_t> data_;
+};
+
+class _Tensor_ : public common::Object {
+ public:
+  _Tensor_() : buffer_(std::make_shared<Buffer>()) {}
+
+  Shape& shape() { return shape_; }
+
+  void Resize(const Shape& shape) {
+    shape_ = shape;
+    buffer_->data()->resize(
+        reinterpret_cast<const infrt_dimension_t*>(shape.data().data()),
+        shape.size());
+  }
+
+  template <typename T>
+  inline T* mutable_data(const Target& target) {
+    set_type(type_of<T>());
+    if (target == common::DefaultHostTarget()) {
+      int alignment = type_of<T>().ElementOf().bits();
+      buffer_->ResizeLazy(alignment, shape_.numel() * sizeof(T), target);
+    } else {
+      buffer_->ResizeLazy(shape_.numel() * sizeof(T), target);
+    }
+    return reinterpret_cast<T*>(buffer_->data()->memory);
+  }
+
+  template <typename T>
+  const T* data() const {
+    return reinterpret_cast<const T*>(buffer_->data()->memory);
+  }
+
+  const Type& type() { return type_; }
+
+  void set_type(Type type) { type_ = type; }
+  const Type& type() const { return type_; }
+
+  infrt_buffer_t* buffer() { return buffer_->data(); }
+
+  const char* type_info() const override { return __type_info__; }
+
+ private:
+  common::Type type_;
+  // A shared ptr to make it easier to share buffer between tensors.
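+  // Copies of a Tensor therefore alias the same storage instead of cloning it.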
+  std::shared_ptr<Buffer> buffer_;
+  Shape shape_;
+
+  static constexpr const char* __type_info__ = "_frontend_tensor_";
+};
+
+class Tensor : public Shared<_Tensor_> {
+ public:
+  Tensor() : Shared(new _Tensor_) {}
+  explicit Tensor(_Tensor_* x) : Shared(x) {}
+};
+
+}  // namespace paddle
+}  // namespace infrt
diff --git a/paddle/infrt/support/CMakeLists.txt b/paddle/infrt/support/CMakeLists.txt
new file mode 100644
index 00000000000000..9bcce6cab368d0
--- /dev/null
+++ b/paddle/infrt/support/CMakeLists.txt
@@ -0,0 +1 @@
+core_gather_headers()
diff --git a/paddle/infrt/support/type_traits.h b/paddle/infrt/support/type_traits.h
new file mode 100644
index 00000000000000..341dabb7c1c4a6
--- /dev/null
+++ b/paddle/infrt/support/type_traits.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file defines type traits related utilities.
+
+#pragma once
+
+#include <cstddef>
+#include <tuple>
+#include <type_traits>
+
+#include "llvm/ADT/STLExtras.h"
+
+namespace infrt {
+
+// Utility template for tag dispatching.
+template <typename T>
+struct TypeTag {};
+
+// This is the equivalent of std::void_t in C++17.
+template <typename... Ts>
+struct make_void {
+  typedef void type;
+};
+template <typename... Ts>
+using void_t = typename make_void<Ts...>::type;
+
+// The same as std::disjunction in C++17.
+template <class...>
+struct disjunction : std::false_type {};
+template <class B1>
+struct disjunction<B1> : B1 {};
+template <class B1, class... Bn>
+struct disjunction<B1, Bn...>
+    : std::conditional_t<bool(B1::value), B1, disjunction<Bn...>> {};
+
+// Check whether T may be a base class.
+template <typename T>
+using MaybeBase =
+    llvm::conjunction<std::is_class<T>, llvm::negation<std::is_final<T>>>;
+
+// Find the index of a type in a tuple.
+//
+// Example:
+//   using Tuple = std::tuple<int, float, double>;
+//   static_assert(TupleIndexOf<int, Tuple>::value == 0);
+//   static_assert(TupleIndexOf<double, Tuple>::value == 2);
+template <class T, class Tuple>
+struct TupleIndexOf;
+
+template <class T, class... Types>
+struct TupleIndexOf<T, std::tuple<T, Types...>>
+    : std::integral_constant<std::size_t, 0> {};
+
+template <class T, class U, class... Types>
+struct TupleIndexOf<T, std::tuple<U, Types...>>
+    : std::integral_constant<std::size_t,
+                             1 + TupleIndexOf<T, std::tuple<Types...>>::value> {
+};
+
+template <typename T, typename Tuple>
+struct TupleHasType;
+
+template <typename T, typename... Us>
+struct TupleHasType<T, std::tuple<Us...>>
+    : disjunction<std::is_same<T, Us>...> {};
+
+// The detector pattern in C++ that can be used for checking whether a type has
+// a specific property, e.g. whether an internal type is present or whether a
+// particular operation is valid.
+//
+// Sample usage:
+//
+// struct Foo {
+//   using difference_type = int;
+//   int get();
+// };
+// struct Bar {};
+//
+// // Check whether a type T has an internal difference_type.
+// template <typename T>
+// using diff_t = typename T::difference_type;
+//
+// static_assert(is_detected_v<diff_t, Foo>, "Foo has difference_type");
+// static_assert(!is_detected_v<diff_t, Bar>, "Bar has no difference_type");
+//
+// // Check whether a type T has a get() member function.
+// template <typename T>
+// using has_get_t = decltype(std::declval<T>().get());
+//
+// static_assert(is_detected_v<has_get_t, Foo>, "Foo has get()");
+// static_assert(!is_detected_v<has_get_t, Bar>, "Bar has no get()");
+//
+// See https://en.cppreference.com/w/cpp/experimental/is_detected for details.
+
+namespace internal {
+
+// nonesuch is a class type used to indicate detection failure.
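+// Its destructor and copy operations are deleted, so it is never used as a
+// value, only as a type-level marker.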
+struct nonesuch {
+  ~nonesuch() = delete;
+  nonesuch(nonesuch const&) = delete;
+  void operator=(nonesuch const&) = delete;
+};
+
+template <class Default,
+          class AlwaysVoid,
+          template <class...> class Op,
+          class... Args>
+struct detector : std::false_type {
+  using value_t = std::false_type;
+  using type = Default;
+};
+
+template <class Default, template <class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...> {
+  using value_t = std::true_type;
+  using type = Op<Args...>;
+};
+
+}  // namespace internal
+
+template