From 19b02d95e099ae066a1f58161501ed2d5140988a Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Mon, 25 Oct 2021 19:46:15 +0800 Subject: [PATCH 01/14] [NPU] modifications for model ernie-1.0 (#36642) * [NPU] modifications for model ernie-1.0 * rollback 503003 and change cast to dtype --- paddle/fluid/operators/cumsum_op_npu.cc | 45 +- .../elementwise/elementwise_sub_op_npu.cc | 6 + .../fluid/operators/lookup_table_v2_op_npu.cc | 55 +- paddle/fluid/operators/matmul_op_npu.cc | 528 ++++++++++++++---- .../tests/unittests/npu/test_cumsum_op_npu.py | 40 ++ .../npu/test_elementwise_sub_op_npu.py | 5 + .../npu/test_lookup_table_v2_op_npu.py | 40 +- .../tests/unittests/npu/test_matmul_op_npu.py | 329 +++++++++++ 8 files changed, 908 insertions(+), 140 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index e8cf1a46db3cc..486e85b0f0dfc 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -21,6 +21,38 @@ namespace operators { using Tensor = framework::Tensor; +static void CumsumImp(const Tensor& input, Tensor* output, + const framework::NPUAttributeMap& attr_input, + const framework::ExecutionContext& ctx) { + auto stream = + ctx.template device_context() + .stream(); + if (input.type() == framework::proto::VarType::INT64) { + Tensor tmp_input; + tmp_input.mutable_data(input.dims(), ctx.GetPlace()); + auto dst_acl_dtype = ConvertToNpuDtype(tmp_input.type()); + const auto& cast_runner_1 = + NpuOpRunner("Cast", {input}, {tmp_input}, + {{"dst_type", static_cast(dst_acl_dtype)}}); + cast_runner_1.Run(stream); + + Tensor tmp_output; + tmp_output.mutable_data(output->dims(), ctx.GetPlace()); + const auto& runner = + NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); + runner.Run(stream); + + dst_acl_dtype = ConvertToNpuDtype(output->type()); + const auto& cast_runner_2 = + NpuOpRunner("Cast", {tmp_output}, {*output}, + {{"dst_type", static_cast(dst_acl_dtype)}}); + cast_runner_2.Run(stream); + } else { + const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input); + runner.Run(stream); + } +} + template class CumSumNPUKernel : public framework::OpKernel { public: @@ -36,10 +68,6 @@ class CumSumNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr_input = { {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; - auto stream = - ctx.template device_context() - .stream(); - bool flatten = ctx.Attr("flatten"); if (flatten) { PADDLE_ENFORCE_EQ( @@ -53,11 +81,9 @@ class CumSumNPUKernel : public framework::OpKernel { new_x.Resize(framework::make_ddim({x->numel()})); - const auto& runner = NpuOpRunner("CumsumD", {new_x}, {*out}, attr_input); - runner.Run(stream); + CumsumImp(new_x, out, attr_input, ctx); } else { - const auto& runner = NpuOpRunner("CumsumD", {*x}, {*out}, attr_input); - runner.Run(stream); + CumsumImp(*x, out, attr_input, ctx); } } }; @@ -69,5 +95,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( cumsum, ops::CumSumNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::CumSumNPUKernel, +#endif ops::CumSumNPUKernel, ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 48b98dafc7bb5..4cc4228b16429 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ 
b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
@@ -167,10 +167,16 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel,
+#ifdef PADDLE_WITH_ASCEND_INT64
+                       ops::ElementwiseSubNPUKernel,
+#endif
                        ops::ElementwiseSubNPUKernel,
                        ops::ElementwiseSubNPUKernel);
 REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
                        ops::ElementwiseSubGradNPUKernel,
+#ifdef PADDLE_WITH_ASCEND_INT64
+                       ops::ElementwiseSubGradNPUKernel,
+#endif
                        ops::ElementwiseSubGradNPUKernel,
                        ops::ElementwiseSubGradNPUKernel);
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index b75ae8a65881a..3cb91c712335d 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -21,6 +21,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+constexpr int64_t kNoPadding = -1;
+
 template
 class LookupTableV2NPUKernel : public framework::OpKernel {
  public:
@@ -35,16 +38,52 @@ class LookupTableV2NPUKernel : public framework::OpKernel {
  public:
        platform::errors::InvalidArgument("NPU only accepts LoDTensor"));
     output_t->mutable_data(ctx.GetPlace());
 
-    NpuOpRunner runner;
-    runner.SetType("GatherV2")
-        .AddInput(*table_t)
-        .AddInput(*ids_t)
-        .AddInput(std::vector{0})
+    int64_t padding_idx = ctx.Attr("padding_idx");
+    if (padding_idx == kNoPadding) {
+      NpuOpRunner runner;
+      runner.SetType("GatherV2")
+          .AddInput(*table_t)
+          .AddInput(*ids_t)
+          .AddInput(std::vector{0})
+#if (CANN_VERSION_CODE >= 503003)
+          .AddAttrs({{"batch_dims", 0}})
+#endif
+          .AddOutput(*output_t);
+      runner.Run();
+    } else {
+      Tensor tmp_table_t(table_t->type());
+      tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace());
+
+      Tensor index;
+      index.mutable_data({1, 1}, ctx.GetPlace());
+      FillNpuTensorWithConstant(&index,
+                                static_cast(padding_idx));
+
+      auto update_dim = framework::make_ddim({1, table_t->dims()[1]});
+      Tensor update;
+      update.mutable_data(update_dim, ctx.GetPlace());
+      FillNpuTensorWithConstant(&update, static_cast(0));
+      update.Resize(update_dim);
+
+      NpuOpRunner update_runner;
+      update_runner.SetType("TensorScatterUpdate")
+          .AddInput(*table_t)
+          .AddInput(index)
+          .AddInput(update)
+          .AddOutput(tmp_table_t);
+      update_runner.Run();
+
+      NpuOpRunner runner;
+      runner.SetType("GatherV2")
+          .AddInput(tmp_table_t)
+          .AddInput(*ids_t)
+          .AddInput(std::vector{0})
 #if (CANN_VERSION_CODE >= 503003)
-        .AddAttrs({{"batch_dims", 0}})
+          .AddAttrs({{"batch_dims", 0}})
 #endif
-        .AddOutput(*output_t);
-    runner.Run();
+          .AddOutput(*output_t);
+      runner.Run();
+    }
   }
 };
diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc
index d5606177a5592..df811abc1de98 100644
--- a/paddle/fluid/operators/matmul_op_npu.cc
+++ b/paddle/fluid/operators/matmul_op_npu.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include
-#include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
@@ -21,40 +19,253 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void Mul(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, const Tensor& Y, + Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); + runner_dx.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); + runner_dx.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void Dot(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, const Tensor& Y, + Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = + NpuOpRunner("MatMul", {X, Y}, {Out_temp}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = + NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = + NpuOpRunner("BatchMatMul", {X, Y}, {Out_temp}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - 
diff]) {
+      axes.push_back(i);
+    }
+  }
+  out->mutable_data(ctx.GetPlace());
+  const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out},
+                                   {{"axes", axes}, {"keep_dims", false}});
+  runner.Run(stream);
+}
+
 template
 class MatMulNPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input("X");
-    auto* y = ctx.Input("Y");
-    auto* out = ctx.Output("Out");
+    auto* X = ctx.Input("X");
+    auto* Y = ctx.Input("Y");
+    auto* Out = ctx.Output("Out");
     bool transpose_x = ctx.Attr("transpose_X");
     bool transpose_y = ctx.Attr("transpose_Y");
+    float alpha = static_cast(ctx.Attr("alpha"));
+
+    std::vector x_dims = framework::vectorize(X->dims());
+    std::vector y_dims = framework::vectorize(Y->dims());
+    std::vector out_dims = framework::vectorize(Out->dims());
+    int x_ndim = x_dims.size();
+    int y_ndim = y_dims.size();
+    int out_ndim = out_dims.size();
 
-    if (x->dims().size() == 2) {
-      out->mutable_data(ctx.GetPlace());
+    auto stream = ctx.template device_context().stream();
 
-      const auto& runner = NpuOpRunner(
-          "MatMul", {*x, *y}, {*out},
-          {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}});
+    // Case 1: [K] x [K] = [1]
+    if (x_ndim == 1 && y_ndim == 1) {
+      PADDLE_ENFORCE_EQ(
+          X->numel(), Y->numel(),
+          platform::errors::InvalidArgument(
+              "X's numel must be equal to Y's numel when "
+              "X/Y's dims = 1. But received X has [%d] elements, "
+              "received Y has [%d] elements.",
+              X->numel(), Y->numel()));
+      Out->Resize({1});
+      Dot(ctx, stream, *X, *Y, Out, alpha);
+      return;
+    }
 
-      auto stream =
-          ctx.template device_context()
-              .stream();
-      runner.Run(stream);
+    // Resize dim 1 to 2
+    Tensor x_temp, y_temp;
+    x_temp.ShareDataWith(*X);
+    y_temp.ShareDataWith(*Y);
+    if (x_ndim == 1) {
+      x_dims.insert(x_dims.begin(), 1);
+      out_dims.insert(out_dims.end() - 1, 1);
+      x_temp.Resize(framework::make_ddim(x_dims));
+      x_ndim = 2;
+      out_ndim += 1;
+    }
+    if (y_ndim == 1) {
+      y_dims.push_back(1);
+      out_dims.push_back(1);
+      y_temp.Resize(framework::make_ddim(y_dims));
+      y_ndim = 2;
+      out_ndim += 1;
+    }
+
+    const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
+    if (transpose_y) {
+      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K,
+                        platform::errors::InvalidArgument(
+                            "Input(Y) has an incorrect dim. "
+                            "Y's dims[%d] must be equal to %d, "
+                            "but received Y's dims[%d] is %d.",
+                            y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1]));
+    } else {
+      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K,
+                        platform::errors::InvalidArgument(
+                            "Input(Y) has an incorrect dim. "
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, + alpha); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (transpose_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, + alpha); + return; + } - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, transpose_x, + transpose_y, alpha); } }; @@ -62,109 +273,200 @@ template class MatMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + 
float alpha = static_cast(ctx.Attr("alpha")); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); + + if (dX) { + Mul(ctx, stream, dout_temp, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, stream, dout_temp, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } - runner_dy.Run(stream); + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, transpose_y, true, + alpha); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, + alpha); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, transpose_x, + alpha); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !transpose_x, false, + alpha); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = transpose_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (transpose_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, + alpha); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (transpose_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false, alpha); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false, alpha); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } - runner_dx.Run(stream); + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (transpose_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, transpose_y, true, + alpha); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, + !transpose_y, alpha); + } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (transpose_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, transpose_y, + true, alpha); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !transpose_y, alpha); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() == 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", 
{x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (transpose_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, transpose_x, + alpha); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !transpose_x, + false, alpha); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (transpose_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + transpose_x, alpha); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, + !transpose_x, false, alpha); } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } diff --git a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py index 5a3f98524bbd0..9289da6641e7d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cumsum_op_npu.py @@ -249,5 +249,45 @@ def init_testcase(self): self.outputs = {'Out': self.inputs['X'].cumsum()} +#----------------Cumsum Int64---------------- +class TestNPUCumSumOpInt64(TestNPUCumSumOp1): + def init_testcase(self): + self.attrs = {'axis': -1, 'reverse': True} + self.inputs = { + 'X': np.random.randint( + 1, 10000, size=(5, 6, 10)).astype(self.dtype) + } + self.outputs = { + 'Out': np.flip( + np.flip( + self.inputs['X'], axis=2).cumsum(axis=2), axis=2) + } + + +def create_test_int64(parent): + class TestCumSumInt64(parent): + def init_dtype(self): + self.dtype = np.int64 + + cls_name = "{0}_{1}".format(parent.__name__, "Int64") + TestCumSumInt64.__name__ = cls_name + globals()[cls_name] = TestCumSumInt64 + + +create_test_int64(TestNPUCumSumOp1) +create_test_int64(TestNPUCumSumOp2) +create_test_int64(TestNPUCumSumOp3) +create_test_int64(TestNPUCumSumOp4) +create_test_int64(TestNPUCumSumOp5) +create_test_int64(TestNPUCumSumOp7) +create_test_int64(TestNPUCumSumExclusive1) +create_test_int64(TestNPUCumSumExclusive2) +create_test_int64(TestNPUCumSumExclusive3) +create_test_int64(TestNPUCumSumExclusive4) +create_test_int64(TestNPUCumSumExclusive5) +create_test_int64(TestNPUCumSumReverseExclusive) +create_test_int64(TestNPUCumSumWithFlatten1) +create_test_int64(TestNPUCumSumWithFlatten2) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py index 7c8710fd42b36..fac2bc66ff49b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -95,6 +95,11 @@ def init_dtype(self): self.dtype = np.int32 +class TestElementwiseSubOpInt64(TestElementwiseSubOp): + def init_dtype(self): + self.dtype = np.int64 + + class TestSubtractAPI(unittest.TestCase): def test_name(self): with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 56f04a6e993f3..1031be4c1a7b4 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ 
b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -33,14 +33,15 @@ def setUp(self): self.place = paddle.NPUPlace(0) self.init_dtype() - self.init_dim() + self.init_dims() + self.init_padding_idx() np.random.seed(SEED) - bsz = 6 - seqlen = 8 - vocab = 10 - w = np.ones([vocab, self.dim]).astype(self.dtype) - x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) - out = np.ones([bsz, seqlen, self.dim]).astype(self.dtype) + w = np.random.random([self.vocab, self.dim]).astype(self.dtype) + x = np.random.randint( + 0, self.vocab, size=(self.bsz, self.seqlen)).astype(np.int32) + out = w[x] + if self.padding_idx != -1: + out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim) self.inputs = { 'W': OpTest.np_dtype_to_fluid_dtype(w), @@ -50,7 +51,7 @@ def setUp(self): 'is_sparse': False, 'is_distributed': False, 'remote_prefetch': False, - 'padding_idx': -1 + 'padding_idx': self.padding_idx } self.outputs = {'Out': out} @@ -60,10 +61,16 @@ def set_npu(self): def init_dtype(self): self.dtype = np.float32 - def init_dim(self): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 # embedding_dim is not multiple of 32 self.dim = 20 + def init_padding_idx(self): + self.padding_idx = -1 + def test_check_output(self): self.check_output_with_place(self.place) @@ -85,7 +92,10 @@ def set_npu(self): class TestLookupTableV2Dim32(TestLookupTableV2): - def init_dim(self): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 # embedding_dim is multiple of 32 self.dim = 64 @@ -96,7 +106,10 @@ class TestLookupTableV2Dim32FP16(TestLookupTableV2): def init_dtype(self): self.dtype = np.float16 - def init_dim(self): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 self.dim = 64 def set_npu(self): @@ -104,5 +117,10 @@ def set_npu(self): self.__class__.no_need_check_grad = True +class TestLookupTableV2WithPadding(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py new file mode 100644 index 0000000000000..a8dc0c137c353 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_matmul_op_npu.py @@ -0,0 +1,329 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. 
+ if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. + Out = np.array([Out], dtype="float64") + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_npu() + self.op_type = "matmul" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, + self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y, + 'alpha': self.alpha + } + self.outputs = {'Out': Out} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 2, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def 
config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.transpose_X = False + self.transpose_Y = False + + +#--------------------test matmul alpha-------------------- +def create_test_alpha_class(parent): + class TestMatMulOpAlphaCase(parent): + def init_alpha(self): + self.alpha = 0.125 + + cls_name = "{0}_{1}".format(parent.__name__, "Alpha") + TestMatMulOpAlphaCase.__name__ = cls_name + globals()[cls_name] = TestMatMulOpAlphaCase + + +create_test_alpha_class(TestMatMulOp) +create_test_alpha_class(TestMatMulOp1) +create_test_alpha_class(TestMatMulOp2) +create_test_alpha_class(TestMatMulOp3) +create_test_alpha_class(TestMatMulOp4) +create_test_alpha_class(TestMatMulOp5) +create_test_alpha_class(TestMatMulOp6) +create_test_alpha_class(TestMatMulOp9) +create_test_alpha_class(TestMatMulOp10) +create_test_alpha_class(TestMatMulOp11) +create_test_alpha_class(TestMatMulOp12) +create_test_alpha_class(TestMatMulOp13) + + +#--------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() From cea1ba88b8aa3940c55145e2a86b1ee48e0f2a57 Mon Sep 17 00:00:00 2001 From: zhaocaibei123 <48509226+zhaocaibei123@users.noreply.github.com> Date: Mon, 25 Oct 2021 21:34:24 +0800 Subject: [PATCH 02/14] add ctr accessor (#36601) --- paddle/fluid/distributed/ps.proto | 9 - paddle/fluid/distributed/table/CMakeLists.txt | 5 +- 
.../fluid/distributed/table/ctr_accessor.cc | 329 ++++++++++++++++++ paddle/fluid/distributed/table/ctr_accessor.h | 223 ++++++++++++ paddle/fluid/distributed/test/CMakeLists.txt | 3 + .../distributed/test/ctr_accessor_test.cc | 304 ++++++++++++++++ paddle/fluid/string/string_helper.cc | 34 -- paddle/fluid/string/string_helper.h | 34 +- 8 files changed, 893 insertions(+), 48 deletions(-) create mode 100644 paddle/fluid/distributed/table/ctr_accessor.cc create mode 100644 paddle/fluid/distributed/table/ctr_accessor.h create mode 100644 paddle/fluid/distributed/test/ctr_accessor_test.cc diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 002be15b003eb..4483f960eb137 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -119,13 +119,11 @@ message TableParameter { message TableAccessorParameter { optional string accessor_class = 1; - // optional SparseSGDRuleParameter sparse_sgd_param = 2; optional uint32 fea_dim = 4 [ default = 11 ]; optional uint32 embedx_dim = 5 [ default = 8 ]; optional uint32 embedx_threshold = 6 [ default = 10 ]; optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; - // optional SparseCommonSGDRuleParameter sparse_commonsgd_param = 9; optional SparseCommonSGDRuleParameter embed_sgd_param = 10; optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; } @@ -182,13 +180,6 @@ message TableAccessorSaveParameter { optional string deconverter = 3; } -// message SparseSGDRuleParameter { -// optional double learning_rate = 1 [default = 0.05]; -// optional double initial_g2sum = 2 [default = 3.0]; -// optional double initial_range = 3 [default = 0.0001]; -// repeated float weight_bounds = 4; -//} - message SparseCommonSGDRuleParameter { optional string name = 1; optional SparseNaiveSGDRuleParameter naive = 2; diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index b4b87e652b7db..7ec7041b63ba1 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -36,7 +36,8 @@ cc_library(tensor_table SRCS tensor_table.cc DEPS eigen3 ps_framework_proto exec set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) +cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) - -cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost sparse_sgd_rule) +cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost ctr_accessor) diff --git a/paddle/fluid/distributed/table/ctr_accessor.cc b/paddle/fluid/distributed/table/ctr_accessor.cc new file mode 100644 index 0000000000000..1ef8c9e152733 --- /dev/null +++ b/paddle/fluid/distributed/table/ctr_accessor.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include +#include "glog/logging.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +int CtrCommonAccessor::initialize() { + auto name = _config.embed_sgd_param().name(); + _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1); + + name = _config.embedx_sgd_param().name(); + _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embedx_sgd_rule->load_config(_config.embedx_sgd_param(), + _config.embedx_dim()); + + common_feature_value.embed_sgd_dim = _embed_sgd_rule->dim(); + common_feature_value.embedx_dim = _config.embedx_dim(); + common_feature_value.embedx_sgd_dim = _embedx_sgd_rule->dim(); + _show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate(); + + return 0; +} + +size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } + +size_t CtrCommonAccessor::dim_size(size_t dim) { + auto embedx_dim = _config.embedx_dim(); + return common_feature_value.dim_size(dim, embedx_dim); +} + +size_t CtrCommonAccessor::size() { return common_feature_value.size(); } + +size_t CtrCommonAccessor::mf_size() { + return (_config.embedx_dim() + common_feature_value.embedx_sgd_dim) * + sizeof(float); // embedx embedx_g2sum +} + +// pull value +size_t CtrCommonAccessor::select_dim() { + auto embedx_dim = _config.embedx_dim(); + return 1 + embedx_dim; +} + +size_t CtrCommonAccessor::select_dim_size(size_t dim) { return sizeof(float); } + +size_t CtrCommonAccessor::select_size() { return select_dim() * sizeof(float); } + +// push value +size_t CtrCommonAccessor::update_dim() { + auto embedx_dim = _config.embedx_dim(); + return 4 + embedx_dim; +} + +size_t CtrCommonAccessor::update_dim_size(size_t dim) { return sizeof(float); } + +size_t CtrCommonAccessor::update_size() { return update_dim() * sizeof(float); } + +bool CtrCommonAccessor::shrink(float* value) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delete_after_unseen_days = + _config.ctr_accessor_param().delete_after_unseen_days(); + auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); + + // time_decay first + common_feature_value.show(value) *= _show_click_decay_rate; + common_feature_value.click(value) *= _show_click_decay_rate; + + // shrink after + auto score = show_click_score(common_feature_value.show(value), + common_feature_value.click(value)); + auto unseen_days = common_feature_value.unseen_days(value); + if (score < delete_threshold || unseen_days > delete_after_unseen_days) { + return true; + } + return false; +} + +bool CtrCommonAccessor::save(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + // save all + case 0: { + 
return true; + } + // save xbox delta + case 1: + // save xbox base + case 2: { + if (show_click_score(common_feature_value.show(value), + common_feature_value.click(value)) >= + base_threshold && + common_feature_value.delta_score(value) >= delta_threshold && + common_feature_value.unseen_days(value) <= delta_keep_days) { + // do this after save, because it must not be modified when retry + if (param == 2) { + common_feature_value.delta_score(value) = 0; + } + return true; + } else { + return false; + } + } + // already decayed in shrink + case 3: { + // do this after save, because it must not be modified when retry + // common_feature_value.unseen_days(value)++; + return true; + } + // save revert batch_model + case 5: { + return true; + } + default: + return true; + } +} + +void CtrCommonAccessor::update_stat_after_save(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + case 1: { + if (show_click_score(common_feature_value.show(value), + common_feature_value.click(value)) >= + base_threshold && + common_feature_value.delta_score(value) >= delta_threshold && + common_feature_value.unseen_days(value) <= delta_keep_days) { + common_feature_value.delta_score(value) = 0; + } + } + return; + case 3: { + common_feature_value.unseen_days(value)++; + } + return; + default: + return; + } +} + +int32_t CtrCommonAccessor::create(float** values, size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* value = values[value_item]; + value[common_feature_value.unseen_days_index()] = 0; + value[common_feature_value.delta_score_index()] = 0; + value[common_feature_value.show_index()] = 0; + value[common_feature_value.click_index()] = 0; + value[common_feature_value.slot_index()] = -1; + _embed_sgd_rule->init_value( + value + common_feature_value.embed_w_index(), + value + common_feature_value.embed_g2sum_index()); + _embedx_sgd_rule->init_value( + value + common_feature_value.embedx_w_index(), + value + common_feature_value.embedx_g2sum_index(), false); + } + return 0; +} + +bool CtrCommonAccessor::need_extend_mf(float* value) { + float show = value[common_feature_value.show_index()]; + float click = value[common_feature_value.click_index()]; + float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + + click * _config.ctr_accessor_param().click_coeff(); + return score >= _config.embedx_threshold(); +} + +bool CtrCommonAccessor::has_mf(size_t size) { + return size > common_feature_value.embedx_g2sum_index(); +} + +// from CommonFeatureValue to CtrCommonPullValue +int32_t CtrCommonAccessor::select(float** select_values, const float** values, + size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* select_value = select_values[value_item]; + const float* value = values[value_item]; + select_value[CtrCommonPullValue::embed_w_index()] = + value[common_feature_value.embed_w_index()]; + memcpy(select_value + CtrCommonPullValue::embedx_w_index(), + value + common_feature_value.embedx_w_index(), + embedx_dim * sizeof(float)); + } + return 0; +} + +// from CtrCommonPushValue to CtrCommonPushValue +// first dim: item +// second dim: field num +int32_t CtrCommonAccessor::merge(float** 
update_values, + const float** other_update_values, + size_t num) { + auto embedx_dim = _config.embedx_dim(); + size_t total_dim = CtrCommonPushValue::dim(embedx_dim); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* update_value = update_values[value_item]; + const float* other_update_value = other_update_values[value_item]; + for (auto i = 0u; i < total_dim; ++i) { + if (i != CtrCommonPushValue::slot_index()) { + update_value[i] += other_update_value[i]; + } + } + } + return 0; +} + +// from CtrCommonPushValue to CommonFeatureValue +// first dim: item +// second dim: field num +int32_t CtrCommonAccessor::update(float** update_values, + const float** push_values, size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* update_value = update_values[value_item]; + const float* push_value = push_values[value_item]; + float push_show = push_value[CtrCommonPushValue::show_index()]; + float push_click = push_value[CtrCommonPushValue::click_index()]; + float slot = push_value[CtrCommonPushValue::slot_index()]; + update_value[common_feature_value.show_index()] += push_show; + update_value[common_feature_value.click_index()] += push_click; + update_value[common_feature_value.slot_index()] = slot; + update_value[common_feature_value.delta_score_index()] += + (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + + push_click * _config.ctr_accessor_param().click_coeff(); + update_value[common_feature_value.unseen_days_index()] = 0; + _embed_sgd_rule->update_value( + update_value + common_feature_value.embed_w_index(), + update_value + common_feature_value.embed_g2sum_index(), + push_value + CtrCommonPushValue::embed_g_index()); + _embedx_sgd_rule->update_value( + update_value + common_feature_value.embedx_w_index(), + update_value + common_feature_value.embedx_g2sum_index(), + push_value + CtrCommonPushValue::embedx_g_index()); + } + return 0; +} + +bool CtrCommonAccessor::create_value(int stage, const float* value) { + // stage == 0, pull + // stage == 1, push + if (stage == 0) { + return true; + } else if (stage == 1) { + // operation + auto show = CtrCommonPushValue::show_const(value); + auto click = CtrCommonPushValue::click_const(value); + auto score = show_click_score(show, click); + if (score <= 0) { + return false; + } + if (score >= 1) { + return true; + } + return local_uniform_real_distribution()(local_random_engine()) < + score; + } else { + return true; + } +} + +float CtrCommonAccessor::show_click_score(float show, float click) { + auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff(); + auto click_coeff = _config.ctr_accessor_param().click_coeff(); + return (show - click) * nonclk_coeff + click * click_coeff; +} + +std::string CtrCommonAccessor::parse_to_string(const float* v, int param) { + thread_local std::ostringstream os; + os.clear(); + os.str(""); + os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " " + << v[5]; + for (int i = common_feature_value.embed_g2sum_index(); + i < common_feature_value.embedx_w_index(); i++) { + os << " " << v[i]; + } + auto show = common_feature_value.show_const(v); + auto click = common_feature_value.click_const(v); + auto score = show_click_score(show, click); + if (score >= _config.embedx_threshold()) { + for (auto i = common_feature_value.embedx_w_index(); + i < common_feature_value.dim(); ++i) { + os << " " << v[i]; + } + } + return os.str(); +} + +int CtrCommonAccessor::parse_from_string(const 
std::string& str, float* value) { + int embedx_dim = _config.embedx_dim(); + + _embedx_sgd_rule->init_value( + value + common_feature_value.embedx_w_index(), + value + common_feature_value.embedx_g2sum_index()); + auto ret = paddle::string::str_to_float(str.data(), value); + CHECK(ret >= 6) << "expect more than 6 real:" << ret; + return ret; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/ctr_accessor.h b/paddle/fluid/distributed/table/ctr_accessor.h new file mode 100644 index 0000000000000..3c2ac7189f777 --- /dev/null +++ b/paddle/fluid/distributed/table/ctr_accessor.h @@ -0,0 +1,223 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" + +namespace paddle { +namespace distributed { + +class CtrCommonAccessor : public ValueAccessor { + public: + struct CtrCommonFeatureValue { + /* + float slot; + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + std::vector embedx_w; + std::float embedx_g2sum; + */ + + int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } + int dim_size(size_t dim, int embedx_dim) { return sizeof(float); } + int size() { return dim() * sizeof(float); } + int slot_index() { return 0; } + int unseen_days_index() { return slot_index() + 1; } + int delta_score_index() { return unseen_days_index() + 1; } + int show_index() { return delta_score_index() + 1; } + int click_index() { return show_index() + 1; } + int embed_w_index() { return click_index() + 1; } + int embed_g2sum_index() { return embed_w_index() + 1; } + int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; } + int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; } + + float& unseen_days(float* val) { return val[unseen_days_index()]; } + float& delta_score(float* val) { return val[delta_score_index()]; } + float& show(float* val) { return val[show_index()]; } + float& click(float* val) { return val[click_index()]; } + float& slot(float* val) { return val[slot_index()]; } + float& embed_w(float* val) { return val[embed_w_index()]; } + float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; } + float& embedx_w(float* val) { return val[embedx_w_index()]; } + float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; } + float show_const(const float* val) { + float s = val[show_index()]; + return s; + } + float click_const(const float* val) { + float c = val[click_index()]; + return c; + } + int embed_sgd_dim; + int embedx_dim; + int embedx_sgd_dim; + }; + + struct CtrCommonPushValue { + /* + float slot; + float show; + float click; + float embed_g; + std::vector embedx_g; + */ + + static int dim(int embedx_dim) { 
return 4 + embedx_dim; }
+
+    static int dim_size(int dim, int embedx_dim) { return sizeof(float); }
+    static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
+    static int slot_index() { return 0; }
+    static int show_index() { return CtrCommonPushValue::slot_index() + 1; }
+    static int click_index() { return CtrCommonPushValue::show_index() + 1; }
+    static int embed_g_index() { return CtrCommonPushValue::click_index() + 1; }
+    static int embedx_g_index() {
+      return CtrCommonPushValue::embed_g_index() + 1;
+    }
+    static float& slot(float* val) {
+      return val[CtrCommonPushValue::slot_index()];
+    }
+    static float& show(float* val) {
+      return val[CtrCommonPushValue::show_index()];
+    }
+    static float& click(float* val) {
+      return val[CtrCommonPushValue::click_index()];
+    }
+    static float show_const(const float* val) {
+      float s = val[show_index()];
+      return s;
+    }
+    static float click_const(const float* val) {
+      float c = val[click_index()];
+      return c;
+    }
+    static float& embed_g(float* val) {
+      return val[CtrCommonPushValue::embed_g_index()];
+    }
+    static float* embedx_g(float* val) {
+      return val + CtrCommonPushValue::embedx_g_index();
+    }
+  };
+
+  struct CtrCommonPullValue {
+    /*
+    float embed_w;
+    std::vector embedx_w;
+    */
+
+    static int dim(int embedx_dim) { return 1 + embedx_dim; }
+    static int dim_size(size_t dim) { return sizeof(float); }
+    static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
+    static int embed_w_index() { return 0; }
+    static int embedx_w_index() { return 1; }
+    static float& embed_w(float* val) {
+      return val[CtrCommonPullValue::embed_w_index()];
+    }
+    static float* embedx_w(float* val) {
+      return val + CtrCommonPullValue::embedx_w_index();
+    }
+  };
+
+  CtrCommonAccessor() {}
+  virtual int initialize();
+  virtual ~CtrCommonAccessor() {}
+
+  // dimension of value
+  virtual size_t dim();
+  // size of each dimension of value
+  virtual size_t dim_size(size_t dim);
+  // total size of all dimensions of value
+  virtual size_t size();
+  // total size of the dynamic-length mf part of value; effective for sparse
+  virtual size_t mf_size();
+  // dimension of pull value
+  virtual size_t select_dim();
+  // size of each dimension of pull value
+  virtual size_t select_dim_size(size_t dim);
+  // total size of all dimensions of pull value
+  virtual size_t select_size();
+  // dimension of push value
+  virtual size_t update_dim();
+  // size of each dimension of push value
+  virtual size_t update_dim_size(size_t dim);
+  // total size of all dimensions of push value
+  virtual size_t update_size();
+  // whether this value should be shrunk
+  virtual bool shrink(float* value);
+  // whether this value should be saved to SSD
+  // virtual bool save_ssd(float* value);
+  virtual bool need_extend_mf(float* value);
+  virtual bool has_mf(size_t size);
+  // whether this value is dumped in the save stage;
+  // param identifies the save stage, e.g. downpour's xbox vs. batch_model
+  // param = 0, save all feature
+  // param = 1, save delta feature
+  // param = 2, save xbox base feature
+  bool save(float* value, int param) override;
+  // update delta_score and unseen_days after save
+  void update_stat_after_save(float* value, int param) override;
+  // generate random values for keys that do not exist;
+  // the memory of value must already be allocated by the caller
+  virtual int32_t create(float** value, size_t num);
+  // select from values into select_values
+  virtual int32_t select(float** select_values, const float** values,
+                         size_t num);
+  // merge update_values together
+  virtual int32_t merge(float** update_values,
+                        const float** other_update_values, size_t num);
+  // merge update_values together; it.next decides whether to move on to the
+  // next key
+  // virtual int32_t merge(float** update_values, iterator it);
+  // apply update_values to values
+  virtual int32_t update(float** values, const float** update_values,
+                         size_t num);
+
+  std::string 
parse_to_string(const float* value, int param) override; + int32_t parse_from_string(const std::string& str, float* v) override; + virtual bool create_value(int type, const float* value); + + // 这个接口目前只用来取show + float get_field(float* value, const std::string& name) override { + // CHECK(name == "show"); + if (name == "show") { + return common_feature_value.show(value); + } + return 0.0; + } + + private: + // float show_click_score(float show, float click); + + // SparseValueSGDRule* _embed_sgd_rule; + // SparseValueSGDRule* _embedx_sgd_rule; + // CtrCommonFeatureValue common_feature_value; + float _show_click_decay_rate; + int32_t _ssd_unseenday_threshold; + + public: // TODO(zhaocaibei123): it should be private, but we make it public + // for unit test + CtrCommonFeatureValue common_feature_value; + float show_click_score(float show, float click); + SparseValueSGDRule* _embed_sgd_rule; + SparseValueSGDRule* _embedx_sgd_rule; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 832797ec2fc0e..f8cd9af4774ec 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -26,3 +26,6 @@ cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost set_source_files_properties(sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} boost table) + +set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc new file mode 100644 index 0000000000000..8c667cad605fc --- /dev/null +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -0,0 +1,304 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#include "paddle/fluid/distributed/table/ctr_accessor.h"
+#include <cmath>
+#include <iostream>
+#include "gtest/gtest.h"
+#include "paddle/fluid/distributed/common/registerer.h"
+#include "paddle/fluid/distributed/ps.pb.h"
+#include "paddle/fluid/distributed/table/sparse_sgd_rule.h"
+
+namespace paddle {
+namespace distributed {
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
+
+TableAccessorParameter gen_param() {
+  TableAccessorParameter param;
+  param.set_accessor_class("CtrCommonAccessor");
+  param.set_fea_dim(11);
+  param.set_embedx_dim(8);
+  param.mutable_ctr_accessor_param()->set_nonclk_coeff(0.2);
+  param.mutable_ctr_accessor_param()->set_click_coeff(1);
+  param.mutable_ctr_accessor_param()->set_base_threshold(0.5);
+  param.mutable_ctr_accessor_param()->set_delta_threshold(0.2);
+  param.mutable_ctr_accessor_param()->set_delta_keep_days(16);
+  param.mutable_ctr_accessor_param()->set_show_click_decay_rate(0.99);
+  /*
+  param.mutable_embed_sgd_param()->set_name("naive");
+  auto* naive_param = param.mutable_embed_sgd_param()->mutable_naive();
+  naive_param->set_learning_rate(0.1);
+  naive_param->set_initial_range(0.3);
+  naive_param->add_weight_bounds(-10.0);
+  naive_param->add_weight_bounds(10.0);
+  */
+  param.mutable_embed_sgd_param()->set_name("StdAdaGradSGDRule");
+  auto* adagrad_param = param.mutable_embed_sgd_param()->mutable_adagrad();
+  adagrad_param->set_learning_rate(0.1);
+  adagrad_param->set_initial_range(0.3);
+  adagrad_param->set_initial_g2sum(0.0);
+  adagrad_param->add_weight_bounds(-10.0);
+  adagrad_param->add_weight_bounds(10.0);
+
+  param.mutable_embedx_sgd_param()->set_name("SparseNaiveSGDRule");
+  auto* naive_param = param.mutable_embedx_sgd_param()->mutable_naive();
+  naive_param->set_learning_rate(0.1);
+  naive_param->set_initial_range(0.3);
+  naive_param->add_weight_bounds(-10.0);
+  naive_param->add_weight_bounds(10.0);
+
+  return param;
+}
+
+TEST(downpour_feature_value_accessor_test, test_shrink) {
+  TableAccessorParameter parameter = gen_param();
+  CtrCommonAccessor* acc = new CtrCommonAccessor();
+  ASSERT_EQ(acc->configure(parameter), 0);
+  ASSERT_EQ(acc->initialize(), 0);
+
+  VLOG(3) << "size of struct: " << acc->common_feature_value.embed_sgd_dim
+          << " " << acc->common_feature_value.embedx_dim << " "
+          << acc->common_feature_value.embedx_sgd_dim << " "
+          << acc->common_feature_value.dim() << "\n";
+
+  float* value = new float[acc->dim()];
+  for (auto i = 0u; i < acc->dim(); ++i) {
+    value[i] = i * 1.0;
+  }
+  ASSERT_TRUE(!acc->shrink(value));
+
+  // set unseen_days too long
+  value[1] = 1000;
+  // set delta score too small
+  value[2] = 0.001;
+  ASSERT_TRUE(acc->shrink(value));
+}
+
+TEST(downpour_feature_value_accessor_test, test_save) {
+  TableAccessorParameter parameter = gen_param();
+  CtrCommonAccessor* acc = new CtrCommonAccessor();
+  ASSERT_EQ(acc->configure(parameter), 0);
+  ASSERT_EQ(acc->initialize(), 0);
+
+  float* value = new float[acc->dim()];
+  for (auto i = 0u; i < acc->dim(); ++i) {
+    value[i] = i * 1.0;
+  }
+
+  // save all feature
+  ASSERT_TRUE(acc->save(value, 0));
+
+  // save delta feature
+  ASSERT_TRUE(acc->save(value, 1));
+
+  // save base feature with time decay
+  ASSERT_TRUE(acc->save(value, 2));
+
+  VLOG(3) << "test_save:";
+  for (auto i = 0u; i < acc->dim(); ++i) {
+    VLOG(3) << value[i];
+  }
+}
+
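For orientation between these tests, here is how the per-feature layout falls out under the rules configured in gen_param() above. This is a sketch; the dimension values are assumptions read off the loop bounds in test_update below (one embed_g2sum slot, no embedx_g2sum slots), not API guarantees:

CtrCommonAccessor::CtrCommonFeatureValue v;
v.embed_sgd_dim = 1;   // one g2sum slot kept by the adagrad embed rule
v.embedx_dim = 8;      // from param.set_embedx_dim(8)
v.embedx_sgd_dim = 0;  // the naive embedx rule keeps no g2sum state
// v.dim() == 6 + 1 + 0 + 8 == 15 floats per feature:
//   [0]=slot [1]=unseen_days [2]=delta_score [3]=show [4]=click
//   [5]=embed_w [6]=embed_g2sum [7..14]=embedx_w, embedx_g2sum empty
// which is why test_create and test_update below use field_size = 7 + 8.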
+TEST(downpour_feature_value_accessor_test, test_create) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + const int field_size = 7 + 8; + const int item_size = 10; + + float** value = new float*[item_size]; + for (auto i = 0u; i < item_size; ++i) { + value[i] = new float[field_size]; + } + ASSERT_EQ(acc->create(value, item_size), 0); + + for (auto i = 0u; i < item_size; ++i) { + for (auto j = 0u; j < field_size; ++j) { + VLOG(3) << value[i][j] << " "; + // ASSERT_FLOAT_EQ(value[i][j], 0); + } + VLOG(3) << "\n"; + } +} + +TEST(downpour_feature_value_accessor_test, test_update) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + VLOG(3) << "dim: " << acc->common_feature_value.dim() << "\n"; + VLOG(3) << "update_dim: " << acc->update_dim() << "\n"; + + const int field_size = 7 + 8; + const int item_size = 10; + + float** value = new float*[item_size]; + for (auto i = 0u; i < item_size; ++i) { + value[i] = new float[field_size]; + + for (auto j = 0u; j < field_size; ++j) { + value[i][j] = 0; + } + } + + typedef const float* const_float_ptr; + const_float_ptr* grad = new const_float_ptr[item_size]; + for (auto i = 0u; i < item_size; ++i) { + float* p = new float[acc->update_dim()]; + for (auto j = 0u; j < acc->update_dim(); ++j) { + p[j] = i; + } + grad[i] = p; + } + + struct DownpourSparseValueTest { + float slot; + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + std::vector embedx_w; + std::vector embedx_g2sum; + + void to_array(float* ptr, size_t dim) { + ptr[0] = slot; + ptr[1] = unseen_days; + ptr[2] = delta_score; + ptr[3] = show; + ptr[4] = click; + ptr[5] = embed_w; + int idx = 6; + for (auto j = 0u; j < 1; ++j) { + ptr[idx + j] = embed_g2sum[j]; + } + idx += 1; + for (auto j = 0u; j < 8; ++j) { + ptr[idx + j] = embedx_w[j]; + } + idx += 8; + for (auto j = 0u; j < 0; ++j) { + ptr[idx + j] = embedx_g2sum[j]; + } + } + }; + struct DownpourSparsePushValueTest { + float slot; + float show; + float click; + float embed_g; + std::vector embedx_g; + }; + std::vector exp_value; + for (auto i = 0u; i < item_size; ++i) { + DownpourSparseValueTest v; + v.slot = value[i][0]; + v.unseen_days = value[i][1]; + v.delta_score = value[i][2]; + v.show = value[i][3]; + v.click = value[i][4]; + v.embed_w = value[i][5]; + + int idx = 6; + for (auto j = 0u; j < acc->common_feature_value.embed_sgd_dim; ++j) { + v.embed_g2sum.push_back(value[i][idx + j]); + } + idx += acc->common_feature_value.embed_sgd_dim; + for (auto j = 0u; j < acc->common_feature_value.embedx_dim; ++j) { + v.embedx_w.push_back(value[i][idx + j]); + } + idx += acc->common_feature_value.embedx_dim; + for (auto j = 0u; j < acc->common_feature_value.embedx_sgd_dim; ++j) { + v.embedx_g2sum.push_back(value[i][idx + j]); + } + + DownpourSparsePushValueTest push_v; + push_v.slot = grad[i][0]; + push_v.show = grad[i][1]; + push_v.click = grad[i][2]; + push_v.embed_g = grad[i][3]; + for (auto j = 0; j < parameter.embedx_dim(); ++j) { + push_v.embedx_g.push_back(grad[i][4 + j]); + } + + v.slot = push_v.slot; + v.unseen_days = 0; + v.show += push_v.show; + v.click += push_v.click; + v.delta_score += acc->show_click_score(push_v.show, push_v.click); + + acc->_embed_sgd_rule->update_value(&v.embed_w, &v.embed_g2sum[0], + 
&push_v.embed_g); + acc->_embedx_sgd_rule->update_value(&v.embedx_w[0], &v.embedx_g2sum[0], + &push_v.embedx_g[0]); + + float* ptr = new float[acc->dim()]; + v.to_array(ptr, parameter.embedx_dim()); + exp_value.push_back(ptr); + } + acc->update(value, grad, item_size); + + for (auto i = 0u; i < item_size; ++i) { + for (auto j = 0u; j < acc->dim(); ++j) { + VLOG(3) << value[i][j] << ":" << exp_value[i][j] << " "; + ASSERT_FLOAT_EQ(value[i][j], exp_value[i][j]); + } + } +} + +TEST(downpour_feature_value_accessor_test, test_show_click_score) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + float show = 10; + float click = 6; + ASSERT_FLOAT_EQ(acc->show_click_score(show, click), 6.8); +} + +TEST(downpour_feature_value_accessor_test, test_string_related) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + const int field_size = 15; + float* value = new float[field_size]; + for (auto i = 0u; i < field_size; ++i) { + value[i] = i; + } + + auto str = acc->parse_to_string(value, 0); + + VLOG(3) << str << std::endl; + + str = "0 1 2 3 4 5 6"; + ASSERT_NE(acc->parse_from_string(str, value), 0); + // make sure init_zero=true + + for (auto i = 7; i < 15; ++i) { + ASSERT_FLOAT_EQ(value[i], 0); + } +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc index 141ac2ba47c5b..db9ee7592fc84 100644 --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/fluid/string/string_helper.cc @@ -24,26 +24,6 @@ namespace paddle { namespace string { -inline size_t count_spaces(const char* s) { - size_t count = 0; - - while (*s != 0 && isspace(*s++)) { - count++; - } - - return count; -} - -inline size_t count_nonspaces(const char* s) { - size_t count = 0; - - while (*s != 0 && !isspace(*s++)) { - count++; - } - - return count; -} - // remove leading and tailing spaces std::string trim_spaces(const std::string& str) { const char* p = str.c_str(); @@ -74,20 +54,6 @@ std::string erase_spaces(const std::string& str) { return result; } -inline int str_to_float(const char* str, float* v) { - const char* head = str; - char* cursor = NULL; - int index = 0; - while (*(head += count_spaces(head)) != 0) { - v[index++] = std::strtof(head, &cursor); - if (head == cursor) { - break; - } - head = cursor; - } - return index; -} - bool ends_with(std::string const& input, std::string const& test) { if (test.size() > input.size()) return false; return std::equal(test.rbegin(), test.rend(), input.rbegin()); diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index 1ab7690f8b517..4f1aee7c7ed17 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -26,9 +26,25 @@ namespace paddle { namespace string { -inline size_t count_spaces(const char* s); +inline size_t count_spaces(const char* s) { + size_t count = 0; -inline size_t count_nonspaces(const char* s); + while (*s != 0 && isspace(*s++)) { + count++; + } + + return count; +} + +inline size_t count_nonspaces(const char* s) { + size_t count = 0; + + while (*s != 0 && !isspace(*s++)) { + count++; + } + + return count; +} template void format_string_append(std::string& str, const char* fmt, // NOLINT @@ -67,7 +83,19 @@ std::string trim_spaces(const std::string& str); // 
erase all spaces in str std::string erase_spaces(const std::string& str); -int str_to_float(const char* str, float* v); +inline int str_to_float(const char* str, float* v) { + const char* head = str; + char* cursor = NULL; + int index = 0; + while (*(head += count_spaces(head)) != 0) { + v[index++] = std::strtof(head, &cursor); + if (head == cursor) { + break; + } + head = cursor; + } + return index; +} // checks whether the test string is a suffix of the input string. bool ends_with(std::string const& input, std::string const& test); From 229bae819b68c5e13fa5aad1fe3b730cdb39d208 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 26 Oct 2021 10:08:07 +0800 Subject: [PATCH 03/14] Pool3d 2.0 (#36545) --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/pool3d_op.cc | 228 +++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 3 +- .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/pool3d_op_plugin.cu | 375 ++++++++++++++++++ .../tensorrt/plugin/pool3d_op_plugin.h | 244 ++++++++++++ paddle/fluid/operators/math/pooling.cu | 48 +++ paddle/fluid/operators/math/pooling.h | 14 + .../unittests/ir/inference/CMakeLists.txt | 2 + .../ir/inference/test_trt_pool3d_op.py | 332 ++++++++++++++++ 11 files changed, 1248 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/pool3d_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index eabca4197a1d3..dda4be8f81c63 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1415,6 +1415,7 @@ USE_TRT_CONVERTER(tile); USE_TRT_CONVERTER(conv3d); USE_TRT_CONVERTER(conv3d_transpose); USE_TRT_CONVERTER(mish); +USE_TRT_CONVERTER(pool3d) #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ef12cb6b36617..b6aa0a230cc2d 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -19,6 +19,7 @@ nv_library(tensorrt_converter conv3d_op.cc mish_op.cc nearest_interp_v2_op.cc + pool3d_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc new file mode 100644 index 0000000000000..9baed499f14a7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -0,0 +1,228 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +inline void DealCeilMode(const nvinfer1::Dims &input_shape, + std::vector ksize, std::vector strides, + std::vector paddings, nvinfer1::DimsCHW *pre_pad, + nvinfer1::DimsCHW *post_pad, int input_dims) { + int input_depth = input_shape.d[input_dims - 3]; + int input_height = input_shape.d[input_dims - 2]; + int input_width = input_shape.d[input_dims - 1]; + + int floor_d_output_size = + (input_depth - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int ceil_d_output_size = + (input_depth - ksize[0] + 2 * paddings[0] + strides[0] - 1) / strides[0] + + 1; + + int floor_h_output_size = + (input_height - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int ceil_h_output_size = + (input_height - ksize[1] + 2 * paddings[1] + strides[1] - 1) / + strides[1] + + 1; + + int floor_w_output_size = + (input_width - ksize[2] + 2 * paddings[2]) / strides[2] + 1; + int ceil_w_output_size = + (input_width - ksize[2] + 2 * paddings[2] + strides[2] - 1) / strides[2] + + 1; + + if (floor_d_output_size != ceil_d_output_size) { + post_pad->c() = strides[0] - 1; + } + + if (floor_h_output_size != ceil_h_output_size) { + post_pad->h() = strides[1] - 1; + } + + if (floor_w_output_size != ceil_w_output_size) { + post_pad->w() = strides[2] - 1; + } +} + +class Pool3dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool3d op to tensorrt pool3d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); + nvinfer1::Dims input_shape = input1->getDimensions(); + int input_dims = input_shape.nbDims; + + bool global_pooling = + BOOST_GET_CONST(bool, op_desc.GetAttr("global_pooling")); + std::string pool_type = + BOOST_GET_CONST(std::string, op_desc.GetAttr("pooling_type")); + std::vector ksize = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("ksize")); + std::vector strides = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("strides")); + std::vector paddings = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + bool exclusive = op_desc.HasAttr("exclusive") + ? 
BOOST_GET_CONST(bool, op_desc.GetAttr("exclusive")) + : true; + bool ceil_mode = BOOST_GET_CONST(bool, op_desc.GetAttr("ceil_mode")); + bool adaptive = false; + if (op_desc.HasAttr("adaptive")) + adaptive = BOOST_GET_CONST(bool, op_desc.GetAttr("adaptive")); + std::string padding_algorithm = "EXPLICIT"; + if (op_desc.HasAttr("padding_algorithm")) + padding_algorithm = + BOOST_GET_CONST(std::string, op_desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "VALID" || padding_algorithm == "SAME") { + std::fill(paddings.begin(), paddings.end(), 0); + } + + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; + nvinfer1::ReduceOperation reduce_operation = + nvinfer1::ReduceOperation::kMAX; + plugin::Pool3DPlugin::Pool3DType plugin_pool_type = + plugin::Pool3DPlugin::Pool3DType::max; + if (pool_type == "max") { + nv_pool_type = nvinfer1::PoolingType::kMAX; + reduce_operation = nvinfer1::ReduceOperation::kMAX; + plugin_pool_type = plugin::Pool3DPlugin::Pool3DType::max; + } else if (pool_type == "avg") { + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; + reduce_operation = nvinfer1::ReduceOperation::kAVG; + plugin_pool_type = plugin::Pool3DPlugin::Pool3DType::avg; + } + nvinfer1::DimsCHW nv_ksize(ksize[0], ksize[1], ksize[2]); + nvinfer1::DimsCHW nv_strides(strides[0], strides[1], strides[2]); + nvinfer1::DimsCHW nv_paddings(paddings[0], paddings[1], paddings[2]); + nvinfer1::ILayer *layer = nullptr; + if (op_desc.HasAttr("enable_int8")) { + CHECK(op_desc.HasAttr("X_scale")); + float input_scale = BOOST_GET_CONST(float, op_desc.GetAttr("X_scale")); + engine_->SetTensorDynamicRange(input1, input_scale); + } + + if (engine_->with_dynamic_shape()) { + if (!adaptive && !global_pooling && !ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, PoolingNd, *input1, + nv_pool_type, nv_ksize); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = pool_layer; + } else if (global_pooling) { + auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, + reduce_operation, 28, true); + layer = reduce_layer; + } else { + plugin::Pool3DPluginDynamic *plugin = new plugin::Pool3DPluginDynamic( + ceil_mode, pool_type, adaptive, ksize, strides, paddings, + global_pooling); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); + } + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool3d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + return; + } + + if (global_pooling == true) { + auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, + reduce_operation, 14, true); + layer = reduce_layer; + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool3d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + return; + } + + if (!adaptive) { + if (!ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, PoolingNd, *input1, + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool layer in converter could not be created.")); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = 
pool_layer; + } else { + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::Pool3DPlugin *plugin = + new plugin::Pool3DPlugin(ceil_mode, plugin_pool_type, adaptive, + ksize, strides, paddings, input_shape_v); + auto *pool_layer = engine_->AddPluginV2Ext(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool3d plugin layer in converter could not be created.")); + layer = pool_layer; + } + } else { + // Average pooling needs to exclude the padding pixels from the average + // mean. + // It is not supported well by TRT, we use a plugin here. + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::Pool3DPlugin *plugin = + new plugin::Pool3DPlugin(ceil_mode, plugin_pool_type, adaptive, ksize, + strides, paddings, input_shape_v); + auto *pool_layer = engine_->AddPluginV2Ext(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool3d plugin layer in converter could not be created.")); + layer = pool_layer; + } + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "pool3d", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool3d); +REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 91515f1fa5811..7049df4b300f1 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -142,7 +142,8 @@ struct SimpleOpTypeSetTeller : public Teller { "conv3d", "conv3d_transpose", "mish", - "nearest_interp_v2"}; + "nearest_interp_v2", + "pool3d"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e6bcb59fd092c..9e93894e623c0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -10,6 +10,7 @@ nv_library(tensorrt_plugin roi_align_op_plugin.cu gather_nd_op_plugin.cu mish_op_plugin.cu + pool3d_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu new file mode 100644 index 0000000000000..861a9aa9d000b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -0,0 +1,375 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, softwarepool +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
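A worked example of the ceil-mode gap that the converter's DealCeilMode above compensates for: with one spatial extent of 7, ksize 2, stride 2, padding 0, floor mode yields (7 - 2 + 0) / 2 + 1 = 3 outputs while ceil mode yields (7 - 2 + 0 + 2 - 1) / 2 + 1 = 4; adding a post-pad of stride - 1 = 1 grows the extent to 8, so the plain floor-mode pooling layer then produces the same 4 outputs.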
+ +#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +size_t Pool3DPlugin::getSerializationSize() const TRT_NOEXCEPT { + return getBaseSerializationSize() + SerializedSize(ceil_mode_) + + SerializedSize(pool3d_type_) + SerializedSize(adaptive_) + + SerializedSize(ksize_) + SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_); +} + +// TRT will call this func when we need to serialize the configuration of +// tensorrt. +void Pool3DPlugin::serialize(void *buffer) const TRT_NOEXCEPT { + serializeBase(buffer); + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool3d_type_); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); +} + +Pool3DPlugin *Pool3DPlugin::clone() const TRT_NOEXCEPT { + return new Pool3DPlugin(ceil_mode_, pool3d_type_, adaptive_, ksize_, strides_, + paddings_, input_shape_); +} + +const char *Pool3DPlugin::getPluginType() const TRT_NOEXCEPT { + return "pool3d_plugin"; +} + +int Pool3DPlugin::getNbOutputs() const TRT_NOEXCEPT { return 1; } + +int Pool3DPlugin::initialize() TRT_NOEXCEPT { return 0; } + +nvinfer1::DataType Pool3DPlugin::getOutputDataType( + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { + return input_types[0]; +} + +void Pool3DPlugin::destroy() TRT_NOEXCEPT { delete this; } + +nvinfer1::Dims Pool3DPlugin::getOutputDimensions( + int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nbInputs, 1, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so the nbInputs " + "value should be 1, but get %d.", + nbInputs)); + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so " + "the index value should be 0, but get %d.", + index)); + PADDLE_ENFORCE_EQ(inputDims[0].nbDims, 4, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has four Dimensions, so the " + "nbDims value should be 4, but get %d.", + inputDims[0].nbDims)); + + nvinfer1::Dims const &input_dims = inputDims[0]; + + nvinfer1::Dims output_dims = input_dims; + + output_dims.d[1] = output_shape_[1]; + output_dims.d[2] = output_shape_[2]; + output_dims.d[3] = output_shape_[3]; + return output_dims; +} + +int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) + void **outputs, void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { +#else + void *const *outputs, void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { +#endif + int input_size = 0; + float const *idata = reinterpret_cast(inputs[0]); + float *const *odatas = reinterpret_cast(outputs); + + std::vector input_shape = input_shape_; + std::vector output_shape = output_shape_; + input_shape.insert(input_shape.begin(), batchSize); + output_shape.insert(output_shape.begin(), batchSize); + + if (pool3d_type_ == Pool3DType::max) { + paddle::operators::math::MaxPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::MaxPool, float> + pool3d_forward; + pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, + paddings_, true, adaptive_, odatas[0], stream, pool_process); + } else if 
(pool3d_type_ == Pool3DType::avg) { + paddle::operators::math::AvgPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::AvgPool, float> + pool3d_forward; + pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, + paddings_, true, adaptive_, odatas[0], stream, pool_process); + } + + return cudaGetLastError() != cudaSuccess; +} + +// Dynamic Plugin below. + +Pool3DPluginDynamic::Pool3DPluginDynamic(void const *serialData, + size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + const char *pool3d_type; + DeserializeValue(&serialData, &serialLength, &pool3d_type); + pool3d_type_ = std::string(pool3d_type); + DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &is_global_); +} + +nvinfer1::IPluginV2DynamicExt *Pool3DPluginDynamic::clone() const TRT_NOEXCEPT { + return new Pool3DPluginDynamic(ceil_mode_, pool3d_type_, adaptive_, ksize_, + strides_, paddings_, is_global_); +} + +const char *Pool3DPluginDynamic::getPluginType() const TRT_NOEXCEPT { + return "pool3d_plugin_dynamic"; +} +int Pool3DPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } + +int Pool3DPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } + +void Pool3DPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) TRT_NOEXCEPT {} + +size_t Pool3DPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, + int nbOutputs) const TRT_NOEXCEPT { + return 0; +} + +size_t Pool3DPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { + return SerializedSize(ceil_mode_) + SerializedSize(pool3d_type_.c_str()) + + SerializedSize(adaptive_) + SerializedSize(ksize_) + + SerializedSize(strides_) + SerializedSize(paddings_) + + SerializedSize(is_global_); +} + +void Pool3DPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool3d_type_.c_str()); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, is_global_); +} + +nvinfer1::DimsExprs Pool3DPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nb_inputs, 1, + platform::errors::InvalidArgument( + "The Split plugin should be only one input.")); + + PADDLE_ENFORCE_EQ( + inputs[0].d[1]->isConstant(), true, + platform::errors::InvalidArgument("The channel dimension should be " + "static, but we found it's dynamic.")); + nvinfer1::DimsExprs output(inputs[0]); + if (is_global_) { + output.d[2] = expr_builder.constant(1); + output.d[3] = expr_builder.constant(1); + output.d[4] = expr_builder.constant(1); + return output; + } + if (adaptive_) { + output.d[2] = expr_builder.constant(ksize_[0]); + output.d[3] = expr_builder.constant(ksize_[1]); + output.d[4] = expr_builder.constant(ksize_[2]); + return output; + } + + auto stri_0 = expr_builder.constant(strides_[0]); + auto stri_1 = expr_builder.constant(strides_[1]); + auto stri_2 = expr_builder.constant(strides_[2]); + auto one_value = 
expr_builder.constant(1); + + auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]); + auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]); + auto v2_tmp = expr_builder.constant(-ksize_[2] + 2 * paddings_[2]); + + auto ceil_tmp = + expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1); + auto ceil1_tmp = + expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1); + auto ceil2_tmp = + expr_builder.constant(-ksize_[2] + 2 * paddings_[2] + strides_[2] - 1); + + if (!ceil_mode_) { + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *v0_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *v1_tmp), + *stri_1), + *one_value); + output.d[4] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[4], *v2_tmp), + *stri_2), + *one_value); + + } else { + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *ceil_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *ceil1_tmp), + *stri_1), + *one_value); + output.d[4] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[4], *ceil2_tmp), + *stri_2), + *one_value); + } + + return output; +} + +bool Pool3DPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of swish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + (in_out && pos < (nb_inputs + nb_outputs)); + + return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) && + in_out[pos].format == nvinfer1::PluginFormat::kLINEAR); +} + +nvinfer1::DataType Pool3DPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true, + platform::errors::InvalidArgument( + "The input type should be half or float")); + return input_types[0]; +} + +int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, + const void *const *inputs, + void *const *outputs, void 
*workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_dims = input_desc[0].dims; + int n = input_dims.d[0]; + int c = input_dims.d[1]; + int d = input_dims.d[2]; + int h = input_dims.d[3]; + int w = input_dims.d[4]; + + const float *input = static_cast(inputs[0]); + float *output = static_cast(outputs[0]); + + std::vector input_shape, output_shape; + for (int i = 0; i < input_dims.nbDims; i++) + input_shape.push_back(input_dims.d[i]); + output_shape = input_shape; + + std::vector ksize = ksize_; + std::vector paddings = paddings_; + if (is_global_) { + ksize[0] = d; + ksize[1] = h; + ksize[2] = w; + paddings[0] = 0; + paddings[1] = 0; + paddings[2] = 0; + output_shape[2] = 1; + output_shape[3] = 1; + output_shape[4] = 1; + } else { + auto data_dim = CalcOutputSize({d, h, w}, ceil_mode_, adaptive_, ksize_, + strides_, paddings_); + output_shape[2] = data_dim[0]; + output_shape[3] = data_dim[1]; + output_shape[4] = data_dim[2]; + } + + if (pool3d_type_ == "max") { + paddle::operators::math::MaxPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::MaxPool, float> + pool3d_forward; + pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, + true, adaptive_, output, stream, pool_process); + } else if (pool3d_type_ == "avg") { + paddle::operators::math::AvgPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::AvgPool, float> + pool3d_forward; + pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, + true, adaptive_, output, stream, pool_process); + } + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h new file mode 100644 index 0000000000000..7c9a8625d70f3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h @@ -0,0 +1,244 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
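As a usage sketch of the CalcOutputSize helper declared just below (element types assumed to be int, matching how the plugins call it):

std::vector<int> out = CalcOutputSize({8, 8, 8}, /*ceil_mode=*/false,
                                      /*adaptive=*/false, /*ksize=*/{2, 2, 2},
                                      /*strides=*/{1, 1, 1},
                                      /*paddings=*/{0, 0, 0});
// out == {7, 7, 7}: each extent is (8 - 2 + 2 * 0) / 1 + 1.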
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +static std::vector CalcOutputSize(const std::vector& input_shape, + const bool& ceil_mode, + const bool& adaptive, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings) { + std::vector output_shape = input_shape; + if (adaptive) { + output_shape[0] = ksize[0]; + output_shape[1] = ksize[1]; + output_shape[2] = ksize[2]; + } else { + int output_d = + (input_shape[0] - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int output_h = + (input_shape[1] - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int output_w = + (input_shape[2] - ksize[2] + 2 * paddings[2]) / strides[2] + 1; + if (ceil_mode) { + output_d = + (input_shape[0] - ksize[0] + 2 * paddings[0] + strides[0] - 1) / + strides[0] + + 1; + output_h = + (input_shape[1] - ksize[1] + 2 * paddings[1] + strides[1] - 1) / + strides[1] + + 1; + output_w = + (input_shape[2] - ksize[2] + 2 * paddings[2] + strides[2] - 1) / + strides[2] + + 1; + } + output_shape[0] = output_d; + output_shape[1] = output_h; + output_shape[2] = output_w; + } + return output_shape; +} + +class Pool3DPlugin : public PluginTensorRTV2Ext { + public: + size_t getSerializationSize() const TRT_NOEXCEPT override; + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + void serialize(void* buffer) const TRT_NOEXCEPT override; + + enum class Pool3DType { + max = 0, + avg, + }; + Pool3DPlugin() {} + Pool3DPlugin(bool ceil_mode, Pool3DType pool3d_type, bool adaptive, + std::vector ksize, std::vector strides, + std::vector paddings, std::vector input_shape) + : ceil_mode_(ceil_mode), + pool3d_type_(pool3d_type), + adaptive_(adaptive), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + input_shape_(input_shape) { + output_shape_ = input_shape_; + std::vector output_shape = + CalcOutputSize({input_shape_[1], input_shape_[2], input_shape_[3]}, + ceil_mode_, adaptive_, ksize_, strides_, paddings_); + output_shape_[1] = output_shape[0]; + output_shape_[2] = output_shape[1]; + output_shape_[3] = output_shape[2]; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ Pool3DPlugin(void const* serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + DeserializeValue(&serialData, &serialLength, &pool3d_type_); + DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &input_shape_); + DeserializeValue(&serialData, &serialLength, &output_shape_); + } + + Pool3DPlugin* clone() const TRT_NOEXCEPT override; + + const char* getPluginType() const TRT_NOEXCEPT override; + + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override; + + int getNbOutputs() const TRT_NOEXCEPT override; + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT override; + + int initialize() TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override; + +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + private: + bool ceil_mode_; + Pool3DType pool3d_type_; + bool adaptive_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + std::vector input_shape_; + std::vector output_shape_; +}; + +class Pool3DPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool3d_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new Pool3DPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(Pool3DPluginCreator); + +class Pool3DPluginDynamic : public DynamicPluginTensorRT { + public: + Pool3DPluginDynamic() {} + Pool3DPluginDynamic(const bool& ceil_mode, const std::string& pool3d_type, + const bool& adaptive, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, const bool& is_global) + : ceil_mode_(ceil_mode), + pool3d_type_(pool3d_type), + adaptive_(adaptive), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + is_global_(is_global) {} + + Pool3DPluginDynamic(void const* serialData, size_t serialLength); + ~Pool3DPluginDynamic() {} + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + 
int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + bool ceil_mode_; + std::string pool3d_type_; + bool adaptive_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + bool is_global_; +}; + +class Pool3DPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool3d_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new Pool3DPluginDynamic(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(Pool3DPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 48b0d2ab46057..84a970a9a2606 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -979,6 +979,49 @@ __global__ void KernelMaxPool3DGrad( } } +template +void Pool3dDirectCUDAFunctor::operator()( + const T* input, const std::vector& input_shape, + const std::vector& output_shape, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + bool exclusive, bool adaptive, T* output, gpuStream_t stream, + PoolProcess pool_compute) { + const int batch_size = input_shape[0]; + const int input_channels = input_shape[1]; + const int input_depth = input_shape[2]; + const int input_height = input_shape[3]; + const int input_width = input_shape[4]; + const int output_channels = output_shape[1]; + const int output_depth = output_shape[2]; + const int output_height = output_shape[3]; + const int output_width = output_shape[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int thread_num = 1024; +#ifdef WITH_NV_JETSON + thread_num = 512; +#endif + int blocks = (nthreads + thread_num - 1) / thread_num; + dim3 threads(thread_num, 1); + dim3 grid(blocks, 1); + + KernelPool3D<<>>( + nthreads, input, input_channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, ksize_depth, ksize_height, + ksize_width, stride_depth, stride_height, stride_width, padding_depth, + padding_height, padding_width, pool_compute, exclusive, adaptive, output); +} + /* * Tensors are in NCDHW or NDHWC format. * Ksize, strides, paddings are three elements. 
These three elements represent @@ -1315,6 +1358,11 @@ class MaxPool3dGradFunctor { } }; +template class Pool3dDirectCUDAFunctor, + float>; +template class Pool3dDirectCUDAFunctor, + float>; + template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor +class Pool3dDirectCUDAFunctor { + public: + void operator()(const T* input, const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool exclusive, + bool adaptive, T* output, gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + template class Pool3dFunctor { public: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index b951afdfad5ea..927456b396ea5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -58,8 +58,10 @@ set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120) if(WITH_NV_JETSON) set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450) + set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 450) else() set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45) + set_tests_properties(test_trt_pool3d_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45) endif() set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_tile_op PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py new file mode 100644 index 0000000000000..6fbddcf5a1fc0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pool3d_op.py @@ -0,0 +1,332 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TensorRTPool3dTest(InferencePassTest): + def setUp(self): + self.bs = 1 + self.channel = 3 + self.depth = 8 + self.height = 8 + self.width = 8 + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + self.enable_trt = True + self.serialize = False + self.precision = AnalysisConfig.Precision.Float32 + self.feeds = { + 'data': np.random.random( + [self.bs, self.channel, self.depth, self.height, + self.width]).astype('float32'), + } + + def set_extra_config(self): + pass + + def build_network(self): + self.set_extra_config() + self.trt_parameters = TensorRTPool3dTest.TensorRTParam( + 1 << 30, self.bs, 0, self.precision, self.serialize, False) + + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', + shape=[-1, self.channel, self.depth, self.height, self.width], + dtype='float32') + pool_out = fluid.layers.pool3d( + input=data, + pool_size=self.pool_size, + pool_type=self.pool_type, + pool_stride=self.pool_stride, + pool_padding=self.pool_padding, + global_pooling=self.global_pooling, + ceil_mode=self.ceil_mode, + exclusive=self.exclusive) + #out = fluid.layers.batch_norm(pool_out, is_test=True) + self.fetch_list = [pool_out] + + def check_output(self): + if os.path.exists(self.path + "_opt_cache"): + shutil.rmtree(self.path + "_opt_cache") + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def run_test(self): + self.build_network() + self.check_output() + + def test(self): + precision_options = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_options = [False, True] + dynamic_shape_profile = InferencePassTest.DynamicShapeParam({ + 'data': [ + self.bs, self.channel, self.depth // 2, self.height // 2, + self.width // 2 + ] + }, { + 'data': + [self.bs, self.channel, self.depth, self.height, self.width] + }, { + 'data': + [self.bs, self.channel, self.depth, self.height, self.width] + }, False) + dynamic_shape_options = [None, dynamic_shape_profile] + + for precision, serialize, dynamic_shape in itertools.product( + precision_options, serialize_options, dynamic_shape_options): + is_dynamic = True if dynamic_shape_options is not None else False + with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'. 
+ format(precision, serialize, is_dynamic)): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.run_test() + + +class TensorRTAvgPool3dTest(TensorRTPool3dTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'avg' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + + +class TensorRTGlobalPool3dTest(TensorRTPool3dTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = True + self.ceil_mode = False + self.exclusive = False + + +class TensorRTCeilPool3dTest(TensorRTPool3dTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = True + self.exclusive = False + + +class TensorRTExclusivePool3dTest(TensorRTPool3dTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = False + self.exclusive = True + + +class TensorRTSamePaddingPool3dTest(InferencePassTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 'SAME' + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + + +class TensorRTValidPaddingPool3dTest(InferencePassTest): + def set_extra_config(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 'VALID' + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + + +class TensorRTAdaptiveAvgPool3DTest(InferencePassTest): + def setUp(self): + self.bs = 1 + self.channel = 3 + self.depth = 8 + self.height = 8 + self.width = 8 + self.enable_trt = True + self.serialize = False + self.precision = AnalysisConfig.Precision.Float32 + self.feeds = { + 'data': np.random.random( + [self.bs, self.channel, self.depth, self.height, + self.width]).astype('float32'), + } + + def build_network(self): + self.trt_parameters = TensorRTPool3dTest.TensorRTParam( + 1 << 30, self.bs, 0, self.precision, self.serialize, False) + + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', + shape=[-1, self.channel, self.depth, self.height, self.width], + dtype='float32') + pool_out = paddle.nn.functional.adaptive_avg_pool3d( + x=data, output_size=[3, 3, 3]) + #out = fluid.layers.batch_norm(pool_out, is_test=True) + self.fetch_list = [pool_out] + + def check_output(self): + if os.path.exists(self.path + "_opt_cache"): + shutil.rmtree(self.path + "_opt_cache") + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def run_test(self): + self.build_network() + self.check_output() + + def test(self): + precision_options = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_options = [False, True] + dynamic_shape_profile = InferencePassTest.DynamicShapeParam({ + 'data': [ + self.bs, self.channel, self.depth // 2, self.height // 2, + self.width // 2 + ] + }, { + 'data': + [self.bs, self.channel, self.depth, self.height, self.width] + }, { + 'data': + [self.bs, self.channel, self.depth, self.height, self.width] + }, False) + dynamic_shape_options = [None, dynamic_shape_profile] + 
+
+
+class TensorRTAdaptiveMaxPool3DTest(InferencePassTest):
+    def setUp(self):
+        self.bs = 1
+        self.channel = 3
+        self.depth = 8
+        self.height = 8
+        self.width = 8
+        self.enable_trt = True
+        self.serialize = False
+        self.precision = AnalysisConfig.Precision.Float32
+        self.feeds = {
+            'data': np.random.random(
+                [self.bs, self.channel, self.depth, self.height,
+                 self.width]).astype('float32'),
+        }
+
+    def build_network(self):
+        self.trt_parameters = TensorRTPool3dTest.TensorRTParam(
+            1 << 30, self.bs, 0, self.precision, self.serialize, False)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name='data',
+                shape=[-1, self.channel, self.depth, self.height, self.width],
+                dtype='float32')
+            pool_out = paddle.nn.functional.adaptive_max_pool3d(
+                x=data, output_size=[3, 3, 3])
+            #out = fluid.layers.batch_norm(pool_out, is_test=True)
+            self.fetch_list = [pool_out]
+
+    def check_output(self):
+        if os.path.exists(self.path + "_opt_cache"):
+            shutil.rmtree(self.path + "_opt_cache")
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+    def run_test(self):
+        self.build_network()
+        self.check_output()
+
+    def test(self):
+        precision_options = [
+            AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half
+        ]
+        serialize_options = [False, True]
+        dynamic_shape_profile = InferencePassTest.DynamicShapeParam({
+            'data': [
+                self.bs, self.channel, self.depth // 2, self.height // 2,
+                self.width // 2
+            ]
+        }, {
+            'data':
+            [self.bs, self.channel, self.depth, self.height, self.width]
+        }, {
+            'data':
+            [self.bs, self.channel, self.depth, self.height, self.width]
+        }, False)
+        dynamic_shape_options = [None, dynamic_shape_profile]
+
+        for precision, serialize, dynamic_shape in itertools.product(
+                precision_options, serialize_options, dynamic_shape_options):
+            is_dynamic = dynamic_shape is not None
+            with self.subTest('Precision: {}, Serialize: {}, Dynamic: {}'.
+                              format(precision, serialize, is_dynamic)):
+                self.precision = precision
+                self.serialize = serialize
+                self.dynamic_shape_params = dynamic_shape
+                self.run_test()
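Both adaptive classes reduce an 8x8x8 volume to 3x3x3. As a reminder of what adaptive average pooling computes, here is a small numpy sketch of the floor/ceil bin rule that adaptive pooling implementations commonly use, shown along a single axis of length 8 with output size 3 (illustration only; Paddle's kernel is the reference):

    import numpy as np

    x = np.arange(8.0)  # one spatial axis of length 8
    out_size = 3
    starts = (np.arange(out_size) * 8) // out_size          # floor -> [0, 2, 5]
    ends = -((-(np.arange(out_size) + 1) * 8) // out_size)  # ceil  -> [3, 6, 8]
    bins = [x[s:e].mean() for s, e in zip(starts, ends)]    # [1.0, 3.5, 6.0]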
+
+
+if __name__ == "__main__":
+    unittest.main()

From db633affe1b04a880935eeb20d405ff3466a0841 Mon Sep 17 00:00:00 2001
From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com>
Date: Mon, 25 Oct 2021 21:13:53 -0500
Subject: [PATCH 04/14] Fix conv2d convert case (#36699)

* fix pool2d convert case

* add pool2d convert test case for trt6
---
 .../tests/unittests/ir/inference/test_trt_convert_pool2d.py  | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
index 05545f0b0e95c..ddb96c37db780 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py
@@ -33,6 +33,10 @@ def is_paddings_valid(self, program_config: ProgramConfig) -> bool:
             for index in range(len(ksize)):
                 if ksize[index] <= paddings[index]:
                     return False
+        ver = paddle_infer.get_trt_compile_version()
+        if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000:
+            if program_config.ops[0].attrs['pooling_type'] == 'avg':
+                return False
         return True
 
     def is_program_valid(self, program_config: ProgramConfig) -> bool:

From eca78a9f1b850bb8264134bf572af84528a55a16 Mon Sep 17 00:00:00 2001
From: xiongkun <807377414@qq.com>
Date: Tue, 26 Oct 2021 10:37:52 +0800
Subject: [PATCH 05/14] Support various length support for SelectedRows in
 GLOO::AllGather (#36637)

* In cpu parallel using gloo, add various length support for SelectedRows

* fix bug

* fix bugs

* fix by code review

* remove timeout
---
 paddle/fluid/framework/fleet/gloo_wrapper.h   | 22 +++++-
 paddle/fluid/imperative/gloo_context.cc       | 73 +++++++------------
 .../fluid/tests/unittests/CMakeLists.txt      |  1 +
 .../fluid/tests/unittests/test_dist_base.py   | 30 +++++++-
 ...graph_sparse_embedding_diff_length_gloo.py | 46 ++++++++++++
 5 files changed, 119 insertions(+), 53 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py

diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h
index f1ec042dbd705..42ae73f9b13f1 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.h
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include
 #ifdef PADDLE_WITH_GLOO
 #include
+#include
 #include
 #include
 #include
@@ -238,10 +239,25 @@
     return ret;
   }
 
-  // TODO(xiongkun03): support all gather array of
+  // NOTE(@xiongkun03): support all gather array of
   // numbers with different length
-  // can use AllgathervOptions, may be work in different
-  // occasion. Need some survey.
+  // if the third argument is an int, plain allgather is used;
+  // if it is a vector, AllgathervOptions is used,
+  // which supports a different length on each rank.
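The rewritten comment above is the heart of this change: the fixed-length overload assumes every rank contributes the same number of elements, while the vector overload below feeds per-rank element counts to `gloo::allgatherv`, which lays the chunks out back to back in rank order. A pure-Python simulation of that layout (this mimics the semantics only; it is not the gloo API):

    # Simulated allgatherv: chunks[i] is what rank i contributes; the output is
    # the concatenation of all chunks, whose lengths may differ per rank.
    def allgatherv_sim(chunks):
        element_nums = [len(c) for c in chunks]          # one count per rank
        output = [v for chunk in chunks for v in chunk]  # rank-ordered layout
        return output, element_nums

    out, nums = allgatherv_sim([[1, 2], [3, 4, 5], [6]])
    assert out == [1, 2, 3, 4, 5, 6] and nums == [2, 3, 1]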
+ template + void AllGatherVector(T* input_ptr, T* output_ptr, + std::vector& element_nums) { // NOLINT + CHECK_EQ(is_initialized_, true); +#ifdef PADDLE_WITH_GLOO + gloo::AllgathervOptions opts(context_); + opts.setInput(input_ptr, element_nums[rank_]); + opts.setOutput(output_ptr, element_nums); + gloo::allgatherv(opts); +#else + LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF"; +#endif + } + template void AllGatherVector(T* input_ptr, T* output_ptr, size_t element_num) { // NOLINT diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 0d93cdf57932f..ef1bf0d158787 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -53,15 +53,13 @@ void GLOOParallelContext::InitWithRingID(int ring_id) { platform::errors::OutOfRange("Still not implement InitWithRingID")); } -#define GLOO_CASE(type, T, gw) \ - case type: { \ - VLOG(4) << "Use the gloo all reduce to sync. SRC:" << src_tensor; \ - std::vector send_vector##T; \ - framework::TensorToVector(src_tensor, &send_vector##T); \ - auto recv_vector##T = gw->AllReduce(send_vector##T); \ - framework::TensorFromVector(recv_vector##T, dst_tensor); \ - VLOG(4) << "DST:" << *dst_tensor; \ - break; \ +#define GLOO_CASE(type, T, gw) \ + case type: { \ + std::vector send_vector##T; \ + framework::TensorToVector(src_tensor, &send_vector##T); \ + auto recv_vector##T = gw->AllReduce(send_vector##T); \ + framework::TensorFromVector(recv_vector##T, dst_tensor); \ + break; \ } void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, @@ -118,7 +116,7 @@ void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, const auto *src_tensor_ptr = src_tensor.data(); \ gw->AllGatherVector(const_cast(src_tensor_ptr), \ reinterpret_cast(dst_tensor_ptr), \ - value_sendcount); \ + element_nums); \ break; \ } @@ -150,48 +148,31 @@ void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, auto *dst_rows_ptr = dst_rows->MutableData(place); const int64_t *src_rows_ptr = src_rows.Data(place); - // VLOG(3) << "Selected Rows of src:" << string::join_strings(dst_rows, ',') - auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); dims[0] = rows_num; auto feature_size = framework::product(dims) / dims[0]; dst_tensor->Resize(dims); - if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks, - [&](size_t row) { return row == cpu_rows_num_ptr[0]; })) { - // During sparse communication, the number of each card is same. - // Because gloo wrapper utility class currently don't support - // broadcast, so we only deal the-same case. - VLOG(3) << "Use the gloo all reduce to sync. 
SRC:" << src_tensor; - // framework::SerializeToStream(VLOG(4), src); - VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); - - gloo_wrapper->AllGatherVector(const_cast(src_rows_ptr), - static_cast(dst_rows_ptr), - rows_num_vector[0]); - - switch (dtype) { - GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float, - gloo_wrapper); - GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double, - gloo_wrapper); - GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper); - GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t, - gloo_wrapper); - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid datatype for allreduce")); - } + + std::vector element_nums = rows_num_vector; + std::for_each(element_nums.begin(), element_nums.end(), + [feature_size](size_t &x) { x = x * feature_size; }); + + auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); + gloo_wrapper->AllGatherVector(const_cast(src_rows_ptr), + static_cast(dst_rows_ptr), + rows_num_vector); + + switch (dtype) { + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t, + gloo_wrapper); + default: { + PADDLE_THROW( + platform::errors::InvalidArgument("Invalid datatype for allreduce")); } - VLOG(3) << "Selected Row DST:" << *dst_tensor; - VLOG(3) << "Selected Rows of DST:" - << string::join_strings(std::vector(*dst_rows), ','); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The number of each card is not the same, gloo only support the-same" - "batch division")); } } diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f9fe024b4b4e6..5b1c02e71abce 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -214,6 +214,7 @@ if (NOT WITH_GLOO) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables_gloo) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height_gloo) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_gloo) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_diff_length_gloo) endif() if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 63985415c51f6..0b8a80f0c837a 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -515,10 +515,28 @@ def _get_data(self, batch, args): return batch elif args.update_method != "local": new_batch = [] - for offset, item in enumerate(batch): - if offset % 2 == args.trainer_id: - new_batch.append(item) - return new_batch + + # NOTE(@xiongkun03) args.diff_batch means batch length is different: + # such as : batch = [2,3,4,5], then the first rank will get [2] and + # the second rank will get [3,4,5]. + # this function is for test sparse_embedding_differ_length + if hasattr(args, "diff_batch") and args.diff_batch: + assert len( + batch) > 2, "in differ_batch mode, len(batch) must > 2." 
+ if paddle.distributed.get_rank() == 0: + new_batch.append(batch[0]) + elif paddle.distributed.get_rank() == 1: + new_batch.extend([_ for _ in batch[1:]]) + else: + raise NotImplementedError( + "Current TestParallelDyGraphRunnerBase don't support world_size > 2" + ) + return new_batch + else: + for offset, item in enumerate(batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch else: return batch @@ -699,6 +717,7 @@ def runtime_main(test_class): parser.add_argument('--use_fleet_api', action='store_true') parser.add_argument('--use_fleet_api_20', action='store_true') parser.add_argument('--use_local_sgd', action='store_true') + parser.add_argument('--diff_batch', action='store_true') parser.add_argument('--ut4grad_allreduce', action='store_true') parser.add_argument( '--hallreduce_inter_nranks', type=int, required=False, default=2) @@ -798,6 +817,7 @@ def setUp(self): self._gloo_mode = False # now, support gloo backend self._pipeline_mode = False self._mp_mode = False + self._diff_batch = False # FIXME(typhoonzero): I added this stupid argument to enable # testing allreduce layers, which users can call layers.allreduce # to accumulate tensors at anywhere. Find a better way to do this @@ -1100,6 +1120,8 @@ def _get_gloo_trainer_cmd(self, model, ep, update_method, trainer_id, #assert self._use_reader_alloc == False, "gloo not support _use_reduce" if self._save_model: tr_cmd += " --save_model" + if self._diff_batch: + tr_cmd += " --diff_batch" self.__use_cuda = False self.__use_xpu = False assert self.__use_cuda == False, "gloo not support use cuda" diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py new file mode 100644 index 0000000000000..1c425a40a9b39 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_diff_length_gloo.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding import TestSparseEmbedding +from parallel_dygraph_sparse_embedding_fp64 import TestSparseEmbeddingFP64 + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphSparseEmdedding_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + self._diff_batch = True + + def test_sparse_embedding(self): + self.check_with_place( + "parallel_dygraph_sparse_embedding.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() From 290ded7a6b73d2ef3f8bc8fa584196924db30b58 Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Tue, 26 Oct 2021 10:44:55 +0800 Subject: [PATCH 06/14] Optimize FasterTokenizer (#36701) * optimize fast tokenizer --- .../operators/string/faster_tokenizer_op.cc | 38 ++++++++++--------- .../operators/string/faster_tokenizer_op.h | 27 +++++++------ 2 files changed, 34 insertions(+), 31 deletions(-) mode change 100755 => 100644 paddle/fluid/operators/string/faster_tokenizer_op.h diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 49457af8f00c8..42047021b408a 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -100,9 +100,14 @@ void BasicTokenizer::Tokenize(const string& text, vector* res) const { // String is converted into wstring failedly. return; } - - std::wstring dest_text; - for (auto ch : unicode_text) { + std::wstring cache_text = L""; + auto PushCacheText = [&]() { + if (cache_text != L"") { + res->emplace_back(cache_text); + cache_text = L""; + } + }; + for (auto& ch : unicode_text) { if (ch == 0 || ch == 0xfffd || IsControl(ch)) { continue; } @@ -110,25 +115,24 @@ void BasicTokenizer::Tokenize(const string& text, vector* res) const { ch = do_lower_case(ch); } if (IsChineseChar(ch) || IsPunctuation(ch)) { - dest_text += ' '; - dest_text += ch; - dest_text += ' '; + PushCacheText(); + res->emplace_back(std::wstring{ch}); } else if (IsWhiteSpace(ch)) { - dest_text += ' '; + PushCacheText(); } else { - dest_text += ch; + cache_text += ch; } } - boost::split(*res, dest_text, boost::is_any_of(kStripChars)); + PushCacheText(); } WordPieceTokenizer::WordPieceTokenizer( - framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, + const framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, const size_t max_input_chars_per_word /* = 100 */) : vocab_(vocab), unk_token_(unk_token), max_input_chars_per_word_(max_input_chars_per_word) { - unk_token_id_ = (*vocab_)[unk_token_]; + unk_token_id_ = vocab_->at(unk_token_); } void WordPieceTokenizer::Tokenize(const wstring& text, @@ -178,7 +182,7 @@ void WordPieceTokenizer::Tokenize(const wstring& text, } } -BertTokenizer::BertTokenizer(framework::Vocab* vocab, +BertTokenizer::BertTokenizer(const framework::Vocab* vocab, bool do_lower_case /* = false */, const wstring& unk_token /* = L"[UNK]" */, const wstring& pad_token /* = L"[PAD]" */, @@ -196,11 +200,11 @@ BertTokenizer::BertTokenizer(framework::Vocab* vocab, vocab_(vocab), basic_tokenizer_(do_lower_case_), word_piece_tokenizer_(vocab_, unk_token) { - unk_token_id_ = (*vocab_)[unk_token_]; - pad_token_id_ = (*vocab_)[pad_token_]; - cls_token_id_ = 
(*vocab_)[cls_token_]; - mask_token_id_ = (*vocab_)[mask_token_]; - sep_token_id_ = (*vocab_)[sep_token_]; + unk_token_id_ = vocab_->at(unk_token_); + pad_token_id_ = vocab_->at(pad_token_); + cls_token_id_ = vocab_->at(cls_token_); + mask_token_id_ = vocab_->at(mask_token_); + sep_token_id_ = vocab_->at(sep_token_); all_special_tokens_ = vector( {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h old mode 100755 new mode 100644 index d9b7fa26a6704..5218b7c2eaa51 --- a/paddle/fluid/operators/string/faster_tokenizer_op.h +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -56,13 +56,13 @@ class BasicTokenizer { class WordPieceTokenizer { public: - explicit WordPieceTokenizer(framework::Vocab* vocab, + explicit WordPieceTokenizer(const framework::Vocab* vocab, const wstring& unk_token = L"[UNK]", const size_t max_input_chars_per_word = 100); void Tokenize(const wstring& text, vector* output) const; private: - framework::Vocab* vocab_; + const framework::Vocab* vocab_; wstring unk_token_{L"[UNK]"}; int64_t unk_token_id_; size_t max_input_chars_per_word_; @@ -70,7 +70,8 @@ class WordPieceTokenizer { class BertTokenizer { public: - explicit BertTokenizer(framework::Vocab* vocab, bool do_lower_case = false, + explicit BertTokenizer(const framework::Vocab* vocab, + bool do_lower_case = false, const wstring& unk_token = L"[UNK]", const wstring& pad_token = L"[PAD]", const wstring& cls_token = L"[CLS]", @@ -106,7 +107,7 @@ class BertTokenizer { bool do_lower_case_; wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; string padding_site_; - framework::Vocab* vocab_; + const framework::Vocab* vocab_; BasicTokenizer basic_tokenizer_; WordPieceTokenizer word_piece_tokenizer_; int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, @@ -140,21 +141,20 @@ class FasterTokenizerKernel : public framework::OpKernel { return; } - BertTokenizer* tokenizer_ptr = - new BertTokenizer(const_cast(vocab), do_lower_case); + BertTokenizer tokenizer(vocab, do_lower_case); size_t batch_max_seq_len = 0; size_t batch_size = text->size(); vector>> batch_encode_inputs( batch_size); if (text_pair) { - tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, *text_pair, - is_split_into_words, max_seq_len, - pad_to_max_seq_len); + tokenizer.BatchEncode(&batch_encode_inputs, *text, *text_pair, + is_split_into_words, max_seq_len, + pad_to_max_seq_len); } else { - tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, vector(), - is_split_into_words, max_seq_len, - pad_to_max_seq_len); + tokenizer.BatchEncode(&batch_encode_inputs, *text, vector(), + is_split_into_words, max_seq_len, + pad_to_max_seq_len); } for (size_t i = 0; i < batch_size; ++i) { @@ -173,7 +173,7 @@ class FasterTokenizerKernel : public framework::OpKernel { static_cast(batch_max_seq_len)})); auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); - auto pad_token_id = tokenizer_ptr->GetPadTokenID(); + auto pad_token_id = tokenizer.GetPadTokenID(); for (size_t i = 0; i < batch_size; i++) { auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; @@ -188,7 +188,6 @@ class FasterTokenizerKernel : public framework::OpKernel { std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, pad_token_id, (batch_max_seq_len - seq_len) * sizeof(T)); } - delete tokenizer_ptr; } }; From 93c591e200be77887b69488f600c84d3dfabeb0b Mon Sep 
17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Tue, 26 Oct 2021 10:45:09 +0800 Subject: [PATCH 07/14] [Paddle-Inference]Add MatmulV2ToMatmul convert Pass, fix (matmul_v2, matmul, mul) convert pass, fix (matmul, mul) op_teller (#36652) * new_Matmul2ToMatmulToMul * new_Matmul2ToMatmulToMul * fix paddle_pass_builder * fix paddle_pass_builder * fix paddle_pass_builder * tem * tem * Add MatmulV2ToMatmul convert Pass; MatmulV2ToMul convert Pass * Add MatmulV2ToMatmul convert Pass; MatmulV2ToMul convert Pass * add matmul_broadcast_unitest * fix op_teller --- .../ir/delete_quant_dequant_filter_op_pass.cc | 5 +- .../framework/ir/graph_pattern_detector.cc | 51 ++-- .../framework/ir/graph_pattern_detector.h | 23 +- .../framework/ir/map_matmul_to_mul_pass.cc | 221 ++++++++++++++---- .../framework/ir/map_matmul_to_mul_pass.h | 18 +- .../ir/multihead_matmul_fuse_pass.cc | 19 +- .../inference/api/paddle_pass_builder.cc | 23 +- paddle/fluid/inference/tensorrt/op_teller.cc | 66 +++++- .../analyzer_seq_pool1_fuse_statis_tester.cc | 4 +- .../inference/tests/infer_ut/test_LeViT.cc | 6 +- .../unittests/ir/inference/test_trt_matmul.py | 38 +++ 11 files changed, 388 insertions(+), 86 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index b9cc337df8792..2fc133edb7a96 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -181,7 +181,7 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { "Weight scale should be nonzero, but get zero.")); weight_scale[i] = weight_scale[i] / range; } - } else { + } else if (dequant_type == "fake_quantize_dequantize_abs_max") { // Implement quantize_dequantize_abs_max quantization algorithm float abs_max_weight = 0.; for (int j = 0; j < weight_tensor->numel(); j++) { @@ -192,6 +192,9 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument( "Weight scale should be nonzero, but get zero")); weight_scale.push_back(abs_max_weight / range); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported quantize_dequantize op type: %s", dequant_type)); } nodes2rm.insert(quant_dequant_op_outscale); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 71b30d854ca24..6830a1f85e02a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1606,6 +1606,7 @@ PDNode *patterns::Matmul::operator()() { ->assert_is_op_input("matmul", "X"); auto matmul_in_y = pattern->NewNode(matmul_in_y_repr()) ->AsInput() + ->assert_is_persistable_var() ->assert_is_op_input("matmul", "Y"); auto matmul_out = pattern->NewNode(matmul_out_repr()) ->AsOutput() @@ -1615,23 +1616,45 @@ PDNode *patterns::Matmul::operator()() { return matmul_out; } +// MatmulV2: tensor * weight +PDNode *patterns::MatmulV2Weight::operator()() { + auto matmul_v2_op = + pattern->NewNode(matmul_v2_op_repr())->assert_is_op("matmul_v2"); + + auto matmul_v2_in_x = pattern->NewNode(matmul_v2_in_x_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + auto matmul_v2_in_y = pattern->NewNode(matmul_v2_in_y_repr()) + ->AsInput() + ->assert_is_persistable_var() // Y is weight + ->assert_is_op_input("matmul_v2", "Y"); + auto matmul_v2_out = pattern->NewNode(matmul_v2_out_repr()) + ->AsOutput() + 
->assert_is_op_output("matmul_v2", "Out"); + + matmul_v2_op->LinksFrom({matmul_v2_in_x, matmul_v2_in_y}) + .LinksTo({matmul_v2_out}); + return matmul_v2_out; +} + +// MatmulV2: tensor * tensor or tensor * weight PDNode *patterns::MatmulV2::operator()() { - auto matmul_op = - pattern->NewNode(matmul_op_repr())->assert_is_op("matmul_v2"); + auto matmul_v2_op = + pattern->NewNode(matmul_v2_op_repr())->assert_is_op("matmul_v2"); - auto matmul_in_x = pattern->NewNode(matmul_in_x_repr()) - ->AsInput() - ->assert_is_op_input("matmul_v2", "X"); - auto matmul_in_y = pattern->NewNode(matmul_in_y_repr()) - ->assert_is_persistable_var() - ->AsInput() - ->assert_is_op_input("matmul_v2", "Y"); - auto matmul_out = pattern->NewNode(matmul_out_repr()) - ->AsOutput() - ->assert_is_op_output("matmul_v2", "Out"); + auto matmul_v2_in_x = pattern->NewNode(matmul_v2_in_x_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + auto matmul_v2_in_y = pattern->NewNode(matmul_v2_in_y_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto matmul_v2_out = pattern->NewNode(matmul_v2_out_repr()) + ->AsOutput() + ->assert_is_op_output("matmul_v2", "Out"); - matmul_op->LinksFrom({matmul_in_x, matmul_in_y}).LinksTo({matmul_out}); - return matmul_out; + matmul_v2_op->LinksFrom({matmul_v2_in_x, matmul_v2_in_y}) + .LinksTo({matmul_v2_out}); + return matmul_v2_out; } PDNode *patterns::Squeeze2Matmul::operator()() { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index cc9d1c76ab11b..6657ab5a6a576 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -976,17 +976,28 @@ struct Matmul : public PatternBase { PATTERN_DECL_NODE(matmul_out); }; -// Matmul_v2 op -// Forward pass for matmul_v2. 
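The new split between a weight-only pattern and a general one (below) exists because only the weight case can be lowered to `mul`: the rewrite is value-preserving exactly when `Y` is a rank-2 parameter, neither operand is transposed, and `x_num_col_dims` is set to `rank(X) - 1`, which is what the pass does. A small numpy check of that equivalence (illustration only, not Paddle IR code):

    import numpy as np

    x = np.random.rand(2, 3, 4)  # activation, rank >= 2
    w = np.random.rand(4, 5)     # persistable weight, rank == 2
    k = x.ndim - 1               # the x_num_col_dims chosen by the pass

    # mul flattens X to 2-D as [prod(shape[:k]), prod(shape[k:])], multiplies,
    # then the leading dimensions are restored.
    x2d = x.reshape(int(np.prod(x.shape[:k])), x.shape[k])
    out = (x2d @ w).reshape(*x.shape[:k], w.shape[1])

    assert np.allclose(out, x @ w)  # same value as the original matmul_v2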
+// MatmulV2: tensor * weight +struct MatmulV2Weight : public PatternBase { + MatmulV2Weight(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "matmul_v2_weight") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(matmul_v2_in_x); + PATTERN_DECL_NODE(matmul_v2_in_y); + PATTERN_DECL_NODE(matmul_v2_op); + PATTERN_DECL_NODE(matmul_v2_out); +}; + +// MatmulV2: tensor * tensor or tensor * weight struct MatmulV2 : public PatternBase { MatmulV2(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "matmul_v2") {} PDNode* operator()(); - PATTERN_DECL_NODE(matmul_in_x); - PATTERN_DECL_NODE(matmul_in_y); - PATTERN_DECL_NODE(matmul_op); - PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(matmul_v2_in_x); + PATTERN_DECL_NODE(matmul_v2_in_y); + PATTERN_DECL_NODE(matmul_v2_op); + PATTERN_DECL_NODE(matmul_v2_out); }; // Squeeze2 + Matmul diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index cdec49260f90c..865b556f301c0 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -68,7 +68,7 @@ MapMatmul2MulPass::MapMatmul2MulPass() { .End(); } -MapMatmulv2ToMulPass::MapMatmulv2ToMulPass() { +MapMatmulV2ToMulPass::MapMatmulV2ToMulPass() { AddOpCompat(OpCompat("matmul_v2")) .AddInput("X") .IsTensor() @@ -104,6 +104,45 @@ MapMatmulv2ToMulPass::MapMatmulv2ToMulPass() { .End(); } +MapMatmulV2ToMatmulPass::MapMatmulV2ToMatmulPass() { + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumEQ(1.0f) + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); +} + Flatten2MatmulFusePass::Flatten2MatmulFusePass() { AddOpCompat(OpCompat("matmul")) .AddInput("X") @@ -246,15 +285,11 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; - - std::vector& next_ops = matmul_out->outputs; - flag = flag && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; + flag = flag && x_rank >= 2 && y_rank == 2; if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "MapMatmul2MulPass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -268,6 +303,8 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(matmul_in_x, mul_node); @@ -287,66 +324,72 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_count); } -void MapMatmulv2ToMulPass::ApplyImpl(ir::Graph* graph) const { +void MapMatmulV2ToMulPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, 
platform::errors::InvalidArgument("Graph cannot be nullptr.")); std::string name_scope = "map_matmul_v2_to_mul_pass"; FusePassBase::Init(name_scope, graph); GraphPatternDetector gpd; - patterns::MatmulV2 matmul_pattern(gpd.mutable_pattern(), name_scope); - matmul_pattern(); + patterns::MatmulV2Weight matmul_v2_weight_pattern(gpd.mutable_pattern(), + name_scope); + matmul_v2_weight_pattern(); int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "map matmul_v2 to mul"; - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern); - bool flag = true; + VLOG(3) << "map matmul_v2 to mul"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_x, matmul_v2_in_x, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_y, matmul_v2_in_y, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_out, matmul_v2_out, + matmul_v2_weight_pattern); - bool trans_x = BOOST_GET_CONST(bool, matmul_op->Op()->GetAttr("trans_x")); - bool trans_y = BOOST_GET_CONST(bool, matmul_op->Op()->GetAttr("trans_y")); + bool flag = true; + bool trans_x = + BOOST_GET_CONST(bool, matmul_v2_op->Op()->GetAttr("trans_x")); + bool trans_y = + BOOST_GET_CONST(bool, matmul_v2_op->Op()->GetAttr("trans_y")); flag = flag && !trans_x && !trans_y; - std::vector x_shape = matmul_in_x->Var()->GetShape(); - std::vector y_shape = matmul_in_y->Var()->GetShape(); + std::vector x_shape = matmul_v2_in_x->Var()->GetShape(); + std::vector y_shape = matmul_v2_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; - - std::vector& next_ops = matmul_out->outputs; - flag = flag && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; + flag = flag && x_rank >= 2 && y_rank == 2; if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "MapMatmulV2ToMulPass in op compat failed."; return; } - OpDesc desc(matmul_op->Op()->Block()); + OpDesc desc(matmul_v2_op->Op()->Block()); desc.SetType("mul"); - desc.SetInput("X", {matmul_in_x->Name()}); - desc.SetInput("Y", {matmul_in_y->Name()}); - desc.SetOutput("Out", {matmul_out->Name()}); + desc.SetInput("X", {matmul_v2_in_x->Name()}); + desc.SetInput("Y", {matmul_v2_in_y->Name()}); + desc.SetOutput("Out", {matmul_v2_out->Name()}); desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); desc.SetAttr("y_num_col_dims", 1); - if (matmul_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); - desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + if (matmul_v2_op->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); + desc.SetAttr("X_scale", matmul_v2_op->Op()->GetAttr("X_scale")); + desc.SetAttr("weight_scale", + matmul_v2_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_v2_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(matmul_in_x, mul_node); - IR_NODE_LINK_TO(matmul_in_y, mul_node); - IR_NODE_LINK_TO(mul_node, matmul_out); - 
GraphSafeRemoveNodes(graph, {matmul_op}); + IR_NODE_LINK_TO(matmul_v2_in_x, mul_node); + IR_NODE_LINK_TO(matmul_v2_in_y, mul_node); + IR_NODE_LINK_TO(mul_node, matmul_v2_out); + GraphSafeRemoveNodes(graph, {matmul_v2_op}); ++found_count; if (!IsCompat(desc)) { - LOG(WARNING) << "MapMatmulv2ToMulPass in out mul op compat failed."; + LOG(WARNING) << "MapMatmulV2ToMulPass in out mul op compat failed."; return; } } @@ -356,6 +399,82 @@ void MapMatmulv2ToMulPass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_count); } +void MapMatmulV2ToMatmulPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + std::string name_scope = "map_matmul_v2_to_matmul_pass"; + FusePassBase::Init(name_scope, graph); + + GraphPatternDetector gpd; + patterns::MatmulV2 matmul_v2_pattern(gpd.mutable_pattern(), name_scope); + matmul_v2_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "map matmul_v2 to matmul"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_x, matmul_v2_in_x, + matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_y, matmul_v2_in_y, + matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op, matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_out, matmul_v2_out, matmul_v2_pattern); + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "MapMatmulV2ToMatmulPass in op compat failed."; + return; + } + + std::vector x_shape = matmul_v2_in_x->Var()->GetShape(); + std::vector y_shape = matmul_v2_in_y->Var()->GetShape(); + if (x_shape.size() != y_shape.size()) { + LOG(WARNING) + << "matmul op not support broadcast, please check inputs'shape. "; + return; + } + uint64_t dims = 2; + for (size_t i = 0; i < x_shape.size() - dims; ++i) { + if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { + LOG(WARNING) << "matmul op not support broadcast, please check " + "inputs'shape[i]. 
"; + return; + } + } + + OpDesc desc(matmul_v2_op->Op()->Block()); + desc.SetType("matmul"); + desc.SetInput("X", {matmul_v2_in_x->Name()}); + desc.SetInput("Y", {matmul_v2_in_y->Name()}); + desc.SetOutput("Out", {matmul_v2_out->Name()}); + desc.SetAttr("transpose_X", matmul_v2_op->Op()->GetAttr("trans_x")); + desc.SetAttr("transpose_Y", matmul_v2_op->Op()->GetAttr("trans_y")); + desc.SetAttr("alpha", 1.0f); + if (matmul_v2_op->Op()->HasAttr("use_mkldnn")) { + desc.SetAttr("use_mkldnn", matmul_v2_op->Op()->GetAttr("use_mkldnn")); + } + if (matmul_v2_op->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); + desc.SetAttr("X_scale", matmul_v2_op->Op()->GetAttr("X_scale")); + desc.SetAttr("weight_scale", matmul_v2_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_v2_op->Op()->GetAttr("out_threshold")); + } + auto matmul_node = g->CreateOpNode(&desc); + IR_NODE_LINK_TO(matmul_v2_in_x, matmul_node); + IR_NODE_LINK_TO(matmul_v2_in_y, matmul_node); + IR_NODE_LINK_TO(matmul_node, matmul_v2_out); + GraphSafeRemoveNodes(graph, {matmul_v2_op}); + ++found_count; + + if (!IsCompat(desc)) { + LOG(WARNING) << "MapMatmulV2ToMatmulPass in out matmul op compat failed."; + return; + } + }; + + gpd(graph, handler); + AddStatis(found_count); +} + void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -402,7 +521,7 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Squeeze2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -416,6 +535,8 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(squeeze2_in_x, mul_node); @@ -544,7 +665,7 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Reshape2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -558,9 +679,11 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } if (!IsCompat(desc)) { - LOG(WARNING) << "reshape2 matmul pass in out mul op compat failed."; + LOG(WARNING) << "Reshape2MatmulFusePass in out mul op compat failed."; return; } auto mul_node = g->CreateOpNode(&desc); @@ -629,7 +752,7 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (pattern_found) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Flatten2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -643,6 +766,8 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); 
desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(flatten2_in_x, mul_node); @@ -674,13 +799,21 @@ REGISTER_PASS_CAPABILITY(map_matmul_to_mul_pass) .EQ("mul", 0)); REGISTER_PASS(map_matmul_v2_to_mul_pass, - paddle::framework::ir::MapMatmulv2ToMulPass); + paddle::framework::ir::MapMatmulV2ToMulPass); REGISTER_PASS_CAPABILITY(map_matmul_v2_to_mul_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("matmul_v2", 0) .EQ("mul", 0)); +REGISTER_PASS(map_matmul_v2_to_matmul_pass, + paddle::framework::ir::MapMatmulV2ToMatmulPass); +REGISTER_PASS_CAPABILITY(map_matmul_v2_to_matmul_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul_v2", 0) + .LE("matmul", 1)); + REGISTER_PASS(squeeze2_matmul_fuse_pass, paddle::framework::ir::Squeeze2MatmulFusePass); REGISTER_PASS_CAPABILITY(squeeze2_matmul_fuse_pass) diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h index 8f462810fce51..a924cd8ddf92c 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h @@ -49,10 +49,22 @@ class MapMatmul2MulPass : public FusePassBase { /* * Map matmul_v2 to mul, the same as MapMatmul2MulPass. */ -class MapMatmulv2ToMulPass : public FusePassBase { +class MapMatmulV2ToMulPass : public FusePassBase { public: - MapMatmulv2ToMulPass(); - virtual ~MapMatmulv2ToMulPass() {} + MapMatmulV2ToMulPass(); + virtual ~MapMatmulV2ToMulPass() {} + + protected: + void ApplyImpl(Graph* graph) const override; +}; + +/* + * Map matmul_v2 to matmul, not supoort broadcast. 
+ */ +class MapMatmulV2ToMatmulPass : public FusePassBase { + public: + MapMatmulV2ToMatmulPass(); + virtual ~MapMatmulV2ToMatmulPass() {} protected: void ApplyImpl(Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 4c0b28fd42266..8bbe6a12d8abc 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -461,7 +461,7 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops, "X"); auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); @@ -1174,6 +1174,23 @@ MultiHeadMatmulV3FusePass::MultiHeadMatmulV3FusePass() { .IsType() .End(); + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsBoolEQ(false) + .End() + .AddAttr("trans_y") // QK(true) QKV(false) + .IsType() + .End(); + AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 9eccf0a614275..8a54b04f4d802 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -93,8 +93,9 @@ const std::vector kTRTSubgraphPasses({ "squeeze2_matmul_fuse_pass", // "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // - "map_matmul_to_mul_pass", // "map_matmul_v2_to_mul_pass", // + "map_matmul_v2_to_matmul_pass", // + "map_matmul_to_mul_pass", // "fc_fuse_pass", // "conv_elementwise_add_fuse_pass", // "add_support_int8_pass", @@ -142,8 +143,9 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "squeeze2_matmul_fuse_pass", // "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // - "map_matmul_to_mul_pass", // "map_matmul_v2_to_mul_pass", // + "map_matmul_v2_to_matmul_pass", // + "map_matmul_to_mul_pass", // "fc_fuse_pass", // "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be @@ -196,15 +198,16 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // "embedding_fc_lstm_fuse_pass", // // TODO(wilber): fix correctness problem. 
// "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "squeeze2_matmul_fuse_pass", // - "reshape2_matmul_fuse_pass", // - "flatten2_matmul_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "squeeze2_matmul_fuse_pass", // + "reshape2_matmul_fuse_pass", // + "flatten2_matmul_fuse_pass", // + "map_matmul_v2_to_mul_pass", // + // "map_matmul_v2_to_matmul_pass", // "map_matmul_to_mul_pass", // - "map_matmul_v2_to_mul_pass", // "fc_fuse_pass", // "repeated_fc_relu_fuse_pass", // "squared_mat_sub_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 7049df4b300f1..93ecde789c215 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -340,6 +340,26 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } + + // not support broadcast + auto* x_var_desc = block->FindVar(desc.Input("X")[0]); + auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); + const auto x_shape = x_var_desc->GetShape(); + const auto y_shape = y_var_desc->GetShape(); + if (x_shape.size() != y_shape.size()) { + VLOG(3) + << "matmul op not support broadcast, please check inputs'shape. "; + return false; + } + uint64_t dims = 2; + for (size_t i = 0; i < x_shape.size() - dims; ++i) { + if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { + VLOG(3) << "matmul op not support broadcast, please check " + "inputs'shape[i]. "; + return false; + } + } + for (auto& param_name : desc.Inputs()) { for (auto& var_name : param_name.second) { auto* var_desc = block->FindVar(var_name); @@ -1330,6 +1350,47 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "fc") { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + + // y'shapes == 2 + auto fc_inputs = desc.Inputs(); + std::string fc_y = ""; + if (fc_inputs.find("Y") != fc_inputs.end()) { + fc_y = "Y"; + } else if (fc_inputs.find("W") != fc_inputs.end()) { + fc_y = "W"; + } else { + VLOG(3) << " input_y(fc_op) must be Y or W "; + return false; + } + + // There is currently no input: Y(weight) more than two dimensions + /* + auto* y_var_desc = block->FindVar(desc.Input(fc_y)[0]); + const auto y_shape = y_var_desc->GetShape(); + if (y_shape.size() != 2) { + VLOG(3) + << " input_y(fc_op)'shapes must be 2, but input_y(fc_op)'shapes = " + << y_shape.size(); + return false; + } + // y_num_col_dims ==1 + if (desc.HasAttr("y_num_col_dims")) { + int y_num_col_dims = + BOOST_GET_CONST(int, desc.GetAttr("y_num_col_dims")); + if (y_num_col_dims != 1) { + VLOG(3) << " fc_op'y_num_col_dims must be 1, but y_num_col_dims = " + << y_num_col_dims; + return false; + } + } + */ int x_num_col_dims = desc.HasAttr("x_num_col_dims") ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) @@ -1337,8 +1398,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) : 1); if (x_num_col_dims < 1) { - VLOG(3) << "converter expects x_num_col_dims >= 1, " - "but x_num_col_dims = %d."; + VLOG(3) << "fc_op expects x_num_col_dims >= 1, " + "but x_num_col_dims = " + << x_num_col_dims; return false; } } diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc index b8ccb8cee507b..d33b11c389a09 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc @@ -36,10 +36,10 @@ TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) { ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse")); ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); - EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2); + EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 0); EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2); LOG(INFO) << "num_ops: " << num_ops; - EXPECT_EQ(num_ops, 171); + EXPECT_EQ(num_ops, 185); } } // namespace seq_pool1_tester diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index 2fe9b6c14446f..b74d1189b804b 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -77,7 +77,7 @@ TEST(tensorrt_tester_LeViT, trt_fp32_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, @@ -103,7 +103,7 @@ TEST(tensorrt_tester_LeViT, serial_diff_batch_trt_fp32) { config.SetModel(FLAGS_modeldir + "/inference.pdmodel", FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); - config.EnableTensorRtEngine(1 << 20, max_batch_size, 6, + config.EnableTensorRtEngine(1 << 20, max_batch_size, 50, paddle_infer::PrecisionType::kFloat32, false, false); paddle_infer::services::PredictorPool pred_pool(config, 1); @@ -145,7 +145,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py index 080d1ccc9054b..99e99a8387784 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -107,5 +107,43 @@ def set_params(self): self.alpha = 2.0 +class TensorRTMatMulBroadcastTest(InferencePassTest): + def setUp(self): + self.set_params() + place = fluid.CPUPlace() + with fluid.program_guard(self.main_program, self.startup_program): + data_x = fluid.data( + name="data_x", shape=[-1, 6, 24], dtype="float32") + data_y = 
fluid.data(name="data_y", shape=[24, 16], dtype="float32") + matmul_out = fluid.layers.matmul( + x=data_x, + y=data_y, + transpose_x=self.transpose_x, + transpose_y=self.transpose_y, + alpha=self.alpha) + out = fluid.layers.batch_norm(matmul_out, is_test=True) + + self.feeds = { + "data_x": np.ones([2, 6, 24]).astype("float32"), + "data_y": np.ones([24, 16]).astype("float32") + } + self.enable_trt = True + self.trt_parameters = TensorRTMatMulBroadcastTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.transpose_x = False + self.transpose_y = False + self.alpha = 1.0 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + if __name__ == "__main__": unittest.main() From 21bece3f6c3aa19c0622c2f9bbd59fbe510c9320 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 26 Oct 2021 10:47:25 +0800 Subject: [PATCH 08/14] enable flags_benchmark for dygraph (#36686) --- paddle/fluid/imperative/prepared_operator.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8f45cd0fa6ea1..c31464bf20acc 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/xpu/xpu_op_list.h" #endif DECLARE_bool(check_nan_inf); +DECLARE_bool(benchmark); namespace paddle { namespace imperative { @@ -208,6 +209,19 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? ] * From 81e0c1baa1019f01b3166ad5198fcc4a111bc369 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Tue, 26 Oct 2021 11:00:12 +0800 Subject: [PATCH 09/14] move fft and signal files, move signal APIs (#36540) * move signal apis * move fft.py and signal.py to paddle/, fix typos * fix relative imports from fft.py and signal.py * fix typos --- python/paddle/__init__.py | 1 + python/paddle/fft.py | 1633 ++++++++++++++++- .../fluid/tests/unittests/test_signal.py | 20 +- python/paddle/{tensor => }/signal.py | 26 +- python/paddle/tensor/__init__.py | 2 - python/paddle/tensor/fft.py | 1601 ---------------- 6 files changed, 1621 insertions(+), 1662 deletions(-) rename python/paddle/{tensor => }/signal.py (97%) delete mode 100644 python/paddle/tensor/fft.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 471f6f395351e..29548a64f3dad 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -296,6 +296,7 @@ from . import hub # noqa: F401 from . import linalg # noqa: F401 from . import fft # noqa: F401 +from . 
import signal  # noqa: F401
 import paddle.text  # noqa: F401
 import paddle.vision  # noqa: F401
diff --git a/python/paddle/fft.py b/python/paddle/fft.py
index 3ac02c9c8dc18..de15eba0feffa 100644
--- a/python/paddle/fft.py
+++ b/python/paddle/fft.py
@@ -12,50 +12,1613 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .tensor.fft import fft  # noqa: F401
-from .tensor.fft import fft2  # noqa: F401
-from .tensor.fft import fftn  # noqa: F401
-from .tensor.fft import ifft  # noqa: F401
-from .tensor.fft import ifft2  # noqa: F401
-from .tensor.fft import ifftn  # noqa: F401
-from .tensor.fft import rfft  # noqa: F401
-from .tensor.fft import rfft2  # noqa: F401
-from .tensor.fft import rfftn  # noqa: F401
-from .tensor.fft import irfft  # noqa: F401
-from .tensor.fft import irfft2  # noqa: F401
-from .tensor.fft import irfftn  # noqa: F401
-from .tensor.fft import hfft  # noqa: F401
-from .tensor.fft import hfft2  # noqa: F401
-from .tensor.fft import hfftn  # noqa: F401
-from .tensor.fft import ihfft  # noqa: F401
-from .tensor.fft import ihfft2  # noqa: F401
-from .tensor.fft import ihfftn  # noqa: F401
-from .tensor.fft import fftfreq  # noqa: F401
-from .tensor.fft import rfftfreq  # noqa: F401
-from .tensor.fft import fftshift  # noqa: F401
-from .tensor.fft import ifftshift  # noqa: F401
-
-__all__ = [  # noqa
+from typing import Sequence
+import numpy as np
+import paddle
+from .tensor.attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype
+from .fluid.framework import in_dygraph_mode
+from . import _C_ops
+from .fluid.data_feeder import check_variable_and_dtype
+from .fluid.layer_helper import LayerHelper
+
+__all__ = [
     'fft',
-    'fft2',
-    'fftn',
     'ifft',
-    'ifft2',
-    'ifftn',
     'rfft',
-    'rfft2',
-    'rfftn',
     'irfft',
-    'irfft2',
-    'irfftn',
     'hfft',
-    'hfft2',
-    'hfftn',
     'ihfft',
+    'fft2',
+    'ifft2',
+    'rfft2',
+    'irfft2',
+    'hfft2',
     'ihfft2',
+    'fftn',
+    'ifftn',
+    'rfftn',
+    'irfftn',
+    'hfftn',
     'ihfftn',
     'fftfreq',
     'rfftfreq',
     'fftshift',
-    'ifftshift'
+    'ifftshift',
 ]
+
+
+def _check_normalization(norm):
+    if norm not in ['forward', 'backward', 'ortho']:
+        raise ValueError(
+            "Unexpected norm: {}. Norm should be forward, backward or ortho".
+            format(norm))
+
+
+def _check_fft_n(n):
+    if not isinstance(n, int):
+        raise ValueError(
+            "Invalid FFT argument n({}), it should be an integer.".format(n))
+    if n <= 0:
+        raise ValueError(
+            "Invalid FFT argument n({}), it should be positive.".format(n))
+
+
+def _check_fft_shape(x, s):
+    ndim = x.ndim
+    if not isinstance(s, Sequence):
+        raise ValueError(
+            "Invalid FFT argument s({}), it should be a sequence of integers.".
+            format(s))
+
+    if len(s) > ndim:
+        raise ValueError(
+            "Length of FFT argument s should not be larger than the rank of input. "
+            "Received s: {}, rank of x: {}".format(s, ndim))
+    for size in s:
+        if not isinstance(size, int) or size <= 0:
+            raise ValueError("FFT sizes {} contains invalid value ({})".format(
+                s, size))
+
+
+def _check_fft_axis(x, axis):
+    ndim = x.ndim
+    if not isinstance(axis, int):
+        raise ValueError(
+            "Invalid FFT axis ({}), it should be an integer.".format(axis))
+    if axis < -ndim or axis >= ndim:
+        raise ValueError(
+            "Invalid FFT axis ({}), it should be in range [-{}, {})".format(
+                axis, ndim, ndim))
+
+
+def _check_fft_axes(x, axes):
+    ndim = x.ndim
+    if not isinstance(axes, Sequence):
+        raise ValueError(
+            "Invalid FFT axes ({}), it should be a sequence of integers.".
+ format(axes)) + if len(axes) > ndim: + raise ValueError( + "Length of fft axes should not be larger than the rank of input. " + "Received len of axes: {}, rank of x: {}".format(len(axes), ndim)) + for axis in axes: + if not isinstance(axis, int) or axis < -ndim or axis >= ndim: + raise ValueError( + "FFT axes {} contains an invalid value ({}), it should be in range [-{}, {})". + format(axes, axis, ndim, ndim)) + + +def _resize_fft_input(x, s, axes): + if len(s) != len(axes): + raise ValueError("length of `s` should equal length of `axes`.") + shape = x.shape + ndim = x.ndim + + axes_to_pad = [] + paddings = [] + axes_to_slice = [] + slices = [] + for i, axis in enumerate(axes): + if shape[axis] < s[i]: + axes_to_pad.append(axis) + paddings.append(s[i] - shape[axis]) + elif shape[axis] > s[i]: + axes_to_slice.append(axis) + slices.append((0, s[i])) + + if axes_to_slice: + x = paddle.slice( + x, + axes_to_slice, + starts=[item[0] for item in slices], + ends=[item[1] for item in slices]) + if axes_to_pad: + padding_widths = [0] * (2 * ndim) + for axis, pad in zip(axes_to_pad, paddings): + padding_widths[2 * axis + 1] = pad + x = paddle.nn.functional.pad(x, padding_widths) + return x + + +def _normalize_axes(x, axes): + ndim = x.ndim + return [item if item >= 0 else (item + ndim) for item in axes] + + +def _check_at_least_ndim(x, rank): + if x.ndim < rank: + raise ValueError("The rank of the input ({}) should be >= {}".format( + x.ndim, rank)) + + +# public APIs 1d +def fft(x, n=None, axis=-1, norm="backward", name=None): + """ + Calculate the one-dimensional discrete Fourier transform. + + This function uses the efficient fast Fourier transform (FFT) algorithm to + calculate the 1-D *n*-point discrete Fourier transform (DFT). + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex tensor. + n (int, optional): The length of the output transform axis. If `n` is less than + the length of the input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified + by `axis` is used. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward transform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axis indicated + by `axis`, or the last one if `axis` is not specified. + + Examples: + + ..
code-block:: python + + import numpy as np + import paddle + + x = np.exp(3j * np.pi * np.arange(7) / 7) + xp = paddle.to_tensor(x) + fft_xp = paddle.fft.fft(xp).numpy() + print(fft_xp) + # [1.+1.25396034e+00j 1.+4.38128627e+00j 1.-4.38128627e+00j + # 1.-1.25396034e+00j 1.-4.81574619e-01j 1.+8.88178420e-16j + # 1.+4.81574619e-01j] + + + """ + if is_interger(x) or is_floating_point(x): + return fft_r2c( + x, n, axis, norm, forward=True, onesided=False, name=name) + else: + return fft_c2c(x, n, axis, norm, forward=True, name=name) + + +def ifft(x, n=None, axis=-1, norm="backward", name=None): + """ + Compute the 1-D inverse discrete Fourier Transform. + + This function computes the inverse of the 1-D *n*-point discrete Fourier transform + computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. + + The input should be ordered in the same way as is returned by `fft`, + i.e., + + * ``x[0]`` should contain the zero frequency term, + * ``x[1:n//2]`` should contain the positive-frequency terms, + * ``x[n//2 + 1:]`` should contain the negative-frequency terms, in + increasing order starting from the most negative frequency. + + For an even number of input points, ``x[n//2]`` represents the sum of + the values at the positive and negative Nyquist frequencies, as the two + are aliased together. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified + by `axis` is used. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axis indicated + by `axis`, or the last one if `axis` is not specified. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.exp(3j * np.pi * np.arange(7) / 7) + xp = paddle.to_tensor(x) + ifft_xp = paddle.fft.ifft(xp).numpy() + print(ifft_xp) + # [0.14285714+1.79137191e-01j 0.14285714+6.87963741e-02j + # 0.14285714+1.26882631e-16j 0.14285714-6.87963741e-02j + # 0.14285714-1.79137191e-01j 0.14285714-6.25898038e-01j + # 0.14285714+6.25898038e-01j] + + """ + if is_interger(x) or is_floating_point(x): + return fft_r2c( + x, n, axis, norm, forward=False, onesided=False, name=name) + else: + return fft_c2c(x, n, axis, norm, forward=False, name=name) + + +def rfft(x, n=None, axis=-1, norm="backward", name=None): + """ + The one dimensional FFT for real input. + + This function computes the one dimensional *n*-point discrete Fourier + Transform (DFT) of a real-valued tensor by means of an efficient algorithm + called the Fast Fourier Transform (FFT). 
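+ + A minimal sketch of the one-sided output described below (illustrative only; the printed values are the mathematical DFT results): + + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + # full spectrum: [10+0j, -2+2j, -2+0j, -2-2j]; the last entry is the + # conjugate of the second, so it carries no extra information + print(paddle.fft.fft(x).numpy()) + # one-sided spectrum of length n//2 + 1 = 3: [10+0j, -2+2j, -2+0j] + print(paddle.fft.rfft(x).numpy())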
+ + When the DFT is computed for purely real input, the output is + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore + ``n//2 + 1``. + + Args: + x(Tensor) : Real-valued input tensor + n(int, optional): Number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis + specified by `axis` is used. + axis(int, optional): Axis over which to compute the FFT. Default value + is last axis. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor + + Raises: + + + Examples: + .. code-block:: python + import paddle + + x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) + print(paddle.fft.rfft(x)) + # Tensor(shape=[3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [ (1+0j), -1j , (-1+0j)]) + """ + return fft_r2c(x, n, axis, norm, forward=True, onesided=True, name=name) + + +def irfft(x, n=None, axis=-1, norm="backward", name=None): + """ + Computes the inverse of `rfft`. + + This function calculates the inverse of the one-dimensional *n* point discrete + Fourier transform of the actual input calculated by "rfft". In other words, + ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. + + The input shall be in the form of "rfft", i.e. the actual zero frequency term, + followed by the complex positive frequency term, in the order of increasing frequency. + Because the discrete Fourier transform of the actual input is Hermite symmetric, + the negative frequency term is regarded as the complex conjugate term of the corresponding + positive frequency term. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. For `n` output + points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + along the ` axis'. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` + in some cases. 
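+ + For instance (an illustrative round trip; the shapes follow from the rule above, and the recovered values hold up to floating point error): + + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0, 5.0]) # odd length 5 + spec = paddle.fft.rfft(x) # 5//2 + 1 = 3 points + # without n, the output length defaults to 2*(3-1) = 4 + print(paddle.fft.irfft(spec).shape) + # [4] + # pass n explicitly to recover the original odd length + print(paddle.fft.irfft(spec, n=5).numpy()) + # ~[1. 2. 3. 4. 5.]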
+ + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([1, -1j, -1]) + xp = paddle.to_tensor(x) + irfft_xp = paddle.fft.irfft(xp).numpy() + print(irfft_xp) + # [0. 1. 0. 0.] + + """ + return fft_c2r(x, n, axis, norm, forward=False, name=name) + + +def hfft(x, n=None, axis=-1, norm="backward", name=None): + """ + Compute the FFT of a signal that has Hermitian symmetry, a real + spectrum. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + n (int, optional): The length of the output transform axis. For `n` output + points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + along the ` axis'. + axis (int,optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in + some cases. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([1, -1j, -1]) + xp = paddle.to_tensor(x) + hfft_xp = paddle.fft.hfft(xp).numpy() + print(hfft_xp) + # [0. 0. 0. 4.] + """ + + return fft_c2r(x, n, axis, norm, forward=True, name=name) + + +def ihfft(x, n=None, axis=-1, norm="backward", name=None): + """ + The inverse FFT of a signal that has Hermitian symmetry. + + This function computes the one dimensional *n*-point inverse FFT of a signal + that has Hermitian symmetry by means of an efficient algorithm called + the Fast Fourier Transform (FFT). + + When the DFT is computed for purely real input, the output is + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore + ``n//2 + 1``. + + Args: + x(Tensor): Input tensor. + n(int, optional): The number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis + specified by `axis` is used. + axis(int, optional) : Axis over which to compute the inverse FFT. If not + given, the last axis is used. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor. + + Examples: + .. 
code-block:: python + import paddle + + spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) + print(paddle.fft.ifft(spectrum)) + # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) + print(paddle.fft.ihfft(spectrum)) + # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) + + """ + return fft_r2c(x, n, axis, norm, forward=False, onesided=True, name=name) + + +# public APIs nd +def fftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D discrete Fourier Transform. + + This function calculates the n-D discrete Fourier transform on any number of axes + in the M-D array by fast Fourier transform (FFT). + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:4, :4, :4][1] + xp = paddle.to_tensor(x) + fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() + print(fftn_xp) + # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] + # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] + """ + if is_interger(x) or is_floating_point(x): + return fftn_r2c( + x, s, axes, norm, forward=True, onesided=False, name=name) + else: + return fftn_c2c(x, s, axes, norm, forward=True, name=name) + + +def ifftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D inverse discrete Fourier Transform. + + This function computes the inverse of the N-D discrete + Fourier Transform over any number of axes in an M-D array by + means of the Fast Fourier Transform (FFT). In other words, + ``ifftn(fftn(x)) == x`` to within numerical accuracy. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fftn`, i.e., it should have the term for zero frequency + in all axes in the low-order corner, the positive frequency terms in the + first half of all axes, the term for the Nyquist frequency in the middle + of all axes and the negative frequency terms in the second half of all + axes, in order of decreasingly negative frequency. + + Args: + x (Tensor): The input data. It's a Tensor type. It's a complex. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + This corresponds to ``n`` for ``fft(x, n)``. + Along any axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. + axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` + axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + scaled by ``1/sqrt(n)``. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.eye(3) + xp = paddle.to_tensor(x) + ifftn_xp = paddle.fft.ifftn(xp, axes=(1,)).numpy() + print(ifftn_xp) + + # [[ 0.33333333+0.j 0.33333333+0.j 0.33333333-0.j ] + # [ 0.33333333+0.j -0.16666667+0.28867513j -0.16666667-0.28867513j] + # [ 0.33333333+0.j -0.16666667-0.28867513j -0.16666667+0.28867513j]] + + """ + if is_interger(x) or is_floating_point(x): + return fftn_r2c( + x, s, axes, norm, forward=False, onesided=False, name=name) + else: + return fftn_c2c(x, s, axes, norm, forward=False, name=name) + + +def rfftn(x, s=None, axes=None, norm="backward", name=None): + """ + The N dimensional FFT for real input. + + This function computes the N-dimensional discrete Fourier Transform over + any number of axes in an M-dimensional real array by means of the Fast + Fourier Transform (FFT). By default, all axes are transformed, with the + real transform performed over the last axis, while the remaining + transforms are complex. + + The transform for real input is performed over the last transformation + axis, as by `rfft`, then the transform over the remaining axes is + performed as by `fftn`. The order of the output is as for `rfft` for the + final transformation axis, and as for `fftn` for the remaining + transformation axes. + + Args: + x(Tensor) : Input tensor, taken to be real. + s(Sequence[int]) : Shape to use from the exec fft. The final element of + `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining + axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if + the given shape is smaller than that of the input, the input is + cropped. If it is larger, the input is padded with zeros. if `s` is + not given, the shape of the input along the axes specified by `axes` + is used. + axes(Sequence[int]) : Axes over which to compute the FFT. If not given, + the last ``len(s)`` axes are used, or all axes if `s` is also not + specified. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor): complex tensor + + + Raises: + ValueError: If `s` and `axes` have different length. + + Examples: + .. code-block:: python + import paddle + + # default, all axis will be used to exec fft + x = paddle.ones((2, 3, 4)) + print(paddle.fft.rfftn(x)) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(24+0j), 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) + + # use axes(2, 0) + print(paddle.fft.rfftn(x, axes=(2, 0))) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) + + """ + return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name) + + +def irfftn(x, s=None, axes=None, norm="backward", name=None): + """ + Computes the inverse of `rfftn`. + + This function computes the inverse of the N-D discrete + Fourier Transform for real input over any number of axes in an + M-D array by means of the Fast Fourier Transform (FFT). 
In + other words, ``irfftn(rfftn(x), x.shape) == x`` to within numerical + accuracy. (The ``a.shape`` is necessary like ``len(a)`` is for `irfft`, + and for the same reason.) + + The input should be ordered in the same way as is returned by `rfftn`, + i.e., as for `irfft` for the final transformation axis, and as for `ifftn` + along all the other axes. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): The length of the output transform axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the + number of input points used along this axis, except for the last axis, + where ``s[-1]//2+1`` points of the input are used. Along any axis, if + the shape indicated by `s` is smaller than that of the input, the input + is cropped. If it is larger, the input is padded with zeros. + If `s` is not given, the shape of the input along the axes specified by axes + is used. Except for the last axis which is taken to be ``2*(k-1)`` where + ``k`` is the length of the input along that axis. + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last + `len(s)` axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or by a combination of `s` or `x`, as explained in the parameters section above. The length of + each transformed axis is as given by the corresponding element of `s`, or the length of the input + in every axis except for the last one if `s` is not given. In the final transformed axis the length + of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final + transformed axis of the input. To get an odd number of output points in the final axis, + `s` must be specified. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) + xp = paddle.to_tensor(x) + irfftn_xp = paddle.fft.irfftn(xp).numpy() + print(irfftn_xp) + # [ 2.25 -1.25 0.25 0.75] + + """ + return fftn_c2r(x, s, axes, norm, forward=False, name=name) + + +def hfftn(x, s=None, axes=None, norm="backward", name=None): + """ + Compute the N-D FFT of Hermitian symmetric complex input, i.e., a + signal with a real spectrum. + + This function calculates the n-D discrete Fourier transform of Hermite symmetric + complex input on any axis in M-D array by fast Fourier transform (FFT). + In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. + (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary + for the same reason that ``irfft` requires ``x.shape``.) + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): The length of the output transform axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the + number of input points used along this axis, except for the last axis, + where ``s[-1]//2+1`` points of the input are used. 
Along any axis, if + the shape indicated by `s` is smaller than that of the input, the input + is cropped. If it is larger, the input is padded with zeros. + If `s` is not given, the shape of the input along the axes specified by axes + is used. Except for the last axis which is taken to be ``2*(k-1)`` where + ``k`` is the length of the input along that axis. + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last + `len(s)` axes are used, or all axes if `s` is also not specified. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or + a combination of `s` or `X`. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) + xp = paddle.to_tensor(x) + hfftn_xp = paddle.fft.hfftn(xp).numpy() + print(hfftn_xp) + # [ 9. 3. 1. -5.] + + + """ + return fftn_c2r(x, s, axes, norm, forward=True, name=name) + + +def ihfftn(x, s=None, axes=None, norm="backward", name=None): + """ + The n dimensional inverse FFT of a signal that has Hermitian symmetry. + + This function computes the n dimensional inverse FFT over any number of axes + in an M-dimensional of a signal that has Hermitian symmetry by means of an + efficient algorithm called the Fast Fourier Transform (FFT). + + Args: + x(Tensor): Input tensor. + s(Sequence[int], optional) : Shape (length along each transformed axis) + to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis + 1, etc.). Along any axis, if the given shape is smaller than that + of the input, the input is cropped. If it is larger, the input is + padded with zeros. if `s` is not given, the shape of the input + along the axes specified by `axes` is used. + axis(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not + given, the last axis is used. + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : complex tensor. + + Examples: + .. 
code-block:: python + import paddle + + spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) + print(paddle.fft.ifft(spectrum)) + # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) + print(paddle.fft.ihfft(spectrum)) + # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, + # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) + + """ + return fftn_r2c(x, s, axes, norm, forward=False, onesided=True, name=name) + + +# public APIs 2d +def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the 2-D discrete Fourier Transform + + This function computes the N-D discrete Fourier Transform + over any axes in an M-D array by means of the + Fast Fourier Transform (FFT). By default, the transform is computed over + the last two axes of the input array, i.e., a 2-dimensional FFT. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. Default is None. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or the last two axes if `axes` is not given. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:2, :2][1] + xp = paddle.to_tensor(x) + fft2_xp = paddle.fft.fft2(xp).numpy() + print(fft2_xp) + # [[ 2.+0.j -2.+0.j] + # [ 0.+0.j 0.+0.j]] + + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return fftn(x, s, axes, norm, name) + + +def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the 2-D inverse discrete Fourier Transform. + + This function computes the inverse of the 2-D discrete Fourier + Transform over any number of axes in an M-D array by means of + the Fast Fourier Transform (FFT). 
In other words, ``ifft2(fft2(x)) == x`` + to within numerical accuracy. By default, the inverse transform is + computed over the last two axes of the input array. + + The input, analogously to `ifft`, should be ordered in the same way as is + returned by `fft2`, i.e., it should have the term for zero frequency + in the low-order corner of the two axes, the positive frequency terms in + the first half of these axes, the term for the Nyquist frequency in the + middle of the axes and the negative frequency terms in the second half of + both axes, in order of decreasingly negative frequency. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + Along each axis, if the given shape is smaller than that of the input, + the input is cropped. If it is larger, the input is padded with zeros. + if `s` is not given, the shape of the input along the axes specified + by `axes` is used. Default is None. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or the last two axes if `axes` is not given. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:2, :2][1] + xp = paddle.to_tensor(x) + ifft2_xp = paddle.fft.ifft2(xp).numpy() + print(ifft2_xp) + # [[ 0.5+0.j -0.5+0.j] + # [ 0. +0.j 0. +0.j]] + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return ifftn(x, s, axes, norm, name) + + +def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + The two dimensional FFT with real tensor input. + + This is really just `rfftn` with different default behavior. + For more details see `rfftn`. + + Args: + x(Tensor): Input tensor, taken to be real. + s(Sequence[int]) : Shape of the FFT. + axes(Sequence[int], optional): Axes over which to compute the FFT. + norm(str, optional) : {"backward", "ortho", "forward"}, + default is "backward". Indicates which direction of the + forward/backward pair of transforms is scaled and with what + normalization factor. + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor): The result of the real 2-D FFT. + + Raises: + + + Examples: + + .. 
code-block:: python + import paddle + import numpy as np + + x = paddle.to_tensor(np.mgrid[:5, :5][0].astype(np.float32)) + print(paddle.fft.rfft2(x)) + # Tensor(shape=[5, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[ (50+0j) , (1.1920928955078125e-07+0j) , 0j ], + # [(-12.5+17.204774856567383j) , (-9.644234211236835e-08+7.006946134424652e-08j) , 0j ], + # [(-12.500000953674316+4.061495304107666j) , (3.6837697336977726e-08-1.1337477445749755e-07j), 0j ], + # [(-12.500000953674316-4.061495304107666j) , (3.6837697336977726e-08+1.1337477445749755e-07j), 0j ], + # [(-12.5-17.204774856567383j) , (-9.644234211236835e-08-7.006946134424652e-08j) , 0j ]]) + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return rfftn(x, s, axes, norm, name) + + +def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Computes the inverse of `rfft2`. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. + axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes + must be two-dimensional. If not specified, the last two axes are used by default. + norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Real tensor. The result of the inverse real 2-D FFT. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) + xp = paddle.to_tensor(x) + irfft2_xp = paddle.fft.irfft2(xp).numpy() + print(irfft2_xp) + # [[ 2.375 -1.125 0.375 0.875] + # [ 0.125 0.125 0.125 0.125]] + + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return irfftn(x, s, axes, norm, name) + + +def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the 2-D FFT of a Hermitian complex array. + + Args: + x (Tensor): The input data. It's a Tensor type. + s (sequence of ints, optional): Shape of the real output. Default is None. + axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be + two-dimensional. If not specified, the last two axes are used by default. 
+ norm (str): Indicates which direction to scale the `forward` or `backward` transform + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Real tensor. The real result of the 2-D Hermitian complex real FFT. + + Raises: + ValueError: if `s` not be a sequence of 2 integers or None. + ValueError: if `axes` not be a sequence of 2 integers or None. + ValueError: If the input dimension is smaller than 2. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) + xp = paddle.to_tensor(x) + hfft2_xp = paddle.fft.hfft2(xp).numpy() + print(hfft2_xp) + # [[19. 7. 3. -9.] + # [ 1. 1. 1. 1.]] + + + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return hfftn(x, s, axes, norm, name) + + +def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): + """ + Compute the two dimensional inverse FFT of a real spectrum. + + This is really `ihfftn` with different defaults. + For more details see `ihfftn`. + + Args: + x(Tensor): Input tensor + s(Sequence[int], optional): Shape of the real input to the inverse FFT. + axes(Sequance[int], optional): The axes over which to compute the + inverse fft. Default is the last two axes. + norm(str, optional): {"backward", "ortho", "forward"}. Default is + "backward". + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . + + Returns: + out(Tensor) : The result of the inverse hermitian 2-D FFT. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:5, :5][0].astype(np.float64) + xp = paddle.to_tensor(x) + ihfft2_xp = paddle.fft.ihfft2(xp).numpy() + print(ihfft2_xp) + # [[ 2. +0.j 0. +0.j 0. +0.j ] + # [-0.5-0.68819096j 0. +0.j 0. +0.j ] + # [-0.5-0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.68819096j 0. +0.j 0. +0.j ]] + """ + _check_at_least_ndim(x, 2) + if s is not None: + if not isinstance(s, Sequence) or len(s) != 2: + raise ValueError( + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". + format(s)) + if axes is not None: + if not isinstance(axes, Sequence) or len(axes) != 2: + raise ValueError( + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". + format(axes)) + return ihfftn(x, s, axes, norm, name) + + +# public APIs utilities +def fftfreq(n, d=1.0, dtype=None, name=None): + """ + Return the Discrete Fourier Transform sample frequencies. + + The returned float array `f` contains the frequency bin centers in cycles + per unit of the sample spacing (with zero at the start). For instance, if + the sample spacing is in seconds, then the frequency unit is cycles/second. 
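+ + As a quick, illustrative check of the two layouts given below (the printed values follow directly from those formulas): + + .. code-block:: python + + import paddle + + # even n: [0, 1, -2, -1] / (1.0 * 4) + print(paddle.fft.fftfreq(4).numpy()) + # [ 0. , 0.25, -0.5 , -0.25] + # odd n: [0, 1, 2, -2, -1] / (1.0 * 5) + print(paddle.fft.fftfreq(5).numpy()) + # [ 0. , 0.2, 0.4, -0.4, -0.2]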
+ + Given input length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n) if n is odd + + Args: + n (int): Dimension inputed. + d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. A tensor of length 'n' containing the sampling frequency. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + scalar_temp = 0.5 + n = x.size + fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + print(fftfreq_xp) + + # Tensor(shape=[5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [ 0. , 0.40000001, 0.80000001, -0.80000001, -0.40000001]) + """ + + dtype = paddle.framework.get_default_dtype() + val = 1.0 / (n * d) + pos_max = (n + 1) // 2 + neg_max = n // 2 + indices = paddle.arange(-neg_max, pos_max, dtype=dtype, name=name) + indices = paddle.roll(indices, -neg_max, name=name) + return indices * val + + +def rfftfreq(n, d=1.0, dtype=None, name=None): + """ + Return the Discrete Fourier Transform sample frequencies. + + The returned floating-point array "F" contains the center of the frequency unit, + and the unit is the number of cycles of the sampling interval (the starting point is zero). + + Given input length `n` and a sample spacing `d`:: + + f = [0, 1, ..., n/2-1, n/2] / (d*n) if n is even + f = [0, 1, ..., (n-1)/2-1, (n-1)/2] / (d*n) if n is odd + + the Nyquist frequency component is considered to be positive. + + Args: + n (int): Dimension inputed. + d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. A tensor of length ``n//2 + 1`` containing the sample frequencies. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + scalar_temp = 0.3 + n = x.size + rfftfreq_xp = paddle.fft.rfftfreq(n, d=scalar_temp) + print(rfftfreq_xp) + + # Tensor(shape=[3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [0. , 0.66666669, 1.33333337]) + + """ + + dtype = paddle.framework.get_default_dtype() + val = 1.0 / (n * d) + pos_max = 1 + n // 2 + indices = paddle.arange(0, pos_max, dtype=dtype, name=name) + return indices * val + + +def fftshift(x, axes=None, name=None): + """ + Shift the zero-frequency component to the center of the spectrum. + + This function swaps half spaces for all the axes listed (all by default). + Note that ``y[0]`` is the Nyquist component only if ``len(x)`` is even. + + Args: + n (int): Dimension inputed. + axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. + Default is None. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. The shifted tensor. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + n = x.size + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) + res = paddle.fft.fftshift(fftfreq_xp).numpy() + print(res) + # [-1.3333334 -0.6666667 0. 0.6666667 1.3333334] + + """ + shape = paddle.shape(x) + if axes is None: + # shift all axes + rank = paddle.rank(x).reshape([1]) + axes = axes or paddle.arange(0, rank) + shifts = [size // 2 for size in shape] + elif isinstance(axes, int): + shifts = shape[axes] // 2 + else: + shifts = [shape[ax] // 2 for ax in axes] + return paddle.roll(x, shifts, axes, name=name) + + +def ifftshift(x, axes=None, name=None): + """ + The inverse of `fftshift`. Although the even length 'x' is the same, the function of the + odd length 'x' is different. An example. + + Args: + n (int): Dimension inputed. + axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. + Default is None. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. The shifted tensor. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.array([3, 1, 2, 2, 3], dtype=float) + n = x.size + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) + res = paddle.fft.ifftshift(fftfreq_xp).numpy() + print(res) + # [ 1.3333334 -1.3333334 -0.6666667 0. 0.6666667] + + """ + shape = paddle.shape(x) + if axes is None: + # shift all axes + rank = paddle.rank(x).reshape([1]) + axes = axes or paddle.arange(0, rank) + shifts = [-size // 2 for size in shape] + elif isinstance(axes, int): + shifts = -shape[axes] // 2 + else: + shifts = [-shape[ax] // 2 for ax in axes] + return paddle.roll(x, shifts, axes, name=name) + + +# internal functions +def fft_c2c(x, n, axis, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + + axis = axis if axis is not None else -1 + _check_fft_axis(x, axis) + axes = [axis] + axes = _normalize_axes(x, axes) + if n is not None: + _check_fft_n(n) + s = [n] + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2c' + + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fft_r2c(x, n, axis, norm, forward, onesided, name): + if is_interger(x): + x = paddle.cast(x, paddle.get_default_dtype()) + _check_normalization(norm) + axis = axis if axis is not None else -1 + _check_fft_axis(x, axis) + axes = [axis] + axes = _normalize_axes(x, axes) + if n is not None: + _check_fft_n(n) + s = [n] + x = _resize_fft_input(x, s, axes) + op_type = 'fft_r2c' + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'onesided', onesided) + out = getattr(_C_ops, op_type)(x, *attrs) + else: 
+ inputs = {'X': [x], } + attrs = { + 'axes': axes, + 'normalization': norm, + 'forward': forward, + 'onesided': onesided, + } + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _real_to_complex_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fft_c2r(x, n, axis, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + axis = axis if axis is not None else -1 + _check_fft_axis(x, axis) + axes = [axis] + axes = _normalize_axes(x, axes) + if n is not None: + _check_fft_n(n) + s = [n // 2 + 1] + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2r' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + if n is not None: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'last_dim_size', n) + else: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + if n is not None: + attrs['last_dim_size'] = n + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _complex_to_real_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fftn_c2c(x, s, axes, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes).tolist() + axes = [axes[i] for i in axes_argsoft] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". 
+ format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + + if s is not None: + x = _resize_fft_input(x, s, axes) + op_type = 'fft_c2c' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out + + +def fftn_r2c(x, s, axes, norm, forward, onesided, name): + if is_interger(x): + x = paddle.cast(x, paddle.get_default_dtype()) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes[:-1]).tolist() + axes = [axes[i] for i in axes_argsoft] + [axes[-1]] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". + format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + [s[-1]] + + if s is not None: + x = _resize_fft_input(x, s, axes) + + op_type = 'fft_r2c' + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) + + if in_dygraph_mode(): + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'onesided', onesided) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = { + 'axes': axes, + 'normalization': norm, + 'forward': forward, + 'onesided': onesided, + } + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _real_to_complex_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + return out + + +def fftn_c2r(x, s, axes, norm, forward, name): + if is_interger(x): + x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) + elif is_floating_point(x): + x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) + _check_normalization(norm) + if s is not None: + _check_fft_shape(x, s) + + rank = x.ndim + if axes is None: + if s is None: + axes = list(range(rank)) + else: + fft_ndims = len(s) + axes = list(range(rank - fft_ndims, rank)) + else: + _check_fft_axes(x, axes) + axes = _normalize_axes(x, axes) + axes_argsoft = np.argsort(axes[:-1]).tolist() + axes = [axes[i] for i in axes_argsoft] + [axes[-1]] + if s is not None: + if len(s) != len(axes): + raise ValueError( + "Length of s ({}) and length of axes ({}) does not match.". 
+ format(len(s), len(axes))) + s = [s[i] for i in axes_argsoft] + [s[-1]] + + if s is not None: + fft_input_shape = list(s) + fft_input_shape[-1] = fft_input_shape[-1] // 2 + 1 + x = _resize_fft_input(x, fft_input_shape, axes) + + op_type = 'fft_c2r' + check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) + + if in_dygraph_mode(): + if s: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward, + 'last_dim_size', s[-1]) + else: + attrs = ('axes', axes, 'normalization', norm, 'forward', forward) + out = getattr(_C_ops, op_type)(x, *attrs) + else: + inputs = {'X': [x], } + attrs = {'axes': axes, 'normalization': norm, 'forward': forward} + if s: + attrs["last_dim_size"] = s[-1] + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference( + _complex_to_real_dtype(dtype)) + outputs = {"Out": [out]} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + return out diff --git a/python/paddle/fluid/tests/unittests/test_signal.py b/python/paddle/fluid/tests/unittests/test_signal.py index a109a5aa5d1a6..ecbbd8f52db9b 100644 --- a/python/paddle/fluid/tests/unittests/test_signal.py +++ b/python/paddle/fluid/tests/unittests/test_signal.py @@ -652,7 +652,7 @@ def test_frame(self): self.assertTrue( np.allclose( frame_for_api_test(self.x, self.frame_length, self.hop_length, self.axis), - paddle.tensor.signal.frame( + paddle.signal.frame( paddle.to_tensor(self.x), self.frame_length, self.hop_length, @@ -678,7 +678,7 @@ def test_frame_static(self): mp, sp = paddle.static.Program(), paddle.static.Program() with paddle.static.program_guard(mp, sp): input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype) - output = paddle.tensor.signal.frame( + output = paddle.signal.frame( input, self.frame_length, self.hop_length, @@ -708,7 +708,7 @@ def test_frame_static(self): class TestFrameException(unittest.TestCase): def test_frame(self): with self.assertRaises(self.expect_exception): - paddle.tensor.signal.frame( + paddle.signal.frame( paddle.to_tensor(self.x), self.frame_length, self.hop_length, @@ -731,7 +731,7 @@ def test_overlap_add(self): self.assertTrue( np.allclose( overlap_add_for_api_test(self.x, self.hop_length, self.axis), - paddle.tensor.signal.overlap_add( + paddle.signal.overlap_add( paddle.to_tensor(self.x), self.hop_length, self.axis), @@ -756,7 +756,7 @@ def test_overlap_add_static(self): mp, sp = paddle.static.Program(), paddle.static.Program() with paddle.static.program_guard(mp, sp): input = paddle.static.data('input', self.x.shape, dtype=self.x.dtype) - output = paddle.tensor.signal.overlap_add( + output = paddle.signal.overlap_add( input, self.hop_length, self.axis), @@ -783,7 +783,7 @@ def test_overlap_add_static(self): class TestOverlapAddException(unittest.TestCase): def test_overlap_add(self): with self.assertRaises(self.expect_exception): - paddle.tensor.signal.overlap_add( + paddle.signal.overlap_add( paddle.to_tensor(self.x), self.hop_length, self.axis) @@ -848,7 +848,7 @@ def test_stft(self): self.assertTrue( np.allclose( stft(self.x, self.n_fft, self.hop_length, self.win_length, win_l, self.center, self.pad_mode), - paddle.tensor.signal.stft( + paddle.signal.stft( paddle.to_tensor(self.x), self.n_fft, self.hop_length, @@ -891,7 +891,7 @@ def test_stft(self): win_p = paddle.to_tensor(self.window) with self.assertRaises(self.expect_exception): - paddle.tensor.signal.stft( + paddle.signal.stft( paddle.to_tensor(self.x), self.n_fft, 
self.hop_length, @@ -934,7 +934,7 @@ def test_istft(self): self.assertTrue( np.allclose( istft(self.x, self.hop_length, self.win_length, win_l, self.center, self.length), - paddle.tensor.signal.istft( + paddle.signal.istft( paddle.to_tensor(self.x), self.n_fft, self.hop_length, @@ -986,7 +986,7 @@ def test_istft(self): win_p = paddle.to_tensor(self.window) with self.assertRaises(self.expect_exception): - paddle.tensor.signal.istft( + paddle.signal.istft( paddle.to_tensor(self.x), self.n_fft, self.hop_length, diff --git a/python/paddle/tensor/signal.py b/python/paddle/signal.py similarity index 97% rename from python/paddle/tensor/signal.py rename to python/paddle/signal.py index 86022a1748356..fc80c7cbc80f3 100644 --- a/python/paddle/tensor/signal.py +++ b/python/paddle/signal.py @@ -16,16 +16,14 @@ import paddle -from .attribute import is_complex, is_floating_point +from .tensor.attribute import is_complex, is_floating_point from .fft import fft_r2c, fft_c2r, fft_c2c -from ..fluid.data_feeder import check_variable_and_dtype -from ..fluid.framework import in_dygraph_mode -from ..fluid.layer_helper import LayerHelper -from .. import _C_ops +from .fluid.data_feeder import check_variable_and_dtype +from .fluid.framework import in_dygraph_mode +from .fluid.layer_helper import LayerHelper +from . import _C_ops __all__ = [ - 'frame', - 'overlap_add', 'stft', 'istft', ] @@ -56,7 +54,7 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): .. code-block:: python import paddle - from paddle.tensor.signal import frame + from paddle.signal import frame # 1D x = paddle.arange(8) @@ -177,7 +175,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): .. code-block:: python import paddle - from paddle.tensor.signal import overlap_add + from paddle.signal import overlap_add # 2D x0 = paddle.arange(16).reshape([8, 2]) @@ -291,11 +289,11 @@ def stft(x, real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`( `onesided` is `False`) - Exampels: + Examples: .. code-block:: python import paddle - from paddle.tensor.signal import stft + from paddle.signal import stft # real-valued input x = paddle.randn([8, 48000], dtype=paddle.float64) @@ -415,7 +413,7 @@ def istft(x, - :math:`N`: Value of `n_fft`. - :math:`H`: Value of `hop_length`. - Result of `istft` expected to be the inverse of `paddle.tensor.signal.stft`, but it is + Result of `istft` expected to be the inverse of `paddle.signal.stft`, but it is not guaranteed to reconstruct a exactly realizible time-domain signal from a STFT complex tensor which has been modified (via masking or otherwise). Therefore, `istft` gives the [Griffin-Lim optimal estimate](https://ieeexplore.ieee.org/document/1164317) @@ -454,12 +452,12 @@ def istft(x, A tensor of least squares estimation of the reconstructed signal(s) with shape `[..., seq_length]` - Exampels: + Examples: .. code-block:: python import numpy as np import paddle - from paddle.tensor.signal import stft, istft + from paddle.signal import stft, istft paddle.seed(0) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index f528714e9164a..04d0a3c745f10 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -221,8 +221,6 @@ from .array import create_array # noqa: F401 from .einsum import einsum # noqa: F401 -from . import fft -from . 
import signal #this list used in math_op_patch.py for _binary_creator_ tensor_method_func = [ #noqa diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py deleted file mode 100644 index 20fd143589fa4..0000000000000 --- a/python/paddle/tensor/fft.py +++ /dev/null @@ -1,1601 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Sequence -import numpy as np -import paddle -from .attribute import is_complex, is_floating_point, is_interger, _real_to_complex_dtype, _complex_to_real_dtype -from ..fluid.framework import in_dygraph_mode -from .. import _C_ops -from ..fluid.data_feeder import check_variable_and_dtype -from ..fluid.layer_helper import LayerHelper - -__all__ = [] - - -def _check_normalization(norm): - if norm not in ['forward', 'backward', 'ortho']: - raise ValueError( - "Unexpected norm: {}. Norm should be forward, backward or ortho". - format(norm)) - - -def _check_fft_n(n): - if not isinstance(n, int): - raise ValueError( - "Invalid FFT argument n({}), it should be an integer.".format(n)) - if n <= 0: - raise ValueError( - "Invalid FFT argument n({}), it should be positive.".format(n)) - - -def _check_fft_shape(x, s): - ndim = x.ndim - if not isinstance(s, Sequence): - raise ValueError( - "Invalid FFT argument s({}), it should be a sequence of integers.") - - if len(s) > ndim: - raise ValueError( - "Length of FFT argument s should not be larger than the rank of input. " - "Received s: {}, rank of x: {}".format(s, ndim)) - for size in s: - if not isinstance(size, int) or size <= 0: - raise ValueError("FFT sizes {} contains invalid value ({})".format( - s, size)) - - -def _check_fft_axis(x, axis): - ndim = x.ndim - if not isinstance(axis, int): - raise ValueError( - "Invalid FFT axis ({}), it should be an integer.".format(axis)) - if axis < -ndim or axis >= ndim: - raise ValueError( - "Invalid FFT axis ({}), it should be in range [-{}, {})".format( - axis, ndim, ndim)) - - -def _check_fft_axes(x, axes): - ndim = x.ndim - if not isinstance(axes, Sequence): - raise ValueError( - "Invalid FFT axes ({}), it should be a sequence of integers.". - format(axes)) - if len(axes) > ndim: - raise ValueError( - "Length of fft axes should not be larger than the rank of input. " - "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)) - for axis in axes: - if not isinstance(axis, int) or axis < -ndim or axis >= ndim: - raise ValueError( - "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})".
- format(axes, axis, ndim, ndim)) - - -def _resize_fft_input(x, s, axes): - if len(s) != len(axes): - raise ValueError("length of `s` should equals length of `axes`.") - shape = x.shape - ndim = x.ndim - - axes_to_pad = [] - paddings = [] - axes_to_slice = [] - slices = [] - for i, axis in enumerate(axes): - if shape[axis] < s[i]: - axes_to_pad.append(axis) - paddings.append(s[i] - shape[axis]) - elif shape[axis] > s[i]: - axes_to_slice.append(axis) - slices.append((0, s[i])) - - if axes_to_slice: - x = paddle.slice( - x, - axes_to_slice, - starts=[item[0] for item in slices], - ends=[item[1] for item in slices]) - if axes_to_pad: - padding_widths = [0] * (2 * ndim) - for axis, pad in zip(axes_to_pad, paddings): - padding_widths[2 * axis + 1] = pad - x = paddle.nn.functional.pad(x, padding_widths) - return x - - -def _normalize_axes(x, axes): - ndim = x.ndim - return [item if item >= 0 else (item + ndim) for item in axes] - - -def _check_at_least_ndim(x, rank): - if x.ndim < rank: - raise ValueError("The rank of the input ({}) should >= {}".format( - x.ndim, rank)) - - -# public APIs 1d -def fft(x, n=None, axis=-1, norm="backward", name=None): - """ - Calculate one-dimensional discrete Fourier transform. - - This function uses the efficient fast Fourier transform (FFT) algorithm [1] to - calculate the 1-D * n * point discrete Fourier transform (DFT). - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified - by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated - by `axis`, or the last one if `axis` is not specified. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.exp(3j * np.pi * np.arange(7) / 7) - xp = paddle.to_tensor(x) - fft_xp = paddle.fft.fft(xp).numpy() - print(fft_xp) - # [1.+1.25396034e+00j 1.+4.38128627e+00j 1.-4.38128627e+00j - # 1.-1.25396034e+00j 1.-4.81574619e-01j 1.+8.88178420e-16j - # 1.+4.81574619e-01j] - - - """ - if is_interger(x) or is_floating_point(x): - return fft_r2c( - x, n, axis, norm, forward=True, onesided=False, name=name) - else: - return fft_c2c(x, n, axis, norm, forward=True, name=name) - - -def ifft(x, n=None, axis=-1, norm="backward", name=None): - """ - Compute the 1-D inverse discrete Fourier Transform. - - This function computes the inverse of the 1-D *n*-point discrete Fourier transform - computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. 
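A minimal sketch of this inverse property (not part of the original patch; it assumes only the `paddle.fft.fft` and `paddle.fft.ifft` APIs documented here):

.. code-block:: python

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.random.rand(8))
    roundtrip = paddle.fft.ifft(paddle.fft.fft(x))
    # The round trip recovers x up to floating-point error; the imaginary
    # parts of the result are numerically zero.
    assert np.allclose(x.numpy(), roundtrip.numpy().real, atol=1e-6)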
- - The input should be ordered in the same way as is returned by `fft`, - i.e., - - * ``x[0]`` should contain the zero frequency term, - * ``x[1:n//2]`` should contain the positive-frequency terms, - * ``x[n//2 + 1:]`` should contain the negative-frequency terms, in - increasing order starting from the most negative frequency. - - For an even number of input points, ``x[n//2]`` represents the sum of - the values at the positive and negative Nyquist frequencies, as the two - are aliased together. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified - by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated - by `axis`, or the last one if `axis` is not specified. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.exp(3j * np.pi * np.arange(7) / 7) - xp = paddle.to_tensor(x) - ifft_xp = paddle.fft.ifft(xp).numpy() - print(ifft_xp) - # [0.14285714+1.79137191e-01j 0.14285714+6.87963741e-02j - # 0.14285714+1.26882631e-16j 0.14285714-6.87963741e-02j - # 0.14285714-1.79137191e-01j 0.14285714-6.25898038e-01j - # 0.14285714+6.25898038e-01j] - - """ - if is_interger(x) or is_floating_point(x): - return fft_r2c( - x, n, axis, norm, forward=False, onesided=False, name=name) - else: - return fft_c2c(x, n, axis, norm, forward=False, name=name) - - -def rfft(x, n=None, axis=-1, norm="backward", name=None): - """ - The one dimensional FFT for real input. - - This function computes the one dimensional *n*-point discrete Fourier - Transform (DFT) of a real-valued tensor by means of an efficient algorithm - called the Fast Fourier Transform (FFT). - - When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore - ``n//2 + 1``. - - Args: - x(Tensor) : Real-valued input tensor - n(int, optional): Number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis - specified by `axis` is used. - axis(int, optional): Axis over which to compute the FFT. Default value - is last axis. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. 
Include {"backward", "ortho", "forward"}, - default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : complex tensor - - Raises: - - - Examples: - .. code-block:: python - import paddle - - x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) - print(paddle.fft.rfft(x)) - # Tensor(shape=[3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [ (1+0j), -1j , (-1+0j)]) - """ - return fft_r2c(x, n, axis, norm, forward=True, onesided=True, name=name) - - -def irfft(x, n=None, axis=-1, norm="backward", name=None): - """ - Computes the inverse of `rfft`. - - This function calculates the inverse of the one-dimensional *n* point discrete - Fourier transform of the actual input calculated by "rfft". In other words, - ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. - - The input shall be in the form of "rfft", i.e. the actual zero frequency term, - followed by the complex positive frequency term, in the order of increasing frequency. - Because the discrete Fourier transform of the actual input is Hermite symmetric, - the negative frequency term is regarded as the complex conjugate term of the corresponding - positive frequency term. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified - along the ` axis'. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. - If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` - in some cases. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([1, -1j, -1]) - xp = paddle.to_tensor(x) - irfft_xp = paddle.fft.irfft(xp).numpy() - print(irfft_xp) - # [0. 1. 0. 0.] - - """ - return fft_c2r(x, n, axis, norm, forward=False, name=name) - - -def hfft(x, n=None, axis=-1, norm="backward", name=None): - """ - Compute the FFT of a signal that has Hermitian symmetry, a real - spectrum. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. 
If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified - along the ` axis'. - axis (int,optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. - If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in - some cases. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([1, -1j, -1]) - xp = paddle.to_tensor(x) - hfft_xp = paddle.fft.hfft(xp).numpy() - print(hfft_xp) - # [0. 0. 0. 4.] - """ - - return fft_c2r(x, n, axis, norm, forward=True, name=name) - - -def ihfft(x, n=None, axis=-1, norm="backward", name=None): - """ - The inverse FFT of a signal that has Hermitian symmetry. - - This function computes the one dimensional *n*-point inverse FFT of a signal - that has Hermitian symmetry by means of an efficient algorithm called - the Fast Fourier Transform (FFT). - - When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore - ``n//2 + 1``. - - Args: - x(Tensor): Input tensor. - n(int, optional): The number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis - specified by `axis` is used. - axis(int, optional) : Axis over which to compute the inverse FFT. If not - given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : complex tensor. - - Examples: - .. 
code-block:: python - import paddle - - spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) - print(paddle.fft.ifft(spectrum)) - # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) - print(paddle.fft.ihfft(spectrum)) - # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) - - """ - return fft_r2c(x, n, axis, norm, forward=False, onesided=True, name=name) - - -# public APIs nd -def fftn(x, s=None, axes=None, norm="backward", name=None): - """ - Compute the N-D discrete Fourier Transform. - - This function calculates the n-D discrete Fourier transform on any number of axes - in the M-D array by fast Fourier transform (FFT). - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). - This corresponds to ``n`` for ``fft(x, n)``. - Along any axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. - axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by - `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - - Examples: - - .. 
code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:4, :4, :4][1] - xp = paddle.to_tensor(x) - fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() - print(fftn_xp) - # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] - # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] - """ - if is_interger(x) or is_floating_point(x): - return fftn_r2c( - x, s, axes, norm, forward=True, onesided=False, name=name) - else: - return fftn_c2c(x, s, axes, norm, forward=True, name=name) - - -def ifftn(x, s=None, axes=None, norm="backward", name=None): - """ - Compute the N-D inverse discrete Fourier Transform. - - This function computes the inverse of the N-D discrete - Fourier Transform over any number of axes in an M-D array by - means of the Fast Fourier Transform (FFT). In other words, - ``ifftn(fftn(x)) == x`` to within numerical accuracy. - - The input, analogously to `ifft`, should be ordered in the same way as is - returned by `fftn`, i.e., it should have the term for zero frequency - in all axes in the low-order corner, the positive frequency terms in the - first half of all axes, the term for the Nyquist frequency in the middle - of all axes and the negative frequency terms in the second half of all - axes, in order of decreasingly negative frequency. - - Args: - x (Tensor): The input data. It's a Tensor type. It's a complex. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). - This corresponds to ``n`` for ``fft(x, n)``. - Along any axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. - axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are - scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by - `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - - Examples: - - .. 
code-block:: python - - import numpy as np - import paddle - - x = np.eye(3) - xp = paddle.to_tensor(x) - ifftn_xp = paddle.fft.ifftn(xp, axes=(1,)).numpy() - print(ifftn_xp) - - # [[ 0.33333333+0.j 0.33333333+0.j 0.33333333-0.j ] - # [ 0.33333333+0.j -0.16666667+0.28867513j -0.16666667-0.28867513j] - # [ 0.33333333+0.j -0.16666667-0.28867513j -0.16666667+0.28867513j]] - - """ - if is_interger(x) or is_floating_point(x): - return fftn_r2c( - x, s, axes, norm, forward=False, onesided=False, name=name) - else: - return fftn_c2c(x, s, axes, norm, forward=False, name=name) - - -def rfftn(x, s=None, axes=None, norm="backward", name=None): - """ - The N dimensional FFT for real input. - - This function computes the N-dimensional discrete Fourier Transform over - any number of axes in an M-dimensional real array by means of the Fast - Fourier Transform (FFT). By default, all axes are transformed, with the - real transform performed over the last axis, while the remaining - transforms are complex. - - The transform for real input is performed over the last transformation - axis, as by `rfft`, then the transform over the remaining axes is - performed as by `fftn`. The order of the output is as for `rfft` for the - final transformation axis, and as for `fftn` for the remaining - transformation axes. - - Args: - x(Tensor) : Input tensor, taken to be real. - s(Sequence[int]) : Shape to use from the exec fft. The final element of - `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining - axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if - the given shape is smaller than that of the input, the input is - cropped. If it is larger, the input is padded with zeros. if `s` is - not given, the shape of the input along the axes specified by `axes` - is used. - axes(Sequence[int]) : Axes over which to compute the FFT. If not given, - the last ``len(s)`` axes are used, or all axes if `s` is also not - specified. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor): complex tensor - - - Raises: - ValueError: If `s` and `axes` have different length. - - Examples: - .. code-block:: python - import paddle - - # default, all axis will be used to exec fft - x = paddle.ones((2, 3, 4)) - print(paddle.fft.rfftn(x)) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) - - # use axes(2, 0) - print(paddle.fft.rfftn(x, axes=(2, 0))) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) - - """ - return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name) - - -def irfftn(x, s=None, axes=None, norm="backward", name=None): - """ - Computes the inverse of `rfftn`. - - This function computes the inverse of the N-D discrete - Fourier Transform for real input over any number of axes in an - M-D array by means of the Fast Fourier Transform (FFT). 
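A minimal sketch (not part of the original patch, assuming the `paddle.fft.rfftn` and `paddle.fft.irfftn` APIs documented here) that makes the shape requirement described next concrete:

.. code-block:: python

    import paddle

    x = paddle.rand([3, 5])              # odd length along the last axis
    y = paddle.fft.rfftn(x)              # one-sided output, shape [3, 3]
    # Pass the original shape explicitly; without `s`, the last axis is
    # assumed even and comes back with length 2 * (3 - 1) = 4.
    z = paddle.fft.irfftn(y, s=x.shape)
    print(z.shape)                       # [3, 5]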
In - other words, ``irfftn(rfftn(x), x.shape) == x`` to within numerical - accuracy. (The ``x.shape`` is necessary like ``len(x)`` is for `irfft`, - and for the same reason.) - - The input should be ordered in the same way as is returned by `rfftn`, - i.e., as for `irfft` for the final transformation axis, and as for `ifftn` - along all the other axes. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the - number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used. Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where - ``k`` is the length of the input along that axis. - axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or by a combination of `s` or `x`, as explained in the parameters section above. The length of - each transformed axis is as given by the corresponding element of `s`, or the length of the input - in every axis except for the last one if `s` is not given. In the final transformed axis the length - of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final - transformed axis of the input. To get an odd number of output points in the final axis, - `s` must be specified. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) - xp = paddle.to_tensor(x) - irfftn_xp = paddle.fft.irfftn(xp).numpy() - print(irfftn_xp) - # [ 2.25 -1.25 0.25 0.75] - - """ - return fftn_c2r(x, s, axes, norm, forward=False, name=name) - - -def hfftn(x, s=None, axes=None, norm="backward", name=None): - """ - Compute the N-D FFT of Hermitian symmetric complex input, i.e., a - signal with a real spectrum. - - This function calculates the n-D discrete Fourier transform of Hermitian symmetric - complex input on any axis in an M-D array by fast Fourier transform (FFT). - In other words, ``ihfftn(hfftn(x, s)) == x`` to within numerical accuracy. - (``s`` here is ``x.shape`` and ``s[-1] = x.shape[-1] * 2 - 1``. This is necessary - for the same reason that `irfft` requires ``x.shape``.) - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the - number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used.
Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where - ``k`` is the length of the input along that axis. - axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or - a combination of `s` or `X`. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([2, 2, 3]) + 1j * np.array([2, 2, 3])).astype(np.complex128) - xp = paddle.to_tensor(x) - hfftn_xp = paddle.fft.hfftn(xp).numpy() - print(hfftn_xp) - # [ 9. 3. 1. -5.] - - - """ - return fftn_c2r(x, s, axes, norm, forward=True, name=name) - - -def ihfftn(x, s=None, axes=None, norm="backward", name=None): - """ - The n dimensional inverse FFT of a signal that has Hermitian symmetry. - - This function computes the n dimensional inverse FFT over any number of axes - in an M-dimensional of a signal that has Hermitian symmetry by means of an - efficient algorithm called the Fast Fourier Transform (FFT). - - Args: - x(Tensor): Input tensor. - s(Sequence[int], optional) : Shape (length along each transformed axis) - to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis - 1, etc.). Along any axis, if the given shape is smaller than that - of the input, the input is cropped. If it is larger, the input is - padded with zeros. if `s` is not given, the shape of the input - along the axes specified by `axes` is used. - axis(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not - given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : complex tensor. - - Examples: - .. 
code-block:: python - import paddle - - spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) - print(paddle.fft.ifft(spectrum)) - # Tensor(shape=[6], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j), (2.3333334922790527+1.9868215517249155e-08j), (1+1.9868215517249155e-08j)]) - print(paddle.fft.ihfft(spectrum)) - # Tensor(shape = [4], dtype = complex64, place = CUDAPlace(0), stop_gradient = True, - # [(-0.1666666716337204+0j), (1-1.9868215517249155e-08j), (2.3333334922790527-1.9868215517249155e-08j), (3.5+0j)]) - - """ - return fftn_r2c(x, s, axes, norm, forward=False, onesided=True, name=name) - - -# public APIs 2d -def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the 2-D discrete Fourier Transform - - This function computes the N-D discrete Fourier Transform - over any axes in an M-D array by means of the - Fast Fourier Transform (FFT). By default, the transform is computed over - the last two axes of the input array, i.e., a 2-dimensional FFT. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. - Along each axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or the last two axes if `axes` is not given. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:2, :2][1] - xp = paddle.to_tensor(x) - fft2_xp = paddle.fft.fft2(xp).numpy() - print(fft2_xp) - # [[ 2.+0.j -2.+0.j] - # [ 0.+0.j 0.+0.j]] - - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return fftn(x, s, axes, norm, name) - - -def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the 2-D inverse discrete Fourier Transform. - - This function computes the inverse of the 2-D discrete Fourier - Transform over any number of axes in an M-D array by means of - the Fast Fourier Transform (FFT). 
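As an illustrative sketch (not part of the original patch, assuming the documented `paddle.fft.fft2` and `paddle.fft.ifft2` APIs), the 2-D pair round-trips over the default last two axes:

.. code-block:: python

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.random.rand(2, 4, 4))
    roundtrip = paddle.fft.ifft2(paddle.fft.fft2(x))
    # Only the last two axes are transformed; the leading batch axis is
    # untouched, and the round trip recovers x.
    assert np.allclose(x.numpy(), roundtrip.numpy().real, atol=1e-6)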
In other words, ``ifft2(fft2(x)) == x`` - to within numerical accuracy. By default, the inverse transform is - computed over the last two axes of the input array. - - The input, analogously to `ifft`, should be ordered in the same way as is - returned by `fft2`, i.e., it should have the term for zero frequency - in the low-order corner of the two axes, the positive frequency terms in - the first half of these axes, the term for the Nyquist frequency in the - middle of the axes and the negative frequency terms in the second half of - both axes, in order of decreasingly negative frequency. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. - Along each axis, if the given shape is smaller than that of the input, - the input is cropped. If it is larger, the input is padded with zeros. - if `s` is not given, the shape of the input along the axes specified - by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or the last two axes if `axes` is not given. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:2, :2][1] - xp = paddle.to_tensor(x) - ifft2_xp = paddle.fft.ifft2(xp).numpy() - print(ifft2_xp) - # [[ 0.5+0.j -0.5+0.j] - # [ 0. +0.j 0. +0.j]] - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return ifftn(x, s, axes, norm, name) - - -def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - The two dimensional FFT with real tensor input. - - This is really just `rfftn` with different default behavior. - For more details see `rfftn`. - - Args: - x(Tensor): Input tensor, taken to be real. - s(Sequence[int]) : Shape of the FFT. - axes(Sequence[int], optional): Axes over which to compute the FFT. - norm(str, optional) : {"backward", "ortho", "forward"}, - default is "backward". Indicates which direction of the - forward/backward pair of transforms is scaled and with what - normalization factor. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor): The result of the real 2-D FFT. - - Raises: - - - Examples: - - .. 
code-block:: python - import paddle - import numpy as np - - x = paddle.to_tensor(np.mgrid[:5, :5][0].astype(np.float32)) - print(paddle.fft.rfft2(x)) - # Tensor(shape=[5, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[ (50+0j) , (1.1920928955078125e-07+0j) , 0j ], - # [(-12.5+17.204774856567383j) , (-9.644234211236835e-08+7.006946134424652e-08j) , 0j ], - # [(-12.500000953674316+4.061495304107666j) , (3.6837697336977726e-08-1.1337477445749755e-07j), 0j ], - # [(-12.500000953674316-4.061495304107666j) , (3.6837697336977726e-08+1.1337477445749755e-07j), 0j ], - # [(-12.5-17.204774856567383j) , (-9.644234211236835e-08-7.006946134424652e-08j) , 0j ]]) - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return rfftn(x, s, axes, norm, name) - - -def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Computes the inverse of `rfft2`. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. - axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes - must be two-dimensional. If not specified, the last two axes are used by default. - norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Real tensor. The result of the inverse real 2-D FFT. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) - xp = paddle.to_tensor(x) - irfft2_xp = paddle.fft.irfft2(xp).numpy() - print(irfft2_xp) - # [[ 2.375 -1.125 0.375 0.875] - # [ 0.125 0.125 0.125 0.125]] - - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return irfftn(x, s, axes, norm, name) - - -def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the 2-D FFT of a Hermitian complex array. - - Args: - x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape of the real output. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be - two-dimensional. If not specified, the last two axes are used by default. 
- norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Real tensor. The real result of the 2-D Hermitian complex real FFT. - - Raises: - ValueError: if `s` not be a sequence of 2 integers or None. - ValueError: if `axes` not be a sequence of 2 integers or None. - ValueError: If the input dimension is smaller than 2. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = (np.array([[3,2,3],[2, 2, 3]]) + 1j * np.array([[3,2,3],[2, 2, 3]])).astype(np.complex128) - xp = paddle.to_tensor(x) - hfft2_xp = paddle.fft.hfft2(xp).numpy() - print(hfft2_xp) - # [[19. 7. 3. -9.] - # [ 1. 1. 1. 1.]] - - - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return hfftn(x, s, axes, norm, name) - - -def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): - """ - Compute the two dimensional inverse FFT of a real spectrum. - - This is really `ihfftn` with different defaults. - For more details see `ihfftn`. - - Args: - x(Tensor): Input tensor - s(Sequence[int], optional): Shape of the real input to the inverse FFT. - axes(Sequance[int], optional): The axes over which to compute the - inverse fft. Default is the last two axes. - norm(str, optional): {"backward", "ortho", "forward"}. Default is - "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - - Returns: - out(Tensor) : The result of the inverse hermitian 2-D FFT. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.mgrid[:5, :5][0].astype(np.float64) - xp = paddle.to_tensor(x) - ihfft2_xp = paddle.fft.ihfft2(xp).numpy() - print(ihfft2_xp) - # [[ 2. +0.j 0. +0.j 0. +0.j ] - # [-0.5-0.68819096j 0. +0.j 0. +0.j ] - # [-0.5-0.16245985j 0. +0.j 0. +0.j ] - # [-0.5+0.16245985j 0. +0.j 0. +0.j ] - # [-0.5+0.68819096j 0. +0.j 0. +0.j ]] - """ - _check_at_least_ndim(x, 2) - if s is not None: - if not isinstance(s, Sequence) or len(s) != 2: - raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers.". - format(s)) - if axes is not None: - if not isinstance(axes, Sequence) or len(axes) != 2: - raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.". - format(axes)) - return ihfftn(x, s, axes, norm, name) - - -# public APIs utilities -def fftfreq(n, d=1.0, dtype=None, name=None): - """ - Return the Discrete Fourier Transform sample frequencies. - - The returned float array `f` contains the frequency bin centers in cycles - per unit of the sample spacing (with zero at the start). For instance, if - the sample spacing is in seconds, then the frequency unit is cycles/second. 
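A small sketch (not part of the original patch) cross-checking `paddle.fft.fftfreq` against numpy's reference implementation of the formula given below:

.. code-block:: python

    import numpy as np
    import paddle

    n, d = 5, 0.5
    # For odd n this is f = [0, 1, 2, -2, -1] / (d * n).
    print(paddle.fft.fftfreq(n, d).numpy())  # [ 0.   0.4  0.8 -0.8 -0.4]
    print(np.fft.fftfreq(n, d))              # [ 0.   0.4  0.8 -0.8 -0.4]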
- - Given input length `n` and a sample spacing `d`:: - - f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) if n is even - f = [0, 1, ..., (n-1)/2, -(n-1)/2, ..., -1] / (d*n) if n is odd - - Args: - n (int): Window length (number of samples). - d (scalar, optional): Sample spacing (inverse of the sampling rate). Default is 1. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. A tensor of length ``n`` containing the sample frequencies. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.5 - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) - print(fftfreq_xp) - - # Tensor(shape=[5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [ 0. , 0.40000001, 0.80000001, -0.80000001, -0.40000001]) - """ - - dtype = paddle.framework.get_default_dtype() - val = 1.0 / (n * d) - pos_max = (n + 1) // 2 - neg_max = n // 2 - indices = paddle.arange(-neg_max, pos_max, dtype=dtype, name=name) - indices = paddle.roll(indices, -neg_max, name=name) - return indices * val - - -def rfftfreq(n, d=1.0, dtype=None, name=None): - """ - Return the Discrete Fourier Transform sample frequencies. - - The returned floating-point array `f` contains the frequency bin centers in cycles - per unit of the sample spacing (with zero at the start). - - Given input length `n` and a sample spacing `d`:: - - f = [0, 1, ..., n/2-1, n/2] / (d*n) if n is even - f = [0, 1, ..., (n-1)/2-1, (n-1)/2] / (d*n) if n is odd - - The Nyquist frequency component is considered to be positive. - - Args: - n (int): Window length (number of samples). - d (scalar, optional): Sample spacing (inverse of the sampling rate). Default is 1. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. A tensor of length ``n//2 + 1`` containing the sample frequencies. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 - n = x.size - rfftfreq_xp = paddle.fft.rfftfreq(n, d=scalar_temp) - print(rfftfreq_xp) - - # Tensor(shape=[3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [0. , 0.66666669, 1.33333337]) - - """ - - dtype = paddle.framework.get_default_dtype() - val = 1.0 / (n * d) - pos_max = 1 + n // 2 - indices = paddle.arange(0, pos_max, dtype=dtype, name=name) - return indices * val - - -def fftshift(x, axes=None, name=None): - """ - Shift the zero-frequency component to the center of the spectrum. - - This function swaps half spaces for all the axes listed (all by default). - Note that ``y[0]`` is the Nyquist component only if ``len(x)`` is even. - - Args: - x (Tensor): The input tensor. - axes (int|tuple, optional): The axes over which to shift. Default is None, - which shifts all axes. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. The shifted tensor. - - Examples: - - ..
code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) - res = paddle.fft.fftshift(fftfreq_xp).numpy() - print(res) - # [-1.3333334 -0.6666667 0. 0.6666667 1.3333334] - - """ - shape = paddle.shape(x) - if axes is None: - # shift all axes - rank = paddle.rank(x).reshape([1]) - axes = axes or paddle.arange(0, rank) - shifts = [size // 2 for size in shape] - elif isinstance(axes, int): - shifts = shape[axes] // 2 - else: - shifts = [shape[ax] // 2 for ax in axes] - return paddle.roll(x, shifts, axes, name=name) - - -def ifftshift(x, axes=None, name=None): - """ - The inverse of `fftshift`. For even-length `x` the result is identical to - `fftshift`, but the two differ for odd-length `x`. - - Args: - x (Tensor): The input tensor. - axes (int|tuple, optional): The axes over which to shift. Default is None, - which shifts all axes. - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. The shifted tensor. - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - - x = np.array([3, 1, 2, 2, 3], dtype=float) - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) - res = paddle.fft.ifftshift(fftfreq_xp).numpy() - print(res) - # [ 1.3333334 -1.3333334 -0.6666667 0. 0.6666667] - - """ - shape = paddle.shape(x) - if axes is None: - # shift all axes - rank = paddle.rank(x).reshape([1]) - axes = axes or paddle.arange(0, rank) - shifts = [-size // 2 for size in shape] - elif isinstance(axes, int): - shifts = -shape[axes] // 2 - else: - shifts = [-shape[ax] // 2 for ax in axes] - return paddle.roll(x, shifts, axes, name=name) - - -# internal functions -def fft_c2c(x, n, axis, norm, forward, name): - if is_interger(x): - x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) - elif is_floating_point(x): - x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) - _check_normalization(norm) - - axis = axis if axis is not None else -1 - _check_fft_axis(x, axis) - axes = [axis] - axes = _normalize_axes(x, axes) - if n is not None: - _check_fft_n(n) - s = [n] - x = _resize_fft_input(x, s, axes) - op_type = 'fft_c2c' - - check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - if in_dygraph_mode(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) - else: - inputs = {'X': [x], } - attrs = {'axes': axes, 'normalization': norm, 'forward': forward} - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out - - -def fft_r2c(x, n, axis, norm, forward, onesided, name): - if is_interger(x): - x = paddle.cast(x, paddle.get_default_dtype()) - _check_normalization(norm) - axis = axis if axis is not None else -1 - _check_fft_axis(x, axis) - axes = [axis] - axes = _normalize_axes(x, axes) - if n is not None: - _check_fft_n(n) - s = [n] - x = _resize_fft_input(x, s, axes) - op_type = 'fft_r2c' - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - - if in_dygraph_mode(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'onesided', onesided) - out = getattr(_C_ops, op_type)(x, *attrs) - else:
- inputs = {'X': [x], } - attrs = { - 'axes': axes, - 'normalization': norm, - 'forward': forward, - 'onesided': onesided, - } - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference( - _real_to_complex_dtype(dtype)) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out - - -def fft_c2r(x, n, axis, norm, forward, name): - if is_interger(x): - x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) - elif is_floating_point(x): - x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) - _check_normalization(norm) - axis = axis if axis is not None else -1 - _check_fft_axis(x, axis) - axes = [axis] - axes = _normalize_axes(x, axes) - if n is not None: - _check_fft_n(n) - s = [n // 2 + 1] - x = _resize_fft_input(x, s, axes) - op_type = 'fft_c2r' - check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - - if in_dygraph_mode(): - if n is not None: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'last_dim_size', n) - else: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) - else: - inputs = {'X': [x], } - attrs = {'axes': axes, 'normalization': norm, 'forward': forward} - if n is not None: - attrs['last_dim_size'] = n - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference( - _complex_to_real_dtype(dtype)) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out - - -def fftn_c2c(x, s, axes, norm, forward, name): - if is_interger(x): - x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) - elif is_floating_point(x): - x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) - _check_normalization(norm) - if s is not None: - _check_fft_shape(x, s) - - rank = x.ndim - if axes is None: - if s is None: - axes = list(range(rank)) - else: - fft_ndims = len(s) - axes = list(range(rank - fft_ndims, rank)) - else: - _check_fft_axes(x, axes) - axes = _normalize_axes(x, axes) - axes_argsoft = np.argsort(axes).tolist() - axes = [axes[i] for i in axes_argsoft] - if s is not None: - if len(s) != len(axes): - raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". 
- format(len(s), len(axes))) - s = [s[i] for i in axes_argsoft] - - if s is not None: - x = _resize_fft_input(x, s, axes) - op_type = 'fft_c2c' - check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - - if in_dygraph_mode(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) - else: - inputs = {'X': [x], } - attrs = {'axes': axes, 'normalization': norm, 'forward': forward} - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out - - -def fftn_r2c(x, s, axes, norm, forward, onesided, name): - if is_interger(x): - x = paddle.cast(x, paddle.get_default_dtype()) - _check_normalization(norm) - if s is not None: - _check_fft_shape(x, s) - - rank = x.ndim - if axes is None: - if s is None: - axes = list(range(rank)) - else: - fft_ndims = len(s) - axes = list(range(rank - fft_ndims, rank)) - else: - _check_fft_axes(x, axes) - axes = _normalize_axes(x, axes) - axes_argsoft = np.argsort(axes[:-1]).tolist() - axes = [axes[i] for i in axes_argsoft] + [axes[-1]] - if s is not None: - if len(s) != len(axes): - raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". - format(len(s), len(axes))) - s = [s[i] for i in axes_argsoft] + [s[-1]] - - if s is not None: - x = _resize_fft_input(x, s, axes) - - op_type = 'fft_r2c' - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], op_type) - - if in_dygraph_mode(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'onesided', onesided) - out = getattr(_C_ops, op_type)(x, *attrs) - else: - inputs = {'X': [x], } - attrs = { - 'axes': axes, - 'normalization': norm, - 'forward': forward, - 'onesided': onesided, - } - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference( - _real_to_complex_dtype(dtype)) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - - return out - - -def fftn_c2r(x, s, axes, norm, forward, name): - if is_interger(x): - x = paddle.cast(x, _real_to_complex_dtype(paddle.get_default_dtype())) - elif is_floating_point(x): - x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) - _check_normalization(norm) - if s is not None: - _check_fft_shape(x, s) - - rank = x.ndim - if axes is None: - if s is None: - axes = list(range(rank)) - else: - fft_ndims = len(s) - axes = list(range(rank - fft_ndims, rank)) - else: - _check_fft_axes(x, axes) - axes = _normalize_axes(x, axes) - axes_argsoft = np.argsort(axes[:-1]).tolist() - axes = [axes[i] for i in axes_argsoft] + [axes[-1]] - if s is not None: - if len(s) != len(axes): - raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". 
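Note how `fftn_r2c` sorts every axis except the last: the one-sided reduction applies to the final transformed axis, so it must stay in place while the remaining axes are normalized into ascending order. A small sketch of that reordering:

.. code-block:: python

    import numpy as np

    axes = [2, 0, 1]
    order = np.argsort(axes[:-1]).tolist()  # sort all but the last axis
    sorted_axes = [axes[i] for i in order] + [axes[-1]]
    print(sorted_axes)  # [0, 2, 1]; axis 1 keeps the half spectrum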
- format(len(s), len(axes))) - s = [s[i] for i in axes_argsoft] + [s[-1]] - - if s is not None: - fft_input_shape = list(s) - fft_input_shape[-1] = fft_input_shape[-1] // 2 + 1 - x = _resize_fft_input(x, fft_input_shape, axes) - - op_type = 'fft_c2r' - check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], op_type) - - if in_dygraph_mode(): - if s: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'last_dim_size', s[-1]) - else: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward) - out = getattr(_C_ops, op_type)(x, *attrs) - else: - inputs = {'X': [x], } - attrs = {'axes': axes, 'normalization': norm, 'forward': forward} - if s: - attrs["last_dim_size"] = s[-1] - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference( - _complex_to_real_dtype(dtype)) - outputs = {"Out": [out]} - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) - return out From 28bab073e4c1281fc7c580fdabfc672a05b47373 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 26 Oct 2021 12:46:33 +0800 Subject: [PATCH 10/14] Fix the null ptr bug in build_cinn_pass. (#36698) * Fix the null ptr bug in build_cinn_pass. * Add test for empty&ctrl var. --- .../framework/paddle2cinn/build_cinn_pass.cc | 36 ++++++++++++------- .../paddle2cinn/build_cinn_pass_test.cc | 29 +++++++++------ 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index e86a475e59add..0664a63c2b72b 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -114,7 +114,8 @@ void AddOutputVar(const std::unordered_set& output_vars, // var node are from internal nodes std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, const GraphNodeSet& cluster_internals, - const GraphNodeSet& cluster_inputs) { + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs) { // Graph's constructor must has one parameter, and in our code, // the ProgramDesc is useless, so here we pass a temporary object. auto subgraph = std::make_unique(framework::ProgramDesc()); @@ -127,7 +128,12 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, std::unordered_map old_var2new_var; for (auto* var : cluster_internals) { - auto sub_node = subgraph->CreateVarNode(var->Var()); + Node* sub_node; + if (var->Var() == nullptr) { + sub_node = subgraph->CreateEmptyNode(var->Name(), var->NodeType()); + } else { + sub_node = subgraph->CreateVarNode(var->Var()); + } old_var2new_var[var] = sub_node; } @@ -140,7 +146,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, for (auto* var : op->inputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->inputs.emplace_back(old_var2new_var[var]); - } else if (cluster_inputs.count(var)) { + } else if (cluster_inputs.count(var) && var->Var() != nullptr) { if (var->Var()->IsParameter()) { // Parameters have been preserved in scope, compared to feed var, // param just need add new var and don't need add feed op. 
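The crash being fixed comes from `var->Var()` returning null for nodes such as control-dependency variables, which carry no `VarDesc`. A hypothetical Python mirror of the added guard (all helper names are illustrative):

.. code-block:: python

    def clone_var_node(subgraph, var):
        # control-dep vars have no desc; clone them as empty nodes
        # instead of dereferencing a null pointer
        if var.desc is None:
            return subgraph.create_empty_node(var.name, var.node_type)
        return subgraph.create_var_node(var.desc)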
@@ -157,7 +163,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, for (auto* var : op->outputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->outputs.emplace_back(old_var2new_var[var]); - } else { + } else if (cluster_outputs.count(var) && var->Var() != nullptr) { // Create new output var node to guarantee the independency of // subgraph. In other words, the subgraph has no connection with // other graph, even the input graph. @@ -239,14 +245,20 @@ Node* AddSpecialOpToGraph(const GraphNodeSet& cluster_inputs, framework::OpDesc special_op_desc; special_op_desc.SetType(kCinnLaunchOp); std::vector input_names; - std::transform(cluster_inputs.begin(), cluster_inputs.end(), - std::back_inserter(input_names), - [](Node* n) { return n->Name(); }); + std::for_each(cluster_inputs.begin(), cluster_inputs.end(), + [&input_names](Node* n) { + if (n->Var() != nullptr) { + input_names.emplace_back(n->Name()); + } + }); special_op_desc.SetInput("X", input_names); std::vector output_names; - std::transform(cluster_outputs.begin(), cluster_outputs.end(), - std::back_inserter(output_names), - [](Node* n) { return n->Name(); }); + std::for_each(cluster_outputs.begin(), cluster_outputs.end(), + [&output_names](Node* n) { + if (n->Var() != nullptr) { + output_names.emplace_back(n->Name()); + } + }); special_op_desc.SetOutput("Out", output_names); special_op_desc.SetAttr(kCompilationKey, compilation_key); special_op_desc.Flush(); @@ -362,8 +374,8 @@ void SearchAllSubgraphs(Graph* graph) { &cluster_internals); // Create a new subgraph according to the found cluster and // save it in CinnCompiler - std::string compilation_key = cinn_compiler->AddGraph( - CreateNewSubGraph(cluster_set, cluster_internals, cluster_inputs)); + std::string compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph( + cluster_set, cluster_internals, cluster_inputs, cluster_outputs)); // Replace the found cluster to a new special op node ReplaceSubGraphWithSpecialOpNode(cluster_set, cluster_inputs, cluster_outputs, cluster_internals, diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index ab5768e0b2be3..79a27dccb4b00 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include +#include #include "gtest/gtest.h" @@ -50,9 +51,10 @@ inline int CountNode(const std::unordered_set& nodes, inline Node* GetNode(const std::unordered_set& nodes, const std::string& op_name) { - return *std::find_if( - nodes.begin(), nodes.end(), - [&op_name](const Node* node) { return node->Name() == op_name; }); + return *std::find_if(nodes.begin(), nodes.end(), + [&op_name](const Node* node) { + return node->Name().find(op_name) != std::string::npos; + }); } inline bool CheckGraphIndependence(const std::unordered_set& nodes) { @@ -185,22 +187,25 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { ir::Node* mul = g->CreateOpNode(&mul_op); ir::Node* relu = g->CreateOpNode(&relu_op); + ir::Node* v0 = g->CreateEmptyNode("var0", Node::Type::kVariable); ir::Node* v1 = g->CreateVarNode(&var1); ir::Node* v2 = g->CreateVarNode(&var2); ir::Node* v3 = g->CreateVarNode(&var3); ir::Node* v4 = g->CreateVarNode(&var4); ir::Node* v5 = g->CreateVarNode(&var5); ir::Node* v6 = g->CreateVarNode(&var6); + ir::Node* v7 = g->CreateControlDepVar(); // fill op node - mul->inputs = {v1, v2}; + mul->inputs = {v0, v1, v2}; mul->outputs = {v3}; add->inputs = {v3, v4}; add->outputs = {v5}; relu->inputs = {v5}; - relu->outputs = {v6}; + relu->outputs = {v6, v7}; // fill variable node + v0->outputs = {mul}; v1->outputs = {mul}; v2->outputs = {mul}; @@ -213,6 +218,7 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { v5->outputs = {relu}; v6->inputs = {relu}; + v7->inputs = {relu}; return g; } @@ -225,25 +231,28 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { pass->Apply(g.get()); // After search, the graph should as following - // v1 --| - // v2 --| --> kCinnLaunchOp --> v6 + // v0 --| + // v1 --| |--> v6 + // v2 --| --> kCinnLaunchOp |--> v7 // v4 --| const auto& nodes = g->Nodes(); - ASSERT_EQ(nodes.size(), static_cast(5)); + ASSERT_EQ(nodes.size(), static_cast(7)); ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); auto* cinn_op = GetNode(nodes, kCinnLaunchOp); + auto* v0 = GetNode(nodes, "var0"); auto* v1 = GetNode(nodes, "var1"); auto* v2 = GetNode(nodes, "var2"); auto* v4 = GetNode(nodes, "var4"); auto* v6 = GetNode(nodes, "var6"); + auto* v7 = GetNode(nodes, Node::kControlDepVarName); ASSERT_EQ( std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), - std::unordered_set({v1, v2, v4})); - ASSERT_EQ(cinn_op->outputs, std::vector({v6})); + std::unordered_set({v0, v1, v2, v4})); + ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); ASSERT_EQ(v1->outputs, std::vector({cinn_op})); ASSERT_EQ(v6->inputs, std::vector({cinn_op})); From 43dcf235c030fef33b44ac984064099643643670 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Tue, 26 Oct 2021 13:17:34 +0800 Subject: [PATCH 11/14] fix wrong trt dim when input dim is 2 (#36614) * fix wrong trt dim when input dim is 2 * update leaky_relu and instance_norm converter unit test * add instance_norm input dim check --- paddle/fluid/inference/tensorrt/engine.h | 11 ++ paddle/fluid/inference/tensorrt/op_teller.cc | 16 +++ .../plugin/instance_norm_op_plugin.cu | 5 - .../fluid/inference/tests/api/CMakeLists.txt | 7 -- .../test_trt_convert_instance_norm.py | 108 ++++++++++-------- .../inference/test_trt_convert_leaky_relu.py | 85 ++++++++------ 6 files changed, 138 insertions(+), 94 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index edf69dc7aa2b5..0e1b9fe3366ca 100644 --- 
a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -116,6 +116,17 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, input, ShapeStr(shape))); } return nvinfer1::Dims2(shape[1], shape[2]); + } else if (shape.size() == 2UL) { + if (shape[1] == -1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, ShapeStr(shape))); + } + nvinfer1::Dims dims; + dims.nbDims = 1; + dims.d[0] = shape[1]; + return dims; } return nvinfer1::Dims3(shape[1], 1, 1); } else { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 93ecde789c215..13504f444109b 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1064,6 +1064,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Y").size(); return false; } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() != 4) { + VLOG(3) << "The instance_norm op only support 4-dimensional input in " + "tensorrt."; + return false; + } } if (op_type == "leaky_relu") { diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index b7c4fb7c99acf..a9a50543e7bb7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -65,11 +65,6 @@ int InstanceNormPlugin::enqueue(int batch_size, const void *const *inputs, #endif cudaStream_t stream) TRT_NOEXCEPT { const auto &input_dims = this->getInputDims(0); - - PADDLE_ENFORCE_EQ(input_dims.nbDims, 3, - platform::errors::InvalidArgument( - "Input Dims should be 3 (except the batch), got %d", - input_dims.nbDims)); int n = batch_size; int c = input_dims.d[0]; int h = input_dims.d[1]; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 11187a1c79fca..6fd3944a6c528 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -555,10 +555,6 @@ if(WITH_GPU AND TENSORRT_FOUND) if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) inference_download_and_uncompress_without_verify(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() - set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") - if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) - inference_download_and_uncompress_without_verify(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") - endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -577,9 +573,6 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_split_converter_test SRCS trt_split_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) - 
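In TensorRT's implicit-batch mode the leading dimension is dropped, so a 2-D `[N, C]` Paddle shape must become a rank-1 dims of `[C]` rather than falling through to `[C, 1, 1]`. A hypothetical Python rendering of the branches shown above:

.. code-block:: python

    def vec_to_trt_dims(shape):
        # shape[0] is the implicit batch and is never part of the dims
        if len(shape) == 2:
            if shape[1] == -1:
                raise ValueError("enable TRT dynamic_shape for -1 dims")
            return (shape[1],)            # the new rank-1 branch
        if len(shape) == 3:
            return (shape[1], shape[2])   # Dims2
        return (shape[1], 1, 1)           # Dims3 fallback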
inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py index 3f7c2a0fae6f0..acd920ccd57ae 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py @@ -24,8 +24,6 @@ class TrtConvertInstanceNormTest(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - inputs = program_config.inputs - weights = program_config.weights attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -38,52 +36,71 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]], shape_input): - return np.ones(shape_input).astype(np.float32) + return np.random.random(shape_input).astype(np.float32) def generate_input2(attrs: List[Dict[str, Any]], shape_input): - return np.ones(len(shape_input) - 1).astype(np.float32) - - for epsilon in [0.0005, -1, 1]: - dics = [{"epsilon": epsilon}] - - ops_config = [{ - "op_type": "instance_norm", - "op_inputs": { - "X": ["input_data"], - "Scale": ["scale_data"], - "Bias": ["bias_data"] - }, - "op_outputs": { - "Y": ["y_data"], - "SavedMean": ["saved_mean_data"], - "SavedVariance": ["saved_variance_data"] - }, - "op_attrs": dics[0] - }] - ops = self.generate_op_config(ops_config) - shape_input = [1, 3, 64, 64] - program_config = ProgramConfig( - ops=ops, - weights={ - "bias_data": TensorConfig(data_gen=partial( - generate_input2, dics, shape_input)), - "scale_data": TensorConfig(data_gen=partial( - generate_input2, dics, shape_input)) - }, - inputs={ - "input_data": TensorConfig(data_gen=partial( - generate_input1, dics, shape_input)) - }, - outputs=["y_data"]) - - yield program_config + return np.random.random(shape_input[1]).astype(np.float32) + + for batch in [1, 2, 4]: + for shape_input in [[batch, 16], [batch, 32, 64], + [batch, 16, 32, 64]]: + self.in_dim = len(shape_input) + for epsilon in [0.0005, -1, 1]: + dics = [{"epsilon": epsilon}] + ops_config = [{ + "op_type": "instance_norm", + "op_inputs": { + "X": ["input_data"], + "Scale": ["scale_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Y": ["y_data"], + "SavedMean": ["saved_mean_data"], + "SavedVariance": ["saved_variance_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={ + "bias_data": TensorConfig(data_gen=partial( + generate_input2, dics, shape_input)), + "scale_data": TensorConfig(data_gen=partial( + generate_input2, dics, shape_input)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, dics, shape_input)) + }, + outputs=["y_data"]) + + yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 
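Replacing `np.ones` with `np.random.random` in `generate_input1` is more than cosmetic: an all-ones input has zero variance in every (n, c) slice, so instance_norm collapses to the bias and a broken converter could still match the baseline. A NumPy reference for the 4-D case, as a sketch:

.. code-block:: python

    import numpy as np

    def instance_norm_ref(x, scale, bias, eps):
        # normalize each (n, c) slice over its H x W spatial extent
        mean = x.mean(axis=(2, 3), keepdims=True)
        var = x.var(axis=(2, 3), keepdims=True)
        y = (x - mean) / np.sqrt(var + eps)
        return scale.reshape(1, -1, 1, 1) * y + bias.reshape(1, -1, 1, 1)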
64, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + if self.in_dim == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 4]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + elif self.in_dim == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1, 4]} + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 256] + } + self.dynamic_shape.opt_input_shape = {"input_data": [2, 3, 32]} + elif self.in_dim == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 4, 4] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 128, 256] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 32, 32] + } def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} @@ -91,8 +108,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - inputs = program_config.inputs - if dynamic_shape: + if dynamic_shape or self.in_dim != 4: return 0, 3 return 1, 2 @@ -108,7 +124,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-2 + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -117,7 +133,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num(attrs, - True), 1e-2 + True), 1e-5 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py index 2a8206e58e00e..c647849fa7ee4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py @@ -27,46 +27,59 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([1, 3, 64, 64]).astype(np.float32) - - for alpha in [0.02, 1.0, 100.0, -1.0, 0.0]: - for X_scale in [1.0, 100.0, 0.01, -0.1, 0.0]: - dics = [{ - "alpha": alpha, - "use_mkldnn": True, - "enable_int8": True, - "X_scale": X_scale - }] - - ops_config = [{ - "op_type": "leaky_relu", - "op_inputs": { - "X": ["input_data"], - }, - "op_outputs": { - "Out": ["y_data"], - }, - "op_attrs": dics[0] - }] - ops = self.generate_op_config(ops_config) - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": - TensorConfig(data_gen=partial(generate_input1, dics)) - }, - outputs=["y_data"]) - - yield program_config + def generate_input1(shape): + return np.random.random(shape).astype(np.float32) + + for batch in [1, 2]: + for shape in [[batch, 64], [batch, 32, 64], [batch, 8, 32, 32]]: + self.input_dim = len(shape) + for alpha in [0.02, 1.0, 100.0, -1.0, 0.0]: + dics = [{"alpha": alpha}] + ops_config = [{ + "op_type": "leaky_relu", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": { + "Out": ["y_data"], + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, shape)) + }, + outputs=["y_data"]) + + 
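One invariant every profile above respects is that TensorRT requires `min <= opt <= max` per dimension, for example the 4-D instance_norm profile:

.. code-block:: python

    min_s, opt_s, max_s = [1, 1, 4, 4], [2, 3, 32, 32], [4, 32, 128, 256]
    assert all(lo <= o <= hi for lo, o, hi in zip(min_s, opt_s, max_s))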
yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [4, 3, 64, 64]} + if self.input_dim == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} + self.dynamic_shape.max_input_shape = {"input_data": [64, 128]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + elif self.input_dim == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]} + self.dynamic_shape.max_input_shape = { + "input_data": [64, 128, 256] + } + self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 64]} + elif self.input_dim == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 8, 8, 4] + } + self.dynamic_shape.max_input_shape = { + "input_data": [64, 64, 128, 128] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 16, 64, 32] + } def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} From 3523bbe86376878fcda52b2dcc152db76971db87 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 26 Oct 2021 13:56:18 +0800 Subject: [PATCH 12/14] [NPU] fix argsort op, test=develop (#36576) * [NPU] fix argsort op, test=develop * remove debug files, test=develop * fix typo, test=develop * address review comments, test=develop --- paddle/fluid/operators/arg_max_op_xpu.cc | 2 +- paddle/fluid/operators/arg_min_op_npu.cc | 2 +- paddle/fluid/operators/argsort_op_npu.cc | 345 ++++++++---------- paddle/fluid/operators/cumsum_op_npu.cc | 2 +- paddle/fluid/operators/dropout_op_npu.cc | 2 +- paddle/fluid/operators/expand_v2_op_npu.cc | 2 +- paddle/fluid/operators/huber_loss_op_npu.cc | 5 +- .../fluid/operators/interpolate_v2_op_npu.cc | 2 +- paddle/fluid/operators/is_empty_op_npu.cc | 2 +- paddle/fluid/operators/log_loss_op_npu.cc | 2 +- paddle/fluid/operators/meshgrid_op_npu.cc | 2 +- paddle/fluid/operators/pad3d_op_npu.cc | 2 +- .../operators/reduce_ops/reduce_max_op_npu.cc | 2 +- .../reduce_ops/reduce_prod_op_npu.cc | 2 +- ...igmoid_cross_entropy_with_logits_op_npu.cc | 2 +- paddle/fluid/operators/slice_op_npu.cc | 2 +- paddle/fluid/operators/tril_triu_op_npu.cc | 2 +- .../unittests/npu/test_argsort_op_npu.py | 8 +- 18 files changed, 171 insertions(+), 217 deletions(-) diff --git a/paddle/fluid/operators/arg_max_op_xpu.cc b/paddle/fluid/operators/arg_max_op_xpu.cc index 8060b5cf755c0..71ec26ea5a792 100644 --- a/paddle/fluid/operators/arg_max_op_xpu.cc +++ b/paddle/fluid/operators/arg_max_op_xpu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc index f776412c16239..cc81e320080b7 100644 --- a/paddle/fluid/operators/arg_min_op_npu.cc +++ b/paddle/fluid/operators/arg_min_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index e36dd322e0ea1..f2a57b4b9bdfb 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,156 +18,142 @@ limitations under the License. */ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void TranposeNPU(const framework::ExecutionContext& ctx, + const aclrtStream& stream, std::vector* perm, + const Tensor& in, Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(in) + .AddInput(std::move(*perm)) + .AddOutput(*out) + .Run(stream); +} + +static void CastToInt64(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& in, + Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Cast") + .AddInput(in) + .AddOutput(*out) + .AddAttr("dst_type", ACL_INT64) + .Run(stream); +} + +template class ArgsortNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); auto* indices = ctx.Output("Indices"); - indices->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + bool descending = ctx.Attr("descending"); - int32_t axis = ctx.Attr("axis"); - auto in_dims = indices->dims(); + auto in_dims = input->dims(); axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - bool descending = ctx.Attr("descending"); - auto stream = - ctx.template device_context() - .stream(); - framework::NPUAttributeMap sort_attr_input = { - {"axis", static_cast(-1)}, {"descending", descending}}; + + auto stream = ctx.template device_context().stream(); + framework::NPUAttributeMap attr = {{"axis", -1}, + {"descending", descending}}; + + Tensor indices_tmp(framework::proto::VarType::INT32); + indices_tmp.Resize(indices->dims()); if (axis == -1 || axis + 1 == in_dims.size()) { - const auto& sort_runner = - NpuOpRunner("Sort", {*input}, {*output, *indices}, sort_attr_input); - sort_runner.Run(stream); + output->mutable_data(ctx.GetPlace()); + indices_tmp.mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr); + runner.Run(stream); } else { - // transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); + std::vector perm; + for (int64_t i = 0; i < in_dims.size(); i++) { + perm.emplace_back(i); } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; + std::swap(perm[axis], perm[in_dims.size() - 1]); + + std::vector shape; + for (size_t i = 0; i < perm.size(); i++) { + shape.emplace_back(in_dims[perm[i]]); } - framework::NPUAttributeMap trans_attr_input = {{"perm", trans}}; - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_input_runner = - NpuOpRunner("TransposeD", {*input}, {trans_input}, trans_attr_input); - trans_input_runner.Run(stream); - Tensor trans_indices; - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_indice_runner = NpuOpRunner( - "TransposeD", {*indices}, {trans_indices}, trans_attr_input); - trans_indice_runner.Run(stream); - Tensor trans_output; + auto trans_dims = framework::make_ddim(shape); + + Tensor trans_input(input->type()); + trans_input.Resize(trans_dims); + TranposeNPU(ctx, stream, &perm, *input, &trans_input); + + Tensor trans_output(input->type()); + Tensor trans_indices(framework::proto::VarType::INT32); trans_output.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_output_runner = NpuOpRunner( - "TransposeD", {*output}, {trans_output}, trans_attr_input); - trans_output_runner.Run(stream); - const auto& sort_runner = - NpuOpRunner("Sort", {trans_input}, {trans_output, trans_indices}, - sort_attr_input); - sort_runner.Run(stream); - // transpose back - const auto& trans_indices_back_runner = NpuOpRunner( - "TransposeD", {trans_indices}, {*indices}, trans_attr_input); - trans_indices_back_runner.Run(stream); - const auto& trans_output_back_runner = NpuOpRunner( - "TransposeD", {trans_output}, {*output}, trans_attr_input); - trans_output_back_runner.Run(stream); + trans_indices.mutable_data(trans_dims, ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Sort", {trans_input}, + {trans_output, trans_indices}, attr); + runner.Run(stream); + + TranposeNPU(ctx, stream, &perm, trans_output, output); + TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); } + CastToInt64(ctx, stream, indices_tmp, indices); } }; -template -static void ReshapeNPU(const framework::Tensor* input, - const std::vector& input_shapes, - framework::Tensor* output) { - output->ShareDataWith(*input); - 
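The rewritten kernel sorts an arbitrary axis by swapping it with the last dimension, running `Sort` there (the only axis the NPU op handles), and swapping back; because a two-element swap is its own inverse, the same permutation restores the layout. A NumPy sketch of the ascending case:

.. code-block:: python

    import numpy as np

    def argsort_along(x, axis):
        perm = list(range(x.ndim))
        perm[axis], perm[-1] = perm[-1], perm[axis]  # swap, an involution
        xt = np.transpose(x, perm)
        idx = np.argsort(xt, axis=-1)
        out = np.take_along_axis(xt, idx, axis=-1)
        return np.transpose(out, perm), np.transpose(idx, perm)

    x = np.random.rand(2, 5, 3)
    out, _ = argsort_along(x, axis=1)
    assert np.allclose(out, np.sort(x, axis=1))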
output->Resize(framework::make_ddim(std::move(input_shapes))); -} - template static void FullAssignNPU(const framework::ExecutionContext& ctx, - Type ind_lastdim, Type outer_dim, - const framework::DDim& trans_dims, - const framework::Tensor* input, - const framework::Tensor* indices, - framework::Tensor* t_out) { - // reshape input - Type input_shape = ind_lastdim * outer_dim; - std::vector input_shapes = {input_shape}; - Tensor input_reshape_tensor(input->type()); - ReshapeNPU(input, input_shapes, &input_reshape_tensor); - // reshape index - std::vector index_shapes = {outer_dim, ind_lastdim}; - framework::DDim ind_2d = framework::make_ddim({outer_dim, ind_lastdim}); - Tensor ind_2d_tensor(indices->type()); - ReshapeNPU(indices, index_shapes, &ind_2d_tensor); - // range_flatten_index - std::vector range_flatten_index; - for (Type i = 0; i < input_shape; i += ind_lastdim) { - range_flatten_index.push_back(static_cast(i)); + const aclrtStream& stream, + const framework::DDim in_dims, const Tensor& input, + const Tensor& indices, Tensor* t_out) { + const int64_t input_height = + framework::product(framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + Tensor input_tmp; + input_tmp.ShareDataWith(input); + input_tmp.Resize( + framework::make_ddim(std::vector{input_height * input_width})); + + Tensor indices_tmp; + indices_tmp.ShareDataWith(indices); + indices_tmp.Resize( + framework::make_ddim(std::vector{input_height, input_width})); + + std::vector indexs_value; + for (Type i = 0; i < input_height; i++) { + indexs_value.push_back(i * input_width); } - Tensor range_flatten_index_tensor(framework::proto::VarType::INT32); - range_flatten_index_tensor.Resize(framework::make_ddim({outer_dim})); - range_flatten_index_tensor.mutable_data( - {static_cast(range_flatten_index.size())}, ctx.GetPlace()); - TensorFromVector(range_flatten_index, ctx.device_context(), - &range_flatten_index_tensor); - Tensor range_flatten_index_expand_tensor(range_flatten_index_tensor.type()); - std::vector flatten_shape = {outer_dim, 1}; - ReshapeNPU(&range_flatten_index_tensor, flatten_shape, - &range_flatten_index_expand_tensor); - auto stream = - ctx.template device_context() - .stream(); - Tensor ind_2d_add_tensor; - ind_2d_add_tensor.mutable_data(ind_2d, ctx.GetPlace()); - const auto& runner_ind_2d_tensor = NpuOpRunner( - std::string("Add"), {ind_2d_tensor, range_flatten_index_expand_tensor}, - {ind_2d_add_tensor}, {}); - runner_ind_2d_tensor.Run(stream); - Tensor ind_reshape_tensor(ind_2d_add_tensor.type()); - ReshapeNPU(&ind_2d_add_tensor, input_shapes, &ind_reshape_tensor); - Tensor ind_reshape_expand_tensor(ind_reshape_tensor.type()); - std::vector ind_shape = {input_shape, 1}; - ReshapeNPU(&ind_reshape_tensor, ind_shape, &ind_reshape_expand_tensor); - // expand_index - Tensor input_scatter_tensor; - input_scatter_tensor.Resize({input_shape}); - input_scatter_tensor.mutable_data(ctx.GetPlace()); - Tensor input_scatter_tensor_ori; - input_scatter_tensor_ori.Resize({input_shape}); - input_scatter_tensor_ori.mutable_data(ctx.GetPlace()); - std::vector trans_shapes; - - for (int i = 0; i < trans_dims.size(); i++) { - trans_shapes.push_back(trans_dims[i]); - } - NpuOpRunner runner_scatter; - runner_scatter.SetType("TensorScatterUpdate") - .AddInput(input_scatter_tensor_ori) - .AddInput(ind_reshape_expand_tensor) - .AddInput(input_reshape_tensor) - .AddOutput(input_scatter_tensor); - runner_scatter.Run(stream); - 
framework::TensorCopy(input_scatter_tensor, ctx.GetPlace(), - ctx.template device_context(), - t_out); - t_out->Resize(framework::make_ddim(trans_shapes)); + Tensor indexs_tmp(indices.type()); + framework::TensorFromVector(indexs_value, ctx.device_context(), + &indexs_tmp); + indexs_tmp.Resize( + framework::make_ddim(std::vector{input_height, 1})); + + Tensor indices_index(indices.type()); + indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); + const auto& runner_add = + NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); + runner_add.Run(stream); + + indices_index.Resize( + framework::make_ddim(std::vector{input_height * input_width})); + + t_out->mutable_data(ctx.GetPlace()); + Tensor out_tmp(t_out->type()); + out_tmp.ShareDataWith(*t_out); + + const auto& runner = + NpuOpRunner("TensorScatterUpdate", {input_tmp, indices_index, input_tmp}, + {out_tmp}, {}); + runner.Run(stream); } -template +template class ArgsortGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -172,75 +161,42 @@ class ArgsortGradNPUKernel : public framework::OpKernel { auto* dX = ctx.Output(framework::GradVarName("X")); auto* dO = ctx.Input(framework::GradVarName("Out")); int axis = ctx.Attr("axis"); + auto in_dims = indices->dims(); axis = (axis < 0) ? (in_dims.size() + axis) : axis; - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - dX->mutable_data(ctx.GetPlace()); - Tensor dxt; - dxt.mutable_data(dX->dims(), place); - const auto& runner_flatten = - NpuOpRunner(std::string("Flatten"), {*dX}, {dxt}, {}); - runner_flatten.Run(stream); - FillNpuTensorWithConstant(&dxt, static_cast(0)); if (dO->numel() == 0) return; - // Do full assig n - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t outer_dim = framework::product( - framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t ind_lastdim = in_dims[in_dims.size() - 1]; - FullAssignNPU(ctx, ind_lastdim, outer_dim, in_dims, dO, - indices, dX); + auto stream = ctx.template device_context().stream(); + + if (axis == -1 || axis + 1 == in_dims.size()) { + FullAssignNPU(ctx, stream, in_dims, *dO, *indices, dX); } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); + std::vector perm; + for (int64_t i = 0; i < in_dims.size(); i++) { + perm.emplace_back(i); } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - std::vector axis; - for (size_t i = 0; i < trans.size(); i++) { - axis.push_back(in_dims[trans[i]]); + std::swap(perm[axis], perm[in_dims.size() - 1]); + + std::vector shape; + for (size_t i = 0; i < perm.size(); i++) { + shape.emplace_back(in_dims[perm[i]]); } - framework::NPUAttributeMap attr_input = {{"perm", trans}}; - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - // Do transpose - const auto& runner_transpose_dx = NpuOpRunner( - std::string("TransposeD"), {*dO}, {trans_dO}, {attr_input}); - runner_transpose_dx.Run(stream); - const auto& runner_transpose_ind = NpuOpRunner( - std::string("TransposeD"), {*indices}, {trans_ind}, {attr_input}); - runner_transpose_ind.Run(stream); - - const int64_t outer_dim = 
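`FullAssignNPU` scatters the upstream gradient back through the sort: viewing the data as (rows, width), it converts the per-row sorted positions into flat offsets `i * width + idx` and hands those to `TensorScatterUpdate`. A NumPy sketch of the same index arithmetic:

.. code-block:: python

    import numpy as np

    def argsort_grad_last_axis(dout, indices):
        width = indices.shape[-1]
        ind2d = indices.reshape(-1, width)
        offsets = np.arange(ind2d.shape[0])[:, None] * width
        dx = np.empty(dout.size, dtype=dout.dtype)
        dx[(ind2d + offsets).ravel()] = dout.ravel()  # dx[i, idx] = dout
        return dx.reshape(dout.shape)

    x = np.random.rand(4, 6)
    idx = np.argsort(x, axis=-1)
    dx = argsort_grad_last_axis(np.ones_like(x), idx)
    assert dx.shape == x.shape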
framework::product( - framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t ind_lastdim = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssignNPU(ctx, ind_lastdim, outer_dim, trans_dims, - &trans_dO, &trans_ind, &tmp_out); - - // transpose back - const auto& runner_transpose_out = NpuOpRunner( - std::string("TransposeD"), {tmp_out}, {*dX}, {attr_input}); - runner_transpose_out.Run(stream); + auto trans_dims = framework::make_ddim(shape); + + Tensor trans_dout(dO->type()); + Tensor trans_ids(indices->type()); + trans_dout.Resize(trans_dims); + trans_ids.Resize(trans_dims); + + TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); + TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); + + Tensor trans_dx(dO->type()); + trans_dx.Resize(trans_dims); + FullAssignNPU(ctx, stream, trans_dims, trans_dout, trans_ids, + &trans_dx); + + TranposeNPU(ctx, stream, &perm, trans_dx, dX); } } }; @@ -251,11 +207,8 @@ class ArgsortGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - argsort, ops::ArgsortNPUKernel, - ops::ArgsortNPUKernel); +REGISTER_OP_NPU_KERNEL(argsort, ops::ArgsortNPUKernel, + ops::ArgsortNPUKernel); -REGISTER_OP_NPU_KERNEL(argsort_grad, - ops::ArgsortGradNPUKernel, - ops::ArgsortGradNPUKernel); +REGISTER_OP_NPU_KERNEL(argsort_grad, ops::ArgsortGradNPUKernel, + ops::ArgsortGradNPUKernel); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 486e85b0f0dfc..0c0eb1577e802 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/cum_op.h" diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index b5c8bfff0dc39..50d247d9c0590 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include #include diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index 85fe86a9e606f..4b0e0770573a6 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index a942615594154..33cbaec4dfc46 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -1,13 +1,16 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/huber_loss_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index d893fbd019628..b30c7ac810c01 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc index 9155afecd021b..01579abd74d23 100644 --- a/paddle/fluid/operators/is_empty_op_npu.cc +++ b/paddle/fluid/operators/is_empty_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/is_empty_op.h" diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index a8d906d4b5cad..74b44165dcc4c 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/operators/log_loss_op.h" #include diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc index 9605fa092f069..f22e2e178ef85 100644 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ b/paddle/fluid/operators/meshgrid_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/meshgrid_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 3a1fba9455003..483c895e0e65a 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index b343fc88d7b8d..5efc7e9b869b7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc index 834b63f199e37..b5f571c7fea2c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 6f3b40dbbf394..400a09330a348 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 52351a98bce37..a9092d7e2abbc 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/slice_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index cdabc28255b51..6e7e03911370f 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py index 824266578b9e5..2589b2a316a16 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py @@ -18,7 +18,7 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest, _set_use_system_allocator +from op_test import OpTest import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -63,9 +63,6 @@ def set_npu(self): self.__class__.use_npu = True self.__class__.no_need_check_grad = True - def init_kernel_type(self): - self.use_mkldnn = False - def init_inputshape(self): self.input_shape = (2, 2, 2, 3, 3) @@ -158,7 +155,8 @@ def set_npu(self): self.__class__.use_npu = True def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + self.check_grad_with_place( self.place, ["X"], "Out", max_relative_error=0.03) class TestArgsortOpAxis1NPUFP32(TestArgsortOpAxis0NPUFP32): From 9aeca2f1805b48421c402c66f6087972c55cab33 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Tue, 26 Oct 2021 14:01:15 +0800 Subject: [PATCH 13/14] Move fused_attention and fused_feedforward functional api path to incubate (#36704) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the Python API interfaces newly added in PRs #35905 and #35843 into the incubate directory. --- paddle/fluid/operators/fused/CMakeLists.txt | 2 -- .../fluid/tests/unittests/CMakeLists.txt | 1 - .../tests/unittests/test_fused_attention_op.py | 3 ++- .../unittests/test_fused_feedforward_op.py | 12 ++++++------ .../paddle/incubate/nn/functional/__init__.py | 18 ++++++++++++++++++ .../nn/functional/fused_transformer.py | 10 +++++----- python/paddle/nn/functional/__init__.py | 4 ---- 7 files changed, 31
insertions(+), 19 deletions(-) create mode 100644 python/paddle/incubate/nn/functional/__init__.py rename python/paddle/{ => incubate}/nn/functional/fused_transformer.py (97%) diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 0e2dae75071e7..eec925b2c057b 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -81,10 +81,8 @@ if (WITH_GPU OR WITH_ROCM) nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) - op_library(fused_feedforward_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_feedforward);\n") - # fused_attention_op op_library(fused_attention_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_attention);\n") diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 5b1c02e71abce..d8212216d3f18 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -98,7 +98,6 @@ foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) endforeach() if(NOT WITH_GPU) - LIST(REMOVE_ITEM TEST_OPS test_fused_feedforward_op) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op) endif() diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index a5578d71c5cd0..1e0d83f8ac775 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -18,6 +18,7 @@ import paddle.nn as nn import paddle.fluid.core as core import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f from paddle.nn.layer.norm import LayerNorm from paddle.nn.layer.common import Linear, Dropout from paddle.nn.layer.transformer import _convert_attention_mask @@ -190,7 +191,7 @@ def GetFusedAttentionOut(self): if attn_mask is not None: attn_mask = _convert_attention_mask(attn_mask, x.dtype) - final_out = F.fused_multi_head_attention( + final_out = incubate_f.fused_multi_head_attention( x, qkv_weight_tensor, out_linear_weight, self.pre_layer_norm, ln1_scale, ln1_bias, ln2_scale, ln2_bias, epsilon, qkv_bias_tensor, out_linear_bias, attn_mask, self.dropout_prob, diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index d926512b592d7..5ea43d2edf0e6 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -18,6 +18,7 @@ import paddle.fluid.core as core from paddle.nn.layer import transformer import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f from paddle.nn.layer.norm import LayerNorm from paddle.nn.layer.common import Linear, Dropout import unittest @@ -121,7 +122,7 @@ def FusedFFN(self): ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False) ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False) x = paddle.to_tensor(self.src, stop_gradient=False) - out = F.fused_feedforward( + out = incubate_f.fused_feedforward( x, linear1_weight, linear2_weight, @@ -215,7 +216,7 @@ def test_static(self): ln2_scale = 
paddle.static.data(name='ln2_scale', shape=[d_model]) ln2_bias = paddle.static.data(name='ln2_scale', shape=[d_model]) - fused_out = F.fused_feedforward( + fused_out = incubate_f.fused_feedforward( x, linear1_weight, linear2_weight, @@ -295,8 +296,7 @@ def test_dtype(): name='linear1_weight', shape=[1, 10, 10], dtype="float32") linear2_weight = paddle.static.data( name='linear2_weight', shape=[1, 10, 10], dtype="float32") - paddle.nn.functional.fused_feedforward(x, linear1_weight, - linear2_weight) + incubate_f.fused_feedforward(x, linear1_weight, linear2_weight) self.assertRaises(TypeError, test_dtype) @@ -307,7 +307,7 @@ def test_dropout_rate_type(): name='linear1_weight1', shape=[10, 10], dtype="float32") linear2_weight = paddle.static.data( name='linear2_weight1', shape=[10, 10], dtype="float32") - paddle.nn.functional.fused_feedforward( + incubate_f.fused_feedforward( x, linear1_weight, linear2_weight, dropout1_rate="a") self.assertRaises(TypeError, test_dropout_rate_type) @@ -319,7 +319,7 @@ def test_dropout_rate_value(): name='linear1_weight2', shape=[10, 10], dtype="float32") linear2_weight = paddle.static.data( name='linear2_weight2', shape=[10, 10], dtype="float32") - paddle.nn.functional.fused_feedforward( + incubate_f.fused_feedforward( x, linear1_weight, linear2_weight, dropout2_rate=-1) self.assertRaises(ValueError, test_dropout_rate_value) diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py new file mode 100644 index 0000000000000..4d1c3eee025b0 --- /dev/null +++ b/python/paddle/incubate/nn/functional/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .fused_transformer import fused_multi_head_attention +from .fused_transformer import fused_feedforward + +__all__ = ['fused_multi_head_attention', 'fused_feedforward'] diff --git a/python/paddle/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py similarity index 97% rename from python/paddle/nn/functional/fused_transformer.py rename to python/paddle/incubate/nn/functional/fused_transformer.py index d07927491491b..75bf9f10cef31 100644 --- a/python/paddle/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
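For callers, the visible effect of this patch is the import path. A sketch of the before-and-after call sites that the updated tests switch to:

.. code-block:: python

    # before (removed from paddle.nn.functional by this patch):
    # from paddle.nn.functional import fused_feedforward
    # from paddle.nn.functional import fused_multi_head_attention

    # after:
    from paddle.incubate.nn.functional import fused_feedforward
    from paddle.incubate.nn.functional import fused_multi_head_attention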
diff --git a/python/paddle/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
similarity index 97%
rename from python/paddle/nn/functional/fused_transformer.py
rename to python/paddle/incubate/nn/functional/fused_transformer.py
index d07927491491b..75bf9f10cef31 100644
--- a/python/paddle/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...fluid.layer_helper import LayerHelper
-from ...fluid.framework import in_dygraph_mode
-from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.framework import in_dygraph_mode
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
 from paddle import _C_ops
 
 __all__ = []
@@ -90,7 +90,7 @@ def fused_feedforward(x,
             x = paddle.to_tensor(x_data)
             linear1_weight = paddle.to_tensor(linear1_weight_data)
             linear2_weight = paddle.to_tensor(linear2_weight_data)
-            out = paddle.nn.functional.fused_feedforward(x, linear1_weight, linear2_weight)
+            out = paddle.incubate.nn.functional.fused_feedforward(x, linear1_weight, linear2_weight)
             print(out.numpy().shape)
             # (1, 8, 8)
     """
@@ -244,7 +244,7 @@ def fused_multi_head_attention(x,
             # required: gpu
             import paddle
-            import paddle.nn.functional as F
+            import paddle.incubate.nn.functional as F
 
             # input: [batch_size, seq_len, embed_dim]
             x = paddle.rand(shape=(2, 4, 128), dtype="float32")

diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 2c0c4461330cd..1af53e0826be8 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -61,7 +61,6 @@
 from .conv import conv1d  # noqa: F401
 from .conv import conv1d_transpose  # noqa: F401
 from .common import linear  # noqa: F401
-from .fused_transformer import fused_multi_head_attention  # noqa: F401
 from .conv import conv2d  # noqa: F401
 from .conv import conv2d_transpose  # noqa: F401
 from .conv import conv3d  # noqa: F401
@@ -111,7 +110,6 @@
 from .vision import pixel_shuffle  # noqa: F401
 from .input import one_hot  # noqa: F401
 from .input import embedding  # noqa: F401
-from .fused_transformer import fused_feedforward  # noqa: F401
 from ...fluid.layers import gather_tree  # noqa: F401
 from ...fluid.layers import temporal_shift  # noqa: F401
 
@@ -213,7 +211,5 @@
     'layer_norm',
     'instance_norm',
     'class_center_sample',
-    'fused_feedforward',
-    'fused_multi_head_attention',
     'sparse_attention',
 ]
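With the re-exports above removed, `paddle.nn.functional.fused_feedforward` and `paddle.nn.functional.fused_multi_head_attention` no longer resolve after this patch. Existing callers migrate as in this dygraph sketch, which follows the docstring example retained in the renamed module; the (8, 8) weight shapes are an assumption consistent with the documented (1, 8, 8) output, and a CUDA build is assumed since the op is registered as CUDA-only:

    import numpy as np
    import paddle
    import paddle.incubate.nn.functional as incubate_f

    # d_model = 8 and dim_feedforward = 8, matching the docstring example.
    x = paddle.to_tensor(np.random.random((1, 8, 8)).astype("float32"))
    linear1_weight = paddle.to_tensor(np.random.random((8, 8)).astype("float32"))
    linear2_weight = paddle.to_tensor(np.random.random((8, 8)).astype("float32"))

    out = incubate_f.fused_feedforward(x, linear1_weight, linear2_weight)
    print(out.numpy().shape)  # (1, 8, 8)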
From eb9ef8850c88c63ca061006a2d7250de6e41922e Mon Sep 17 00:00:00 2001
From: Huihuang Zheng
Date: Tue, 26 Oct 2021 14:08:25 +0800
Subject: [PATCH 14/14] Modify paddle.static.nn.cond doc (#36694)

Update `cond` English document
---
 python/paddle/fluid/layers/control_flow.py | 41 ++++++++++++----------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index f444b5e9c0e5f..af2316a9a443e 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -2316,10 +2316,13 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
            the same shape because of dataflow model of PaddlePaddle while the
            tensors in the tuples or the lists can have different shapes.
 
-        2. Any tensors or operations created outside of ``true_fn`` and
-        ``false_fn`` will be executed regardless of which branch is selected at
-        runtime. This has frequently surprised users who expected a lazy
-        semantics. For example:
+        2. This API can be used in both static mode and dygraph mode. In
+        dygraph mode, it runs only one branch, chosen by the condition.
+
+        3. In static mode, any tensors or operations created outside or inside
+        of ``true_fn`` and ``false_fn`` are built into the network regardless
+        of which branch is selected at runtime. This has frequently surprised
+        users who expected lazy semantics. For example:
 
         .. code-block:: python
 
@@ -2328,9 +2331,11 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
 
             a = paddle.zeros((1, 1))
             b = paddle.zeros((1, 1))
             c = a * b
-            out = paddle.nn.cond(a < b, lambda: a + c, lambda: b * b)
+            out = paddle.static.nn.cond(a < b, lambda: a + c, lambda: b * b)
 
-        No matter whether ``a < b`` , ``c = a * b`` will run.
+        No matter the value of ``a < b``, ``c = a * b`` is built into the
+        network and run. ``a + c`` and ``b * b`` are also built, but only one
+        branch is executed at runtime.
 
         Args:
             pred(Tensor): A boolean tensor whose numel should be 1. The boolean
@@ -2366,24 +2371,24 @@ def cond(pred, true_fn=None, false_fn=None, name=None):
             # return 3, 2
             #
-
             def true_func():
-                return paddle.fill_constant(shape=[1, 2], dtype='int32',
-                                            value=1), paddle.fill_constant(shape=[2, 3],
-                                                                           dtype='bool',
-                                                                           value=True)
+                return paddle.full(shape=[1, 2], dtype='int32',
+                                   fill_value=1), paddle.full(shape=[2, 3],
+                                                              dtype='bool',
+                                                              fill_value=True)
 
             def false_func():
-                return paddle.fill_constant(shape=[3, 4], dtype='float32',
-                                            value=3), paddle.fill_constant(shape=[4, 5],
-                                                                           dtype='int64',
-                                                                           value=2)
+                return paddle.full(shape=[3, 4], dtype='float32',
+                                   fill_value=3), paddle.full(shape=[4, 5],
+                                                              dtype='int64',
+                                                              fill_value=2)
+
-            x = paddle.fill_constant(shape=[1], dtype='float32', value=0.1)
-            y = paddle.fill_constant(shape=[1], dtype='float32', value=0.23)
+            x = paddle.full(shape=[1], dtype='float32', fill_value=0.1)
+            y = paddle.full(shape=[1], dtype='float32', fill_value=0.23)
             pred = paddle.less_than(x=x, y=y, name=None)
-            ret = paddle.nn.cond(pred, true_func, false_func)
+            ret = paddle.static.nn.cond(pred, true_func, false_func)
             # ret is a tuple containing 2 tensors
             # ret[0] = [[1 1]]
             # ret[1] = [[ True True True]
             #           [ True True True]]
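The behaviour the rewritten notes describe can be checked directly: in dygraph mode only the selected branch runs, while in static mode both branches are built and one executes. A minimal dygraph sketch of note 2:

    import paddle

    # Dygraph (the default since Paddle 2.0): pred has a concrete value here,
    # so cond invokes exactly one of the two callables.
    x = paddle.full(shape=[1], dtype='float32', fill_value=0.1)
    y = paddle.full(shape=[1], dtype='float32', fill_value=0.23)
    pred = paddle.less_than(x, y)

    ret = paddle.static.nn.cond(pred, lambda: x + y, lambda: x - y)
    print(ret)  # Tensor with value [0.33]; the false branch is never invoked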