From 4a7f1a0d840244f41c180a93101bae50bf487879 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Tue, 2 Nov 2021 14:37:17 +0800
Subject: [PATCH] Add Intermediate Kernel API for refactor Tensor Lib (#36914)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* initial tensor design & sign kernel demo

* add move constructor for meta & add lodtensor

* add dirs & sign xpu kernel

* add mean cpu&cuda kernel impl

* move sign & mean xpu & npu kernel

* add selected_rows basic impl

* refactor design, BaseTensor to DenseTensor, etc.

* add scale mkldnn kernel

* polish xpu & npu impl details

* fix mkldnn reuse compile failed

* change tensor operation lib name

* rename util filename

* add more comments

* change TensorImplInterface to TensorInterface

* add kernel key and factory

* remove MKLDNNTensorMeta, add MKLDNNDenseTensor

* change XXDeviceContext to XXContext

* add base kernel registrar utils & test on sign

* replace boost::any by paddle::any

* fix several ci failed

* fix npu compile error

* add ordered map util

* fix multiple ordered_map compile errors

* move dev into include dir

* support sign op in static op run

* fix static op run error

* fix new executor compile failed

* add dygraph branch & remove sign_op.h

* fix test_infer_no_need_buffer_slots

* fix rocm compile link error

* fix unitybuild error & clear glog

* fix npu compile failed

* skip quant trans test

* fix part windows compile problem

* fix xpu enforce error

* fix inference test failed

* remove ordered_map to solve quant failed

* fix part of rocm compile failed

* add more register kernels

* revert scale kernel temporarily

* fix code format error

* add new kernel registrar macro

* rename top to tcmpt

* revert xpu, npu, mkldnn impl & remove op def

* add kernel args parse functor to auto parse args

* revert some change & add scale kernels

* add op proto in dygraph kernelcontext building

* polish kernel dispatch logic & naming rule

* fix scale kernel match error

* fix scale test failed

* add mean API and unittest

* test mean api success

* add branch to solve compiled error

* skip clang format error

* add mean skip rule in op_library

* add dot kernel, api and unittest (#6)

* remove old kernel and add symbol link

* fix dot compiled failed

* add macro for module declare

* fix npu and xpu compile error

* revert sign, mean, scale, dot kernel removing

* add comment for keeping old kernel impl

* fix mutable_data error

* fix bfloat16 conflict

* fix inference undef error

* adapt to msvc compile rules

* polish comment for template inst

* add cmake template instantiation for win

* fix backend to place device id bug

* fix ifdef error

* Op2functor (#7)

* add kernel args maker class

* make args maker non-const

* remove debug log

* modify codes by review options

* split constructPrKernelContext function

* fix output name bug

* fix test_mean_op test_sign_op failed

* fill_any_like kernel refactor (#10)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* skip dtype for fill_any_like

* add attrs for kernel key construct

* add use_pt_kernel Flags to control whether to use pt kernel (#13)

* add use_pt_kernel Flags to control whether to use pt kernel

* change the default value to true for checking pt kernels

* fix mutable_data cuda place error

* move high level apis into hapi

* remove selectedrows adapting temporarily

* Support Scalar in Tensor Compute Library (#14)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* remove mkldnn tensor & polish details

* use flat_hash_map and small_vector in kernel factory

* Refactor flatten kernel (#12)

* refactor flatten kernel

* update infershape function

* fix compile bugs

* fix bugs when merge

* fix compiler bugs

* fix bugs when run test_flatten_api

* fix bugs when run test

* Revert "use flat_hash_map and small_vector in kernel factory"

This reverts commit 23091495cfdd3df8cc1be592d30f09ea66a7c72b.

* Move cpu, cuda and other device code into kernels (#15)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* start refactor matmul

* move cpu, cuda and other device modules into kernels

* merge code

* polish code in operator.cc

* Perfect unittests (#16)

* perfect unittest

* update license

* replace with flat_hash_map, small_vector (#19)

* fix small_vector build error on windows platform

* replace with flat_hash_map, small_vector

* remove todo

* Perfect unittests (#20)

* perfect unittest

* update license

* fix bug when run tcmpt_utils_test

* refactor execution adapting impl

* fix insert conflict

* Fix CI bug of test_yolov3 (#21)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* start refactor matmul

* move cpu, cuda and other device modules into kernels

* merge code

* polish code in operator.cc

* Fix CI bug of test_yolov3

* add the tensor base class, test=develop (#17)

* update the tensor base class, test=develop

* remove two funcs, test=develop

* update the error msg, test=develop

Co-authored-by: Chen Weihang <chenweihang@baidu.com>

* [no-verify] commit backend and tensor signature changes

* Rename tcmpt to pten (#23)

* rename tcmpt to pten

* update omitted files for rename to pten

* update omitted file for rename to pten

* remove k of all enum var

* remove kernel_instantiate (#26)

* remove symbols and spatial_tensor

* change common to functions

* readd share tensor impl methods

* add a candidate dense tensor class, test=develop (#28)

* change all Pt to Pten

* resolve conflict with xiaowei

* Op2functor opt1 (#27)

* replace to small vector and change to const &

* add std::move

Co-authored-by: Chen Weihang <chenweihang@baidu.com>

* polish kernel factory and kernel registry

* fix operator test error msg mismatch

* remove tensor signature and backend set member

* move scalar and polish enforce

* revert dtype layout change to fix error

* fix enum operator override error

* Add Intermediate API layer

* add several base unittests

* add pten utils tests

* polish some details

* Dev/op2func refactor 3 (#30)

* add a candidate dense tensor class, test=develop

* remove TensorBase::backend(), test=develop

* remove some ops, test=develop

* cherry-pick the pr of tensor meta, test=develop

* moves the dense tensor and some ops, test=develop

* update the linalg operator, test=develop

* update other operators, test=develop

* fix errors, test=develop

* fix bugs, test=develop

* try to resolve the problem of windows ci, test=develop

* updates codes, test=develop

* fix the tensor_utils.cc, test=develop

* modify the dense tensor, test=develop

* fix the data type, test=develop

Co-authored-by: shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>

* intermediate api adapt to new dense tensor

* add some TODO and delete include header

Co-authored-by: Chen Weihang <chenweihang@baidu.com>
Co-authored-by: chentianyu03 <ctychentianyu@gmail.com>
Co-authored-by: zyfncg <1370305206@qq.com>
Co-authored-by: 石晓伟 <39303645+Shixiaowei02@users.noreply.github.com>
---
 paddle/fluid/framework/operator.cc           |   1 -
 paddle/fluid/imperative/prepared_operator.cc |   1 -
 paddle/pten/api/include/creation.h           |  21 ++++
 paddle/pten/api/include/linalg.h             |  19 +++
 paddle/pten/api/include/manipulation.h       |  20 ++++
 paddle/pten/api/include/math.h               |  57 +++++++++
 paddle/pten/hapi/lib/utils/tensor_utils.cc   |   1 +
 paddle/pten/kernels/cpu/manipulation.cc      |   5 +-
 paddle/pten/kernels/cuda/manipulation.cu     |   5 +-
 paddle/pten/tests/CMakeLists.txt             |   1 +
 paddle/pten/tests/test_dot_api.cc            |  54 +++++++++
 paddle/pten/tests/test_fill_api.cc           |  37 ++++++
 paddle/pten/tests/test_flatten_api.cc        |  46 ++++++++
 paddle/pten/tests/test_mean_api.cc           |  35 ++++++
 paddle/pten/tests/test_scale_api.cc          | 118 +++++++++++++++++++
 15 files changed, 413 insertions(+), 8 deletions(-)
 create mode 100644 paddle/pten/tests/test_scale_api.cc

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 16e63e433e640..d317aac8594b4 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/op_call_stack.h"
-#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index db26c66958140..b2d55babc7e1c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -16,7 +16,6 @@
 
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #include "paddle/pten/common/scalar.h"
 #include "paddle/utils/small_vector.h"
diff --git a/paddle/pten/api/include/creation.h b/paddle/pten/api/include/creation.h
index d7311e6cd283b..9795d88f81880 100644
--- a/paddle/pten/api/include/creation.h
+++ b/paddle/pten/api/include/creation.h
@@ -14,5 +14,26 @@
 
 #pragma once
 
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/kernels/cpu/creation.h"
 #include "paddle/pten/kernels/cuda/creation.h"
+
+namespace pten {
+
+// TODO(YuanRisheng) This function name should match the User API name.
+// TODO(zyfncg) Automatic code generation
+template <typename T, typename ContextT>
+DenseTensor FillAnyLike(const ContextT& dev_ctx,
+                        const DenseTensor& x,
+                        const Scalar& val) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  FillAnyLike<T>(dev_ctx, x, val, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/api/include/linalg.h b/paddle/pten/api/include/linalg.h
index d9798c3a2e0a8..0d4c7a60fbc14 100644
--- a/paddle/pten/api/include/linalg.h
+++ b/paddle/pten/api/include/linalg.h
@@ -15,5 +15,24 @@
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/kernels/cpu/linalg.h"
 #include "paddle/pten/kernels/cuda/linalg.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+DenseTensor Dot(const ContextT& dev_ctx,
+                const DenseTensor& x,
+                const DenseTensor& y) {
+  auto out_meta = DotInferShape(x.meta(), y.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Dot<T>(dev_ctx, x, y, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/api/include/manipulation.h b/paddle/pten/api/include/manipulation.h
index f2acad9649969..1f867686a6eb7 100644
--- a/paddle/pten/api/include/manipulation.h
+++ b/paddle/pten/api/include/manipulation.h
@@ -15,5 +15,25 @@
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/kernels/cpu/manipulation.h"
 #include "paddle/pten/kernels/cuda/manipulation.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+DenseTensor Flatten(const ContextT& dev_ctx,
+                    const DenseTensor& x,
+                    int start_axis,
+                    int stop_axis) {
+  auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, &dense_out);
+  return dense_out;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h
index 5145c823a5c6e..fa512e8d6db0d 100644
--- a/paddle/pten/api/include/math.h
+++ b/paddle/pten/api/include/math.h
@@ -15,5 +15,62 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/kernels/cpu/math.h"
 #include "paddle/pten/kernels/cuda/math.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+DenseTensor Sign(const ContextT& dev_ctx, const DenseTensor& x) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Sign<T>(dev_ctx, x, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Mean(const ContextT& dev_ctx, const DenseTensor& x) {
+  auto out_meta = ReductionInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Mean<T>(dev_ctx, x, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Scale(const ContextT& dev_ctx,
+                  const DenseTensor& x,
+                  float scale,
+                  float bias,
+                  bool bias_after_scale) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  Scale<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
+  return dense_out;
+}
+
+template <typename T, typename ContextT>
+DenseTensor Scale(const ContextT& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& scale,
+                  float bias,
+                  bool bias_after_scale) {
+  auto out_meta = UnchangedInferShape(x.meta());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  ScaleHost<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
+  return dense_out;
+}
+}  // namespace pten
diff --git a/paddle/pten/hapi/lib/utils/tensor_utils.cc b/paddle/pten/hapi/lib/utils/tensor_utils.cc
index a55c50db761a6..f7641f424f491 100644
--- a/paddle/pten/hapi/lib/utils/tensor_utils.cc
+++ b/paddle/pten/hapi/lib/utils/tensor_utils.cc
@@ -45,6 +45,7 @@ std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
   SetLoD(&meta.lod, src.lod());
   auto shared_storage =
       pten::make_intrusive<SharedStorage>(src.Holder(), src.offset());
+
   return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
                                              std::move(meta));
 }
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index c436e14e0caab..87c76149f127f 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -24,10 +24,9 @@ void Flatten(const CPUContext& dev_ctx,
              int start_axis,
              int stop_axis,
              DenseTensor* out) {
-  auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
+  auto out_dims = out->dims();
   pten::Copy(dev_ctx, x, out);
-  out->set_lod(out_meta.lod);
-  out->Resize(out_meta.dims);
+  out->Resize(out_dims);
 }
 
 // TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu
index 43614f859c58b..38111f2b8c02f 100644
--- a/paddle/pten/kernels/cuda/manipulation.cu
+++ b/paddle/pten/kernels/cuda/manipulation.cu
@@ -24,10 +24,9 @@ void Flatten(const CUDAContext& dev_ctx,
              int start_axis,
              int stop_axis,
              DenseTensor* out) {
-  auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
+  auto out_dims = out->dims();
   pten::Copy(dev_ctx, x, out);
-  out->set_lod(out_meta.lod);
-  out->Resize(out_meta.dims);
+  out->Resize(out_dims);
 }
 
 // TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
index 3dc779380527f..3d2da6a5afdd1 100644
--- a/paddle/pten/tests/CMakeLists.txt
+++ b/paddle/pten/tests/CMakeLists.txt
@@ -12,3 +12,4 @@ cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS linalg_api pten_hapi_utils)
 cc_test(test_fill_api SRCS test_fill_api.cc DEPS creation_api pten_hapi_utils)
 cc_test(test_copy_api SRCS test_copy_api.cc DEPS utils_cpu pten_hapi_utils)
 cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS utils_cpu manipulation_api pten_hapi_utils)
+cc_test(test_scale_api SRCS test_scale_api.cc DEPS math_api pten_hapi_utils)
diff --git a/paddle/pten/tests/test_dot_api.cc b/paddle/pten/tests/test_dot_api.cc
index 69e785904fe3c..5401b66544473 100644
--- a/paddle/pten/tests/test_dot_api.cc
+++ b/paddle/pten/tests/test_dot_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/hapi/lib/utils/allocator.h"
 
+#include "paddle/pten/api/include/linalg.h"
+
 PT_DECLARE_MODULE(LinalgCPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -82,3 +84,55 @@ TEST(API, dot) {
   ASSERT_NEAR(expect_result[1], actual_result1, 1e-6f);
   ASSERT_NEAR(expect_result[2], actual_result2, 1e-6f);
 }
+
+// TODO(YuanRisheng) This unit test should be moved to a separate file,
+//                   which would help decouple compilation.
+TEST(DEV_API, dot) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 10}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+
+  pten::DenseTensor dense_y(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 10}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_y_data = dense_y.mutable_data<float>();
+
+  float sum[3] = {0.0, 0.0, 0.0};
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 10; ++j) {
+      dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      dense_y_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      sum[i] += (i * 10 + j) * (i * 10 + j) * 1.0;
+    }
+  }
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Dot<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      dense_y);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = sum;
+  auto actual_result0 = out.data<float>()[0];
+  auto actual_result1 = out.data<float>()[1];
+  auto actual_result2 = out.data<float>()[2];
+  ASSERT_NEAR(expect_result[0], actual_result0, 1e-6f);
+  ASSERT_NEAR(expect_result[1], actual_result1, 1e-6f);
+  ASSERT_NEAR(expect_result[2], actual_result2, 1e-6f);
+}
diff --git a/paddle/pten/tests/test_fill_api.cc b/paddle/pten/tests/test_fill_api.cc
index 4f93e03aca2f3..5a788226086dc 100644
--- a/paddle/pten/tests/test_fill_api.cc
+++ b/paddle/pten/tests/test_fill_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/hapi/lib/utils/allocator.h"
 
+#include "paddle/pten/api/include/creation.h"
+
 PT_DECLARE_MODULE(CreationCPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -131,3 +133,38 @@ TEST(API, ones_like) {
     ASSERT_EQ(actual_result[i], 1);
   }
 }
+
+TEST(DEV_API, fill_any_like) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 2}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  dense_x_data[0] = 0;
+  float val = 1.0;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::FillAnyLike<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      val);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto* actual_result = out.data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_NEAR(actual_result[i], val, 1e-6f);
+  }
+}
diff --git a/paddle/pten/tests/test_flatten_api.cc b/paddle/pten/tests/test_flatten_api.cc
index 48d2205c2ff48..dfb777678a94d 100644
--- a/paddle/pten/tests/test_flatten_api.cc
+++ b/paddle/pten/tests/test_flatten_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/hapi/lib/utils/allocator.h"
 
+#include "paddle/pten/api/include/manipulation.h"
+
 PT_DECLARE_MODULE(ManipulationCPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -70,3 +72,47 @@ TEST(API, flatten) {
   }
   ASSERT_EQ(value_equal, true);
 }
+
+TEST(DEV_API, flatten) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2, 2, 3}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+
+  for (int i = 0; i < dense_x.numel(); i++) {
+    dense_x_data[i] = i;
+  }
+  int start_axis = 1, stop_axis = 2;
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Flatten<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      start_axis,
+      stop_axis);
+
+  // 3. check result
+  std::vector<int> expect_shape = {3, 4, 3};
+  ASSERT_EQ(out.dims()[0], expect_shape[0]);
+  ASSERT_EQ(out.dims()[1], expect_shape[1]);
+  ASSERT_EQ(out.dims()[2], expect_shape[2]);
+  ASSERT_EQ(out.numel(), 36);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  bool value_equal = true;
+  auto* dense_out_data = out.data<float>();
+  for (int i = 0; i < dense_x.numel(); i++) {
+    if (std::abs(dense_x_data[i] - dense_out_data[i]) > 1e-6f)
+      value_equal = false;
+  }
+  ASSERT_EQ(value_equal, true);
+}
diff --git a/paddle/pten/tests/test_mean_api.cc b/paddle/pten/tests/test_mean_api.cc
index ee8388671b7eb..b3da90659d005 100644
--- a/paddle/pten/tests/test_mean_api.cc
+++ b/paddle/pten/tests/test_mean_api.cc
@@ -21,6 +21,8 @@ limitations under the License. */
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/hapi/lib/utils/allocator.h"
 
+#include "paddle/pten/api/include/math.h"
+
 PT_DECLARE_MODULE(MathCPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -67,3 +69,36 @@ TEST(API, mean) {
   auto actual_result = dense_out->data<float>()[0];
   ASSERT_NEAR(expect_result, actual_result, 1e-6f);
 }
+
+TEST(DEV_API, mean) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+
+  float sum = 0.0;
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+    sum += i * 1.0;
+  }
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+  // 2. test API
+  auto out = pten::Mean<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)), dense_x);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 1);
+  ASSERT_EQ(out.numel(), 1);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = sum / 12;
+  auto actual_result = out.data<float>()[0];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
diff --git a/paddle/pten/tests/test_scale_api.cc b/paddle/pten/tests/test_scale_api.cc
new file mode 100644
index 0000000000000..9f80d6d2cc126
--- /dev/null
+++ b/paddle/pten/tests/test_scale_api.cc
@@ -0,0 +1,118 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/hapi/include/math.h"
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
+
+#include "paddle/pten/api/include/math.h"
+
+PT_DECLARE_MODULE(MathCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(MathCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, scale) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+  }
+  float scale = 2;
+  float bias = 1;
+  bool bias_after_scale = true;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Scale<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      scale,
+      bias,
+      bias_after_scale);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.numel(), 12);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = 23;
+  auto actual_result = out.data<float>()[11];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}
+
+TEST(DEV_API, scale_host) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 4}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+  }
+  const auto alloc2 = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor scale(alloc2,
+                          pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                framework::make_ddim({1}),
+                                                pten::DataLayout::NCHW));
+  scale.mutable_data<float>()[0] = 2;
+  float bias = 1;
+  bool bias_after_scale = true;
+
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto out = pten::Scale<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      scale,
+      bias,
+      bias_after_scale);
+
+  // 3. check result
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.numel(), 12);
+  ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = 23;
+  auto actual_result = out.data<float>()[11];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}