From 3f5f789ed8e2f64c83c672f5ec842332879f1c04 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 14 Oct 2021 12:32:29 +0000 Subject: [PATCH] remove mkldnn tensor & polish details --- cmake/generic.cmake | 2 +- cmake/tcmpt.cmake | 9 +- paddle/fluid/framework/eigen.h | 44 ----- ...est_reference_count_pass_last_lived_ops.cc | 2 +- paddle/fluid/framework/operator.cc | 15 -- paddle/fluid/framework/tcmpt_utils.cc | 38 +--- paddle/fluid/framework/type_defs.h | 2 - paddle/fluid/imperative/prepared_operator.cc | 15 -- .../pscore/heter_listen_and_server_test.cc | 2 +- .../operators/pscore/heter_server_test.cc | 2 +- paddle/fluid/operators/scale_op_xpu.cc | 1 - paddle/fluid/operators/sign_op.cc | 3 +- paddle/tcmpt/api/include/core.h | 1 - paddle/tcmpt/core/mkldnn_dense_tensor.h | 56 ------ paddle/tcmpt/cpu/CMakeLists.txt | 1 + paddle/tcmpt/cuda/CMakeLists.txt | 1 + paddle/tcmpt/cuda/linalg.cu | 20 +-- paddle/tcmpt/eigen/common.h | 170 ++++++++++++++++++ paddle/tcmpt/eigen/dot.h | 50 ++++++ paddle/tcmpt/eigen/fill.h | 5 +- paddle/tcmpt/eigen/mean.h | 6 +- paddle/tcmpt/eigen/scale.h | 6 +- paddle/tcmpt/eigen/sign.h | 6 +- 23 files changed, 249 insertions(+), 208 deletions(-) delete mode 100644 paddle/tcmpt/core/mkldnn_dense_tensor.h create mode 100644 paddle/tcmpt/eigen/common.h create mode 100644 paddle/tcmpt/eigen/dot.h diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 7390bd17e386e..12b4530a77a4c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -117,7 +117,7 @@ function(find_fluid_modules TARGET_NAME) endfunction(find_fluid_modules) set_property(GLOBAL PROPERTY TCMPT_MODULES "") -# find all top modules is used for paddle static library +# find all tcmpt modules is used for paddle static library # for building inference libs function(find_tcmpt_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) diff --git a/cmake/tcmpt.cmake b/cmake/tcmpt.cmake index 3ffc168c6bed0..819cd42287974 100644 --- a/cmake/tcmpt.cmake +++ 
b/cmake/tcmpt.cmake @@ -1,4 +1,10 @@ -# TODO(chenweihang): keep message comment for debuging, remove it if needless +# `kernel_instantiate` function is used to declare the template instantiation of +# the Kernel function generated through code analysis, only for windows +# (because the windows platform msvc compiler cannot automatically instantiate +# the template function through decltype) +# TODO(chenweihang): keep message comment for debugging, it is still useful, +# I will remove it if needless later + function(kernel_instantiate TARGET) set(target_file ${CURRENT_BINARY_DIR}/${TARGET}.tmp CACHE INTERNAL "${CURRENT_BINARY_DIR}/${TARGET} file") set(target_file_final ${CURRENT_BINARY_DIR}/${TARGET}) @@ -36,7 +42,6 @@ function(kernel_instantiate TARGET) endforeach() # message(STATUS "INST CONTENT: ${instantiate_context}") file(APPEND ${target_file} "${instantiate_context}\n") - # copy_if_different(${target_file} ${target_file_final}) string(REPLACE "." "_" cmd_name ${TARGET}) # this is a dummy target for custom command, should always be run firstly to update ${target_file_final} # TODO(chenweihang): nameing rule need to enchance diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 56843b9aa6853..a6abda8a83bc8 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -19,8 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "unsupported/Eigen/CXX11/Tensor" -#include "paddle/tcmpt/core/dense_tensor.h" - namespace paddle { namespace framework { @@ -69,28 +67,6 @@ struct EigenTensor { static ConstType From(const Tensor& tensor) { return From(tensor, tensor.dims_); } - - // for pt::DenseTensor - static Type From(pt::DenseTensor& tensor, DDim dims) { // NOLINT - // why tensor.data() not work?
- // return Type(const_cast(reinterpret_cast(tensor.data())), - // EigenDim::From(dims)); - return Type(const_cast(tensor.data()), EigenDim::From(dims)); - } - - static Type From(pt::DenseTensor& tensor) { // NOLINT - return From(tensor, tensor.dims()); - } // NOLINT - - static ConstType From(const pt::DenseTensor& tensor, DDim dims) { - // return ConstType(reinterpret_cast(tensor.data()), - // EigenDim::From(dims)); - return ConstType(tensor.data(), EigenDim::From(dims)); - } - - static ConstType From(const pt::DenseTensor& tensor) { - return From(tensor, tensor.dims()); - } }; template { const Tensor& tensor) { // NOLINT return EigenVector::From(tensor, {product(tensor.dims_)}); } - - // for pt::DenseTensor - static typename EigenVector::Type Flatten( - pt::DenseTensor& tensor) { // NOLINT - return EigenVector::From(tensor, {product(tensor.dims())}); - } - - static typename EigenVector::ConstType Flatten( - const pt::DenseTensor& tensor) { // NOLINT - return EigenVector::From(tensor, {product(tensor.dims())}); - } }; template ()); } - - // for pt::DenseTensor - static Type From(pt::DenseTensor& tensor) { // NOLINT - return Type(const_cast(tensor.data())); - } - - static ConstType From(const pt::DenseTensor& tensor) { - return ConstType(tensor.data()); - } }; // Define Tensor with 32-bit index. 
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index 8cf541637557b..f410171f99896 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" -USE_NO_KERNEL_OP(scale); +USE_OP(scale); USE_OP(elementwise_mul); USE_OP(elementwise_add); USE_OP(elementwise_add_grad); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 1b0cf462479d2..a47089ecba5cd 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1155,7 +1155,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second // phase - // VLOG(1) << "Pt KernelFactory: " << pt::KernelFactory::Instance(); if (FLAGS_use_pt_kernel && pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) { if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) { @@ -1263,17 +1262,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } -static bool ContainSelectedRows(const VariableValueMap& inputs) { - for (auto& var_pair : inputs) { - for (auto* var : var_pair.second) { - if (var->IsType()) { - return true; - } - } - } - return false; -} - // TODO(chenweihang): now only check single var input static bool IsValidVar(const std::string& name, const VariableValueMap& inputs) { @@ -1303,9 +1291,6 @@ static pt::KernelName ConstructPtKernelName(const std::string& op_type, const VariableValueMap& inputs) { std::string overload_name; // TODO(chenweihang): adapt SelectedRows by xiaowei's design - // if (ContainSelectedRows(inputs)) { - // overload_name = 
pt::kContainSelectedRowsSuffix; - // } if (ContainHostTensor(op_proto, inputs)) { if (overload_name != "") { overload_name += "."; diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc index f83f6b593a60d..71ef2d3450ae9 100644 --- a/paddle/fluid/framework/tcmpt_utils.cc +++ b/paddle/fluid/framework/tcmpt_utils.cc @@ -13,18 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/tcmpt_utils.h" + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" - #include "paddle/fluid/framework/variable.h" -#include "paddle/tcmpt/api/include/core.h" -#include "paddle/tcmpt/api/include/symbols.h" namespace paddle { namespace framework { // TODO(chenweihang, shixiaowei): adapt SelectedRows - template <> std::shared_ptr MakeTensorImpl( const LoDTensor& tensor, pt::Backend backend, pt::DataType dtype, @@ -167,38 +164,5 @@ std::shared_ptr OutputVariableToPtTensor( return nullptr; } -/* For MKLDNNDenseTensor (move this part into a single file later) */ -#ifdef PADDLE_WITH_MKLDNN - -template <> -std::shared_ptr MakeTensorImpl( - const Tensor& tensor, const platform::Place& place, - proto::VarType::Type type) { - auto holder = tensor.Holder(); - auto tensor_impl = std::make_shared( - pt::TensorMeta(tensor.dims(), pt::TransToPtBackend(place), - pt::TransToPtDataType(type), - pt::TransToPtLayout(tensor.layout()), tensor.offset()), - pt::TensorStatus()); - - if (holder != nullptr) { - tensor_impl->ShareAllocation(tensor.Holder()); - } else { - VLOG(1) << "Old MKLDNN Tensor holder is nullptr."; - } - - tensor_impl->set_format(tensor.format()); - return tensor_impl; -} - -template <> -void ShareTensorImpl(pt::MKLDNNDenseTensor* tensor_impl, Tensor* out) { - out->ResetHolderWithType(tensor_impl->allocation(), - pt::TransToProtoVarType(tensor_impl->type())); - out->set_format(tensor_impl->format()); -} - -#endif - } // 
namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 9d19d0bce6071..1c5469d02c3ef 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -33,7 +33,6 @@ class BlockDesc; class Variable; class InferNoNeedBufferVarsFN; -// TODO(chenweihang): AttirbuteMap also need to be ordered // TODO(panyx0718): Replace vector with something like gtl::Vector. using VariableNameMap = std::map>; using VariableValueMap = std::map>; @@ -44,7 +43,6 @@ using Attribute = boost::variant< std::vector, bool, std::vector, BlockDesc*, int64_t, std::vector, std::vector, std::vector>; -// TODO(chenweihang): AttirbuteMap also need to be ordered using AttributeMap = std::unordered_map; #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index c3cda9e8e992c..f7e57bec1da9e 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -137,18 +137,6 @@ static framework::VariableValueMap BuildInputMap( return inputs; } -template -static bool ContainSelectedRows(const NameVarMap& inputs) { - for (auto& var_pair : inputs) { - for (auto& var : var_pair.second) { - if (var->Var().template IsType()) { - return true; - } - } - } - return false; -} - // TODO(chenweihang): enhance rules, not all dispensable inputs // are host tensor, now only for scale kernel verify template @@ -169,9 +157,6 @@ static pt::KernelName ConstructPtKernelName( const NameVarMap& inputs) { std::string overload_name; // TODO(chenweihang): adapt SelectedRows by xiaowei's design - // if (ContainSelectedRows(inputs)) { - // overload_name = pt::kContainSelectedRowsSuffix; - // } if (ContainHostTensor(op_proto, inputs)) { if (overload_name != "") { overload_name += "."; diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc 
b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index bbc7f01597900..3b005e10d9b98 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -32,7 +32,7 @@ using MultiVarMsg = ::paddle::distributed::MultiVariableMessage; using VarMsg = ::paddle::distributed::VariableMessage; DECLARE_double(eager_delete_tensor_gb); -USE_NO_KERNEL_OP(scale); +USE_OP(scale); USE_NO_KERNEL_OP(heter_listen_and_serv); framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 3e6897073e129..df2eb70b144e4 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -29,7 +29,7 @@ namespace distributed = paddle::distributed; using MultiVarMsg = ::paddle::distributed::MultiVariableMessage; using VarMsg = ::paddle::distributed::VariableMessage; -USE_NO_KERNEL_OP(scale); +USE_OP(scale); std::shared_ptr b_rpc_service; diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index c467f3f89d064..e0dfad91570ad 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { - template class ScaleXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index a491da3931964..6207c33f9d629 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - #include "paddle/fluid/operators/sign_op.h" +#include #include "paddle/fluid/platform/float16.h" namespace paddle { diff --git a/paddle/tcmpt/api/include/core.h b/paddle/tcmpt/api/include/core.h index d6b73dcbee66e..fd863186abb30 100644 --- a/paddle/tcmpt/api/include/core.h +++ b/paddle/tcmpt/api/include/core.h @@ -19,5 +19,4 @@ limitations under the License. */ #include "paddle/tcmpt/core/dense_tensor.h" #include "paddle/tcmpt/core/kernel_context.h" #include "paddle/tcmpt/core/kernel_factory.h" -#include "paddle/tcmpt/core/mkldnn_dense_tensor.h" #include "paddle/tcmpt/core/scalar.h" diff --git a/paddle/tcmpt/core/mkldnn_dense_tensor.h b/paddle/tcmpt/core/mkldnn_dense_tensor.h deleted file mode 100644 index 0aea392fce93d..0000000000000 --- a/paddle/tcmpt/core/mkldnn_dense_tensor.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef PADDLE_WITH_MKLDNN - -#include "mkldnn.hpp" - -#include "paddle/tcmpt/core/dense_tensor.h" - -namespace pt { - -class MKLDNNDenseTensor : public DenseTensor { - public: - // Not allowed to initialize a tensor without descriptive metadata - MKLDNNDenseTensor() = delete; - - MKLDNNDenseTensor(const MKLDNNDenseTensor&) = delete; - MKLDNNDenseTensor& operator=(const MKLDNNDenseTensor&) = delete; - MKLDNNDenseTensor(MKLDNNDenseTensor&&) = delete; - MKLDNNDenseTensor& operator=(MKLDNNDenseTensor&&) = delete; - - MKLDNNDenseTensor(const TensorMeta& meta, const TensorStatus& status) - : DenseTensor(meta, status) {} - - mkldnn::memory::format_tag format() const { return format_; } - - void set_format(const mkldnn::memory::format_tag format) { format_ = format; } - - private: - /** - * @brief the detail format of memory block which have layout as kMKLDNN - * - * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, layout will be set as - * DataLayout::kMKLDNN meanwhile detail memory format will be kept in - * this field. 
- */ - mkldnn::memory::format_tag format_ = mkldnn::memory::format_tag::undef; -}; - -} // namespace pt - -#endif diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt index fbb0a45266003..3480ebba53155 100644 --- a/paddle/tcmpt/cpu/CMakeLists.txt +++ b/paddle/tcmpt/cpu/CMakeLists.txt @@ -1,5 +1,6 @@ if(WIN32) set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cpu) + kernel_instantiate(creation.cc) kernel_instantiate(math.cc) kernel_instantiate(linalg.cc) endif() diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt index 94de051e2e3a4..458d93529f435 100644 --- a/paddle/tcmpt/cuda/CMakeLists.txt +++ b/paddle/tcmpt/cuda/CMakeLists.txt @@ -1,5 +1,6 @@ if(WIN32) set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cuda) + kernel_instantiate(creation.cu) kernel_instantiate(math.cu) kernel_instantiate(linalg.cu) endif() diff --git a/paddle/tcmpt/cuda/linalg.cu b/paddle/tcmpt/cuda/linalg.cu index acfdf59b27441..118d3326e5fb5 100644 --- a/paddle/tcmpt/cuda/linalg.cu +++ b/paddle/tcmpt/cuda/linalg.cu @@ -15,10 +15,9 @@ #include "paddle/tcmpt/cuda/linalg.h" #include "paddle/tcmpt/core/kernel_registry.h" +#include "paddle/tcmpt/eigen/dot.h" // See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/complex.h" namespace pt { @@ -28,22 +27,7 @@ void Dot(const CUDAContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out) { - out->mutable_data(); - if (1 == out->dims().size()) { - auto eigen_out = paddle::framework::EigenScalar::From(*out); - auto eigen_x = paddle::framework::EigenVector::Flatten(x); - auto eigen_y = paddle::framework::EigenVector::Flatten(y); - - auto& dev = *dev_ctx.eigen_device(); - eigen_out.device(dev) = (eigen_x * eigen_y).sum(); - } else { - auto eigen_out = paddle::framework::EigenMatrix::From(*out); - auto eigen_x = paddle::framework::EigenMatrix::From(x); - auto eigen_y = paddle::framework::EigenMatrix::From(y); - - auto& dev = *dev_ctx.eigen_device(); - eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes(1)); - } + eigen::Dot(dev_ctx, x, y, out); } } // namespace pt diff --git a/paddle/tcmpt/eigen/common.h b/paddle/tcmpt/eigen/common.h new file mode 100644 index 0000000000000..37bed55a7d97a --- /dev/null +++ b/paddle/tcmpt/eigen/common.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/tcmpt/core/dense_tensor.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace pt { + +// EigenDim converts paddle::platform::DDim into Eigen::DSizes. 
+template +struct EigenDim { + using Type = Eigen::DSizes; + + static Type From(const DDim& dims) { + PADDLE_ENFORCE_EQ(arity(dims), + D, + paddle::platform::errors::InvalidArgument( + "Input dimension size should be equal to %d, but " + "received dimension size is %d.", + arity(dims), + D)); + Type ret; + for (int64_t d = 0; d < arity(dims); d++) { + ret[d] = dims[d]; + } + return ret; + } +}; + +// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +template +struct EigenTensor { + // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on + // the speed of aligned and unaligned version in future. + using Type = Eigen::TensorMap>; + + using ConstType = + Eigen::TensorMap>; + + static Type From(pt::DenseTensor& tensor, DDim dims) { // NOLINT + // why tensor.data() not work? + // return Type(const_cast(reinterpret_cast(tensor.data())), + // EigenDim::From(dims)); + return Type(const_cast(tensor.data()), EigenDim::From(dims)); + } + + static Type From(pt::DenseTensor& tensor) { // NOLINT + return From(tensor, tensor.dims()); + } // NOLINT + + static ConstType From(const pt::DenseTensor& tensor, DDim dims) { + // return ConstType(reinterpret_cast(tensor.data()), + // EigenDim::From(dims)); + return ConstType(tensor.data(), EigenDim::From(dims)); + } + + static ConstType From(const pt::DenseTensor& tensor) { + return From(tensor, tensor.dims()); + } +}; + +template +struct EigenMatrix : public EigenTensor { + static typename EigenMatrix::Type Reshape(pt::DenseTensor& tensor, // NOLINT + int num_col_dims) { + int rank = tensor.dims().size(); + PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), + true, + paddle::platform::errors::InvalidArgument( + "Input dimension number(num_col_dims) must be " + "between 0 and %d, but received number is %d.", + rank, + num_col_dims)); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } + + static typename EigenMatrix::ConstType Reshape(const 
pt::DenseTensor& tensor, + int num_col_dims) { + int rank = tensor.dims().size(); + PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), + true, + paddle::platform::errors::InvalidArgument( + "Input dimension number(num_col_dims) must be " + "between 0 and %d, but received number is %d.", + rank, + num_col_dims)); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } +}; + +template +struct EigenVector : public EigenTensor { + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten( + pt::DenseTensor& tensor) { // NOLINT + return EigenVector::From(tensor, {product(tensor.dims())}); + } + + static typename EigenVector::ConstType Flatten( + const pt::DenseTensor& tensor) { // NOLINT + return EigenVector::From(tensor, {product(tensor.dims())}); + } +}; + +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(pt::DenseTensor& tensor) { // NOLINT + return Type(const_cast(tensor.data())); + } + + static ConstType From(const pt::DenseTensor& tensor) { + return ConstType(tensor.data()); + } +}; + +// Define Tensor with 32-bit index. 
+template +using Tensor32BitIndex = + Eigen::TensorMap, Eigen::Aligned>; + +template +Eigen::DSizes To32BitDims(const DSizes& in) { + Eigen::DSizes out; + for (int i = 0; i < DSizes::count; ++i) { + out[i] = in[i]; + } + return out; +} + +template +Tensor32BitIndex +To32BitIndex(EigenTensor in) { + using RetType = + Tensor32BitIndex; + return RetType(in.data(), To32BitDims(in.dimensions())); +} + +} // namespace pt diff --git a/paddle/tcmpt/eigen/dot.h b/paddle/tcmpt/eigen/dot.h new file mode 100644 index 0000000000000..5e323e4448409 --- /dev/null +++ b/paddle/tcmpt/eigen/dot.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/tcmpt/core/dense_tensor.h" +#include "paddle/tcmpt/eigen/common.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace pt { +namespace eigen { + +template +void Dot(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + out->mutable_data(); + if (1 == out->dims().size()) { + auto eigen_out = pt::EigenScalar::From(*out); + auto eigen_x = pt::EigenVector::Flatten(x); + auto eigen_y = pt::EigenVector::Flatten(y); + + auto& dev = *dev_ctx.eigen_device(); + eigen_out.device(dev) = (eigen_x * eigen_y).sum(); + } else { + auto eigen_out = pt::EigenMatrix::From(*out); + auto eigen_x = pt::EigenMatrix::From(x); + auto eigen_y = pt::EigenMatrix::From(y); + + auto& dev = *dev_ctx.eigen_device(); + eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes(1)); + } +} + +} // namespace eigen +} // namespace pt diff --git a/paddle/tcmpt/eigen/fill.h b/paddle/tcmpt/eigen/fill.h index 6a21ca6932cd5..fb56ccdd8e125 100644 --- a/paddle/tcmpt/eigen/fill.h +++ b/paddle/tcmpt/eigen/fill.h @@ -15,8 +15,9 @@ limitations under the License. */ #pragma once #include "paddle/tcmpt/core/dense_tensor.h" +#include "paddle/tcmpt/eigen/common.h" -#include "paddle/fluid/framework/eigen.h" +// See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" namespace pt { @@ -50,7 +51,7 @@ void fill(const DeviceContext& context, DenseTensor* tensor, VType val) { static_cast(std::numeric_limits::max()), static_cast(val))); - auto t = paddle::framework::EigenVector::Flatten(*tensor); + auto t = pt::EigenVector::Flatten(*tensor); t.device(*context.eigen_device()) = t.constant(static_cast(val)); } diff --git a/paddle/tcmpt/eigen/mean.h b/paddle/tcmpt/eigen/mean.h index bd2c5ad2bf219..e70870e7954b7 100644 --- a/paddle/tcmpt/eigen/mean.h +++ b/paddle/tcmpt/eigen/mean.h @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include "paddle/tcmpt/core/dense_tensor.h" +#include "paddle/tcmpt/eigen/common.h" // See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/eigen/eigen_function.h" namespace pt { @@ -30,8 +30,8 @@ void Mean(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) { // TODO(chenweihang): if we design new tensor, we should support // the low-level calc functor use new tensor as input, // which may be a big project! - auto eigen_x = paddle::framework::EigenVector::Flatten(x); - auto eigen_out = paddle::framework::EigenScalar::From(*out); + auto eigen_x = pt::EigenVector::Flatten(x); + auto eigen_out = pt::EigenScalar::From(*out); auto& dev = *dev_ctx.eigen_device(); eigen_out.device(dev) = eigen_x.mean(); diff --git a/paddle/tcmpt/eigen/scale.h b/paddle/tcmpt/eigen/scale.h index 5bea4fb300af4..152cb61800c8b 100644 --- a/paddle/tcmpt/eigen/scale.h +++ b/paddle/tcmpt/eigen/scale.h @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include "paddle/tcmpt/core/dense_tensor.h" +#include "paddle/tcmpt/eigen/common.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/eigen/eigen_function.h" namespace pt { @@ -32,8 +32,8 @@ void Scale(const DevCtx& dev_ctx, DenseTensor* out) { // calc out->mutable_data(); - auto eigen_out = paddle::framework::EigenVector::Flatten(*out); - auto eigen_x = paddle::framework::EigenVector::Flatten(x); + auto eigen_out = pt::EigenVector::Flatten(*out); + auto eigen_x = pt::EigenVector::Flatten(x); auto& dev = *dev_ctx.eigen_device(); // TODO(chenweihang): now the eigen function here need the dtype of scale, // eigen_x, bias should be same, so here need cast for two scalar arg, diff --git a/paddle/tcmpt/eigen/sign.h b/paddle/tcmpt/eigen/sign.h index b138123e81ee0..d41702576b3a1 100644 --- a/paddle/tcmpt/eigen/sign.h +++ b/paddle/tcmpt/eigen/sign.h @@ -15,9 +15,9 @@ limitations under the License. 
*/ #pragma once #include "paddle/tcmpt/core/dense_tensor.h" +#include "paddle/tcmpt/eigen/common.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/eigen/eigen_function.h" namespace pt { @@ -33,8 +33,8 @@ void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) { // TODO(chenweihang): if we design new tensor, we should support // the low-level calc functor use new tensor as input, // which may be a big project! - auto eigen_out = paddle::framework::EigenVector::Flatten(*out); - auto eigen_x = paddle::framework::EigenVector::Flatten(x); + auto eigen_out = pt::EigenVector::Flatten(*out); + auto eigen_x = pt::EigenVector::Flatten(x); auto& dev = *dev_ctx.eigen_device(); paddle::operators::EigenSign, T>::Eval(