From e78f3d95762dd623513e89bd6c2f0f95286fd10d Mon Sep 17 00:00:00 2001
From: Jakub Piasecki <jakpia21@gmail.com>
Date: Tue, 20 Jul 2021 18:54:15 +0200
Subject: [PATCH 1/7] added expand_v2 bf16/fp32 kernel

---
 paddle/fluid/operators/expand_op.cc           |  14 +-
 paddle/fluid/operators/expand_v2_op.cc        |  36 +++-
 .../operators/mkldnn/expand_v2_mkldnn_op.cc   | 166 ++++++++++++++++++
 .../reduce_ops/mkldnn/reduce_mkldnn_op.h      |  12 +-
 paddle/fluid/platform/mkldnn_reuse.h          |  39 ++--
 5 files changed, 226 insertions(+), 41 deletions(-)
 create mode 100644 paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index e7da08ff27711..02389c9baa537 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -75,9 +75,17 @@ class ExpandOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-        ctx.device_context());
+    auto input_data_type =
+        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc
index 618c1560c5eac..e6613453c2f59 100644
--- a/paddle/fluid/operators/expand_v2_op.cc
+++ b/paddle/fluid/operators/expand_v2_op.cc
@@ -89,9 +89,17 @@ class ExpandV2Op : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-        ctx.device_context());
+    auto input_data_type =
+        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
@@ -130,6 +138,14 @@ class ExpandV2OpMaker : public framework::OpProtoAndCheckerMaker {
               "the corresponding value given by Attr(expand_times).");
     AddAttr<std::vector<int>>("shape", "The expanded shape for each dimension.")
         .SetDefault({});
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "bfloat16"});
     AddComment(R"DOC(
 Expand the input to the given shape. The rank of X
 should be in [1, 6] and size of 'shape' must be in [1, 6] also.
@@ -200,9 +216,17 @@ class ExpandV2GradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
-                                       ctx, framework::GradVarName("Out")),
-                                   ctx.device_context());
+    auto input_data_type =
+        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
new file mode 100644
index 0000000000000..6e3c46a0588e4
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+
+template <typename T>
+class ExpandMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    const auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    auto x_vec_dims = framework::vectorize(x->dims());
+    auto out_vec_dims = framework::vectorize(out->dims());
+
+    dnnl::memory::format_tag x_format_tag = x->format();
+    if (x_vec_dims.size() != out_vec_dims.size()) {
+      x_format_tag =
+          GetExtendedFormatTag(x_vec_dims, out_vec_dims.size(), x_format_tag);
+    }
+
+    out->set_format(x_format_tag);
+
+    platform::BroadcastDataMKLDNNHandler<T> handler(
+        dnnl::algorithm::binary_add, dev_ctx, onednn_engine, ctx.GetPlace(),
+        out, x, 0.0f, 1.0f, ctx.InputName("X"), x_vec_dims);
+
+    auto src_memory_p = handler.AcquireSrcMemory(x);
+    auto dst_memory_p = handler.AcquireDstMemory(out);
+    auto binary_p = handler.AcquireForwardPrimitive();
+
+    const std::unordered_map<int, dnnl::memory> args = {
+        {DNNL_ARG_SRC_0, *dst_memory_p},
+        {DNNL_ARG_SRC_1, *src_memory_p},
+        {DNNL_ARG_DST, *dst_memory_p}};
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    binary_p->execute(astream, args);
+    astream.wait();
+
+    out->set_layout(framework::DataLayout::kMKLDNN);
+    out->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
+  }
+
+ private:
+  dnnl::memory::format_tag GetExtendedFormatTag(
+      std::vector<int64_t>& dims, int new_size,
+      mkldnn::memory::format_tag format_tag) const {
+    mkldnn::memory::desc md(dims, platform::MKLDNNGetDataType<T>(), format_tag);
+    std::vector<int64_t> new_dims(new_size, 1);
+    std::copy(dims.begin(), dims.end(),
+              new_dims.begin() + new_size - dims.size());
+
+    dims = std::move(new_dims);
+    return platform::GetMKLDNNFormat(md.reshape(dims));
+  }
+};
+
+template <typename T>
+class ExpandGradMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto dx_vec_dims = framework::vectorize(dx->dims());
+    auto dout_vec_dims = framework::vectorize(dout->dims());
+
+    dnnl::memory::format_tag dx_format_tag = dout->format();
+    if (dx_vec_dims.size() != dout_vec_dims.size()) {
+      dx_vec_dims.insert(dx_vec_dims.begin(),
+                         dout_vec_dims.size() - dx_vec_dims.size(), 1);
+      // dx_format_tag = GetExtendedFormatTag(x_vec_dims, out_vec_dims.size(),
+      // dx_format_tag);
+    }
+
+    // out->set_format(x_format_tag);
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    if (dout_vec_dims == dx_vec_dims) {
+      mkldnn::memory::data_type dout_type =
+          framework::ToMKLDNNDataType(dout->type());
+      std::string key = platform::CreateKey(
+          dev_ctx, dout_vec_dims, dout->format(), dout->format(), dout_type);
+      platform::ReorderMKLDNNHandler reorder_handler(
+          dout_vec_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key);
+
+      auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+          dout->format(), platform::to_void_cast(dout->data<T>()));
+
+      auto reorder_dst_memory_p =
+          reorder_handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
+
+      auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p,
+                                                      reorder_dst_memory_p);
+
+      reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+      astream.wait();
+
+      dx->set_layout(framework::DataLayout::kMKLDNN);
+      dx->set_format(
+          platform::GetMKLDNNFormat(reorder_dst_memory_p->get_desc()));
+    } else {
+      platform::ReductionMKLDNNHandler<T> handler(
+          dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine,
+          ctx.GetPlace(), dout, dx, ctx.InputName("X"), dx_vec_dims);
+
+      auto src_memory_p = handler.AcquireSrcMemory(dout);
+      auto dst_memory_p = handler.AcquireDstMemory(dx);
+
+      std::unordered_map<int, dnnl::memory> reduction_args = {
+          {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}};
+
+      auto reduction_p = handler.AcquireForwardPrimitive();
+
+      reduction_p->execute(astream, reduction_args);
+      astream.wait();
+      dx->set_layout(framework::DataLayout::kMKLDNN);
+      dx->set_format(platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape(
+          paddle::framework::vectorize<int64_t>(dx->dims()))));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(expand_v2, MKLDNN, paddle::platform::CPUPlace,
+                   ops::ExpandMKLDNNKernel<float>,
+                   ops::ExpandMKLDNNKernel<paddle::platform::bfloat16>);
+
+REGISTER_OP_KERNEL(expand_v2_grad, MKLDNN, paddle::platform::CPUPlace,
+                   ops::ExpandGradMKLDNNKernel<float>,
+                   ops::ExpandGradMKLDNNKernel<paddle::platform::bfloat16>);
\ No newline at end of file
diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
index 40cd3ba974f04..6a9aae046f386 100644
--- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
+++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h
@@ -165,23 +165,21 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel<T> {
       x_format_tag = getPlainFormatTag(output_dx);
     }
 
-    output_dx->mutable_data<T>(ctx.GetPlace());
     output_dx->set_format(x_format_tag);
-    output_dx->set_layout(input_dy->layout());
 
     platform::BroadcastDataMKLDNNHandler<T> handler(
         binary_type, dev_ctx, onednn_engine, ctx.GetPlace(), output_dx,
         input_dy, scale_x, scale_y,
         ctx.InputName(framework::GradVarName("Out")), input_dims);
 
-    const auto src_dx_memory = handler.AcquireSrcMemory(output_dx);
-    const auto src_dy_memory = handler.AcquireSecondSrcMemory(input_dy);
+    const auto src_memory_p = handler.AcquireSrcMemory(input_dy);
+    const auto dst_memory_p = handler.AcquireDstMemory(output_dx);
     const auto binary_prim = handler.AcquireForwardPrimitive();
 
     const std::unordered_map<int, dnnl::memory> args = {
-        {DNNL_ARG_SRC_0, *src_dx_memory},
-        {DNNL_ARG_SRC_1, *src_dy_memory},
-        {DNNL_ARG_DST, *src_dx_memory}};
+        {DNNL_ARG_SRC_0, *dst_memory_p},
+        {DNNL_ARG_SRC_1, *src_memory_p},
+        {DNNL_ARG_DST, *dst_memory_p}};
 
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     binary_prim->execute(astream, args);
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 58622fb2529b8..81cd2793f376d 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -695,8 +695,8 @@ class BroadcastDataMKLDNNHandler
   BroadcastDataMKLDNNHandler(const dnnl::algorithm algo,
                              const MKLDNNDeviceContext& dev_ctx,
                              const mkldnn::engine engine,
-                             platform::Place cpu_place, const Tensor* x,
-                             const Tensor* y, float scale_x, float scale_y,
+                             platform::Place cpu_place, const Tensor* out,
+                             const Tensor* x, float scale_x, float scale_y,
                              const std::string& uniq_name,
                              const std::vector<int64_t>& input_dims)
       : platform::MKLDNNHandlerT<T, dnnl::binary>(
@@ -711,19 +711,12 @@ class BroadcastDataMKLDNNHandler
           x->format(), MKLDNNMemoryFormat::undef,
           platform::errors::InvalidArgument("Wrong format set for X tensor."));
 
-      PADDLE_ENFORCE_EQ(
-          y->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument("Wrong layout set for Y tensor."));
-      PADDLE_ENFORCE_NE(
-          y->format(), MKLDNNMemoryFormat::undef,
-          platform::errors::InvalidArgument("Wrong format set for Y tensor."));
-
-      const auto src0_tz = framework::vectorize(x->dims());
+      const auto src0_tz = framework::vectorize(out->dims());
 
       const auto src0_md = dnnl::memory::desc(
-          src0_tz, platform::MKLDNNGetDataType<T>(), x->format());
+          src0_tz, platform::MKLDNNGetDataType<T>(), out->format());
       const auto src1_md = dnnl::memory::desc(
-          input_dims, platform::MKLDNNGetDataType<T>(), x->format());
+          input_dims, platform::MKLDNNGetDataType<T>(), out->format());
 
       dnnl::primitive_attr attributes;
       attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x});
@@ -733,19 +726,15 @@ class BroadcastDataMKLDNNHandler
                                               src1_md, src0_md);
     }
   }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(framework::Tensor* input) {
-    T* input_data = input->data<T>();
-    memset(input_data, 0, this->fwd_pd_->src_desc().get_size());
-    return this->AcquireMemoryFromPrimitive(
-        this->fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src0_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSecondSrcMemory(
-      const framework::Tensor* input) {
-    const T* input_data = input->data<T>();
-    return this->AcquireMemoryFromPrimitive(
-        this->fwd_pd_->src1_desc(), to_void_cast<T>(input_data), "@src1_mem_p");
+  
+  template <typename T_out = T>
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
+    T_out* ptr = output->mutable_data<T_out>(
+        this->place_, this->fwd_pd_->dst_desc().get_size());
+    // Zero-fill: callers also bind this buffer as DNNL_ARG_SRC_0.
+    memset(ptr, 0, this->fwd_pd_->dst_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr,
+                                            "@dst_mem_p");
   }
 };
 

From a40c25d957e2ce594026dc5f571d5f3b81801ec6 Mon Sep 17 00:00:00 2001
From: Jakub Piasecki <jakpia21@gmail.com>
Date: Tue, 20 Jul 2021 19:00:30 +0200
Subject: [PATCH 2/7] remove commented-out code from expand_v2 grad kernel

---
 paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
index 6e3c46a0588e4..a5969d0e44402 100644
--- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
@@ -102,11 +102,8 @@ class ExpandGradMKLDNNKernel : public framework::OpKernel<T> {
     if (dx_vec_dims.size() != dout_vec_dims.size()) {
       dx_vec_dims.insert(dx_vec_dims.begin(),
                          dout_vec_dims.size() - dx_vec_dims.size(), 1);
-      // dx_format_tag = GetExtendedFormatTag(x_vec_dims, out_vec_dims.size(),
-      // dx_format_tag);
     }
 
-    // out->set_format(x_format_tag);
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
     if (dout_vec_dims == dx_vec_dims) {
       mkldnn::memory::data_type dout_type =

From b9b34fe306f3c42e26d349ffe775f4bdfa8a092e Mon Sep 17 00:00:00 2001
From: Jakub Piasecki <jakpia21@gmail.com>
Date: Wed, 21 Jul 2021 14:40:03 +0200
Subject: [PATCH 3/7] CI fix: revert expand_op.cc, use Out@GRAD dtype, drop unused variable

---
 paddle/fluid/operators/expand_op.cc                | 14 +++-----------
 paddle/fluid/operators/expand_v2_op.cc             |  4 ++--
 .../fluid/operators/mkldnn/expand_v2_mkldnn_op.cc  |  3 +--
 paddle/fluid/platform/mkldnn_reuse.h               |  2 +-
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 02389c9baa537..e7da08ff27711 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -75,17 +75,9 @@ class ExpandOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-
-#ifdef PADDLE_WITH_MKLDNN
-    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
-                                     framework::DataLayout::kMKLDNN,
-                                     framework::LibraryType::kMKLDNN);
-    }
-#endif
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
   }
 
   framework::OpKernelType GetKernelTypeForVar(
diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc
index e6613453c2f59..3c2b939e79957 100644
--- a/paddle/fluid/operators/expand_v2_op.cc
+++ b/paddle/fluid/operators/expand_v2_op.cc
@@ -216,8 +216,8 @@ class ExpandV2GradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto input_data_type =
-        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
 
 #ifdef PADDLE_WITH_MKLDNN
     if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
index a5969d0e44402..efbeaa78941f3 100644
--- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
@@ -98,7 +98,6 @@ class ExpandGradMKLDNNKernel : public framework::OpKernel<T> {
     auto dx_vec_dims = framework::vectorize(dx->dims());
     auto dout_vec_dims = framework::vectorize(dout->dims());
 
-    dnnl::memory::format_tag dx_format_tag = dout->format();
     if (dx_vec_dims.size() != dout_vec_dims.size()) {
       dx_vec_dims.insert(dx_vec_dims.begin(),
                          dout_vec_dims.size() - dx_vec_dims.size(), 1);
@@ -160,4 +159,4 @@ REGISTER_OP_KERNEL(expand_v2, MKLDNN, paddle::platform::CPUPlace,
 
 REGISTER_OP_KERNEL(expand_v2_grad, MKLDNN, paddle::platform::CPUPlace,
                    ops::ExpandGradMKLDNNKernel<float>,
-                   ops::ExpandGradMKLDNNKernel<paddle::platform::bfloat16>);
\ No newline at end of file
+                   ops::ExpandGradMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 81cd2793f376d..f63d45d7ff6ae 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -726,7 +726,7 @@ class BroadcastDataMKLDNNHandler
                                               src1_md, src0_md);
     }
   }
-  
+
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr = output->mutable_data<T_out>(

From 3bd6438fe1bcdb405bcc197e086fce5b31af8e1f Mon Sep 17 00:00:00 2001
From: Jakub Piasecki <jakpia21@gmail.com>
Date: Fri, 23 Jul 2021 18:20:10 +0200
Subject: [PATCH 4/7] added missing test file

---
 .../mkldnn/test_expand_v2_mkldnn_op.py        | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py

diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
new file mode 100644
index 0000000000000..88b98a1fdca8e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard, core
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
+
+
+class TestExpandV2OneDNNOp(OpTest):
+    def setUp(self):
+        self.op_type = "expand_v2"
+        self.init_data()
+        self.x = np.random.random(self.ori_shape).astype("float32")
+        self.set_inputs()
+        self.attrs = {'shape': self.shape, 'use_mkldnn': True}
+        output = np.tile(self.inputs['X'], self.expand_times)
+        self.outputs = {'Out': output}
+
+    def set_inputs(self):
+        self.inputs = {'X': self.x}
+
+    def init_data(self):
+        self.ori_shape = [1, 140]
+        self.shape = [12, 140]
+        self.expand_times = [12, 1]
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    def test_check_grad(self):
+        self.check_grad_with_place(core.CPUPlace(), ["X"], "Out")
+
+
+class TestExpandV2ExpandDimOneDNNOp(TestExpandV2OneDNNOp):
+    def init_data(self):
+        self.ori_shape = [120]
+        self.shape = [2, 120]
+        self.expand_times = [2, 1]
+
+class TestExpandV2CopyScenarioOneDNNOp(TestExpandV2OneDNNOp):
+    def init_data(self):
+        self.ori_shape = (2, 10, 5)
+        self.shape = (2, 10, 5)
+        self.expand_times = (1, 1, 1)
+
+class TestExpandV2CopyScenarioShapeNotGivenOneDNNOp(TestExpandV2OneDNNOp):
+    def init_data(self):
+        self.ori_shape = (2, 4, 5, 7)
+        self.shape = (-1, -1, -1, -1)
+        self.expand_times = (1, 1, 1, 1)
+
+
+#   BF16 TESTS
+def create_expand_v2_bf16_test_class(parent):
+    @OpTestTool.skip_if_not_cpu_bf16()
+    class TestExpandV2BF16OneDNNOp(parent):
+        def set_inputs(self):
+            self.inputs = {"X": convert_float_to_uint16(self.x)}
+
+        def calculate_grads(self):
+            self.dout = self.outputs['Out']
+            self.dx = self.dout.copy()
+
+            for i in range (len(self.shape)):
+                if self.expand_times[i] != 1:
+                    self.dx = np.sum(self.dx, axis=i, keepdims=True)
+
+        def test_check_grad(self):
+            self.calculate_grads()
+            self.check_grad_with_place(
+                core.CPUPlace(), ["X"],
+                "Out",
+                user_defined_grads=[convert_float_to_uint16(self.dx)],
+                user_defined_grad_outputs=[convert_float_to_uint16(self.dout)])
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Expand_v2_BF16")
+    TestExpandV2BF16OneDNNOp.__name__ = cls_name
+    globals()[cls_name] = TestExpandV2BF16OneDNNOp
+
+
+create_expand_v2_bf16_test_class(TestExpandV2OneDNNOp)
+create_expand_v2_bf16_test_class(TestExpandV2ExpandDimOneDNNOp)
+create_expand_v2_bf16_test_class(TestExpandV2CopyScenarioOneDNNOp)
+create_expand_v2_bf16_test_class(TestExpandV2CopyScenarioShapeNotGivenOneDNNOp)
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()

From 0b7247fdfdd0f010bbf43cc3f40f5585f4b9dcec Mon Sep 17 00:00:00 2001
From: Jakub Piasecki <jakpia21@gmail.com>
Date: Fri, 23 Jul 2021 18:49:33 +0200
Subject: [PATCH 5/7] fix formatting in expand_v2 oneDNN test

---
 .../fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
index 88b98a1fdca8e..eceef665a0304 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
@@ -52,12 +52,14 @@ def init_data(self):
         self.shape = [2, 120]
         self.expand_times = [2, 1]
 
+
 class TestExpandV2CopyScenarioOneDNNOp(TestExpandV2OneDNNOp):
     def init_data(self):
         self.ori_shape = (2, 10, 5)
         self.shape = (2, 10, 5)
         self.expand_times = (1, 1, 1)
 
+
 class TestExpandV2CopyScenarioShapeNotGivenOneDNNOp(TestExpandV2OneDNNOp):
     def init_data(self):
         self.ori_shape = (2, 4, 5, 7)
@@ -76,7 +78,7 @@ def calculate_grads(self):
             self.dout = self.outputs['Out']
             self.dx = self.dout.copy()
 
-            for i in range (len(self.shape)):
+            for i in range(len(self.shape)):
                 if self.expand_times[i] != 1:
                     self.dx = np.sum(self.dx, axis=i, keepdims=True)
 

From 2b00c60438780cbce0e9b8daa4ca0a4178f50120 Mon Sep 17 00:00:00 2001
From: Jakub Piasecki <jakpia21@gmail.com>
Date: Wed, 28 Jul 2021 17:59:28 +0200
Subject: [PATCH 6/7] reduced binary size

---
 .../operators/mkldnn/expand_v2_mkldnn_op.cc   | 85 +++++++++----------
 .../mkldnn/test_expand_v2_mkldnn_op.py        |  2 +
 2 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
index efbeaa78941f3..ffd64a841ecb3 100644
--- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc
@@ -14,28 +14,30 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
-namespace paddle {
-namespace operators {
+namespace {
 
 using paddle::framework::Tensor;
+using paddle::framework::vectorize;
+using paddle::framework::GradVarName;
+using paddle::framework::ExecutionContext;
+using paddle::platform::MKLDNNDeviceContext;
 
 template <typename T>
-class ExpandMKLDNNKernel : public framework::OpKernel<T> {
+class ExpandMKLDNNKernel : public paddle::framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const ExecutionContext& ctx) const override {
     this->RunKernel(ctx);
   }
 
-  void RunKernel(const framework::ExecutionContext& ctx) const {
-    const auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
+  void RunKernel(const ExecutionContext& ctx) const {
+    const auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     const auto& onednn_engine = dev_ctx.GetEngine();
 
     const auto* x = ctx.Input<Tensor>("X");
     auto* out = ctx.Output<Tensor>("Out");
 
-    auto x_vec_dims = framework::vectorize(x->dims());
-    auto out_vec_dims = framework::vectorize(out->dims());
+    auto x_vec_dims = vectorize(x->dims());
+    auto out_vec_dims = vectorize(out->dims());
 
     dnnl::memory::format_tag x_format_tag = x->format();
     if (x_vec_dims.size() != out_vec_dims.size()) {
@@ -45,7 +47,7 @@ class ExpandMKLDNNKernel : public framework::OpKernel<T> {
 
     out->set_format(x_format_tag);
 
-    platform::BroadcastDataMKLDNNHandler<T> handler(
+    paddle::platform::BroadcastDataMKLDNNHandler<T> handler(
         dnnl::algorithm::binary_add, dev_ctx, onednn_engine, ctx.GetPlace(),
         out, x, 0.0f, 1.0f, ctx.InputName("X"), x_vec_dims);
 
@@ -58,62 +60,62 @@ class ExpandMKLDNNKernel : public framework::OpKernel<T> {
         {DNNL_ARG_SRC_1, *src_memory_p},
         {DNNL_ARG_DST, *dst_memory_p}};
 
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    auto& astream = MKLDNNDeviceContext::tls().get_stream();
     binary_p->execute(astream, args);
     astream.wait();
 
-    out->set_layout(framework::DataLayout::kMKLDNN);
-    out->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
+    out->set_layout(paddle::framework::DataLayout::kMKLDNN);
+    out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
   }
 
  private:
   dnnl::memory::format_tag GetExtendedFormatTag(
       std::vector<int64_t>& dims, int new_size,
       mkldnn::memory::format_tag format_tag) const {
-    mkldnn::memory::desc md(dims, platform::MKLDNNGetDataType<T>(), format_tag);
+    mkldnn::memory::desc md(dims, paddle::platform::MKLDNNGetDataType<T>(),
+                            format_tag);
     std::vector<int64_t> new_dims(new_size, 1);
     std::copy(dims.begin(), dims.end(),
               new_dims.begin() + new_size - dims.size());
 
     dims = std::move(new_dims);
-    return platform::GetMKLDNNFormat(md.reshape(dims));
+    return paddle::platform::GetMKLDNNFormat(md.reshape(dims));
   }
 };
 
 template <typename T>
-class ExpandGradMKLDNNKernel : public framework::OpKernel<T> {
+class ExpandGradMKLDNNKernel : public paddle::framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const ExecutionContext& ctx) const override {
     this->RunKernel(ctx);
   }
 
-  void RunKernel(const framework::ExecutionContext& ctx) const {
-    const auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
+  void RunKernel(const ExecutionContext& ctx) const {
+    const auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     const auto& onednn_engine = dev_ctx.GetEngine();
 
-    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dout = ctx.Input<Tensor>(GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(GradVarName("X"));
 
-    auto dx_vec_dims = framework::vectorize(dx->dims());
-    auto dout_vec_dims = framework::vectorize(dout->dims());
+    auto dx_vec_dims = vectorize(dx->dims());
+    auto dout_vec_dims = vectorize(dout->dims());
 
     if (dx_vec_dims.size() != dout_vec_dims.size()) {
       dx_vec_dims.insert(dx_vec_dims.begin(),
                          dout_vec_dims.size() - dx_vec_dims.size(), 1);
     }
 
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    auto& astream = MKLDNNDeviceContext::tls().get_stream();
     if (dout_vec_dims == dx_vec_dims) {
       mkldnn::memory::data_type dout_type =
-          framework::ToMKLDNNDataType(dout->type());
-      std::string key = platform::CreateKey(
+          paddle::framework::ToMKLDNNDataType(dout->type());
+      std::string key = paddle::platform::CreateKey(
           dev_ctx, dout_vec_dims, dout->format(), dout->format(), dout_type);
-      platform::ReorderMKLDNNHandler reorder_handler(
+      paddle::platform::ReorderMKLDNNHandler reorder_handler(
           dout_vec_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key);
 
       auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
-          dout->format(), platform::to_void_cast(dout->data<T>()));
+          dout->format(), paddle::platform::to_void_cast(dout->data<T>()));
 
       auto reorder_dst_memory_p =
           reorder_handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace());
@@ -124,11 +126,11 @@ class ExpandGradMKLDNNKernel : public framework::OpKernel<T> {
       reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
       astream.wait();
 
-      dx->set_layout(framework::DataLayout::kMKLDNN);
+      dx->set_layout(paddle::framework::DataLayout::kMKLDNN);
       dx->set_format(
-          platform::GetMKLDNNFormat(reorder_dst_memory_p->get_desc()));
+          paddle::platform::GetMKLDNNFormat(reorder_dst_memory_p->get_desc()));
     } else {
-      platform::ReductionMKLDNNHandler<T> handler(
+      paddle::platform::ReductionMKLDNNHandler<T> handler(
           dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine,
           ctx.GetPlace(), dout, dx, ctx.InputName("X"), dx_vec_dims);
 
@@ -142,21 +144,18 @@ class ExpandGradMKLDNNKernel : public framework::OpKernel<T> {
 
       reduction_p->execute(astream, reduction_args);
       astream.wait();
-      dx->set_layout(framework::DataLayout::kMKLDNN);
-      dx->set_format(platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape(
-          paddle::framework::vectorize<int64_t>(dx->dims()))));
+      dx->set_layout(paddle::framework::DataLayout::kMKLDNN);
+      dx->set_format(paddle::platform::GetMKLDNNFormat(
+          dst_memory_p->get_desc().reshape(vectorize<int64_t>(dx->dims()))));
     }
   }
 };
+}  // anonymous namespace
 
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(expand_v2, MKLDNN, paddle::platform::CPUPlace,
-                   ops::ExpandMKLDNNKernel<float>,
-                   ops::ExpandMKLDNNKernel<paddle::platform::bfloat16>);
+                   ExpandMKLDNNKernel<float>,
+                   ExpandMKLDNNKernel<paddle::platform::bfloat16>);
 
 REGISTER_OP_KERNEL(expand_v2_grad, MKLDNN, paddle::platform::CPUPlace,
-                   ops::ExpandGradMKLDNNKernel<float>,
-                   ops::ExpandGradMKLDNNKernel<paddle::platform::bfloat16>);
+                   ExpandGradMKLDNNKernel<float>,
+                   ExpandGradMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
index eceef665a0304..63c87ce11507d 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
@@ -21,6 +21,8 @@
 from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
 
 
+@OpTestTool.skip_if(core.is_compiled_with_cuda(),
+                    "CUDA required dygraph so oneDNN UT must be skipped")
 class TestExpandV2OneDNNOp(OpTest):
     def setUp(self):
         self.op_type = "expand_v2"

From aee5afb0bb3ab984fb7bd4a5e906e2a507a8dc6e Mon Sep 17 00:00:00 2001
From: Jakub Piasecki <jakpia21@gmail.com>
Date: Wed, 28 Jul 2021 18:10:27 +0200
Subject: [PATCH 7/7] CI fix

---
 .../fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
index 63c87ce11507d..51d7fe971674d 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_expand_v2_mkldnn_op.py
@@ -30,7 +30,7 @@ def setUp(self):
         self.x = np.random.random(self.ori_shape).astype("float32")
         self.set_inputs()
         self.attrs = {'shape': self.shape, 'use_mkldnn': True}
-        output = np.tile(self.inputs['X'], self.expand_times)
+        output = np.tile(self.x, self.expand_times)
         self.outputs = {'Out': output}
 
     def set_inputs(self):
@@ -90,7 +90,7 @@ def test_check_grad(self):
                 core.CPUPlace(), ["X"],
                 "Out",
                 user_defined_grads=[convert_float_to_uint16(self.dx)],
-                user_defined_grad_outputs=[convert_float_to_uint16(self.dout)])
+                user_defined_grad_outputs=[self.dout])
 
     cls_name = "{0}_{1}".format(parent.__name__, "Expand_v2_BF16")
     TestExpandV2BF16OneDNNOp.__name__ = cls_name