Added PRelu BF16/FP32 FWD/BWD kernels (#33878)

* added prelu bf16/fp32 fwd/bwd kernel
PaddlePaddle · Jul 7, 2021 · 375e561 · 375e561
1 parent a0666b9
commit 375e561
Show file tree

Hide file tree

Showing 6 changed files with 433 additions and 12 deletions.
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2262,11 +2262,26 @@ PDNode *patterns::QuantizePlacement::operator()(
 PDNode *patterns::Bfloat16Placement::operator()(
     const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
   std::unordered_set<std::string> supported_op_types =
-      std::unordered_set<std::string>(
-          {"concat", "conv2d", "conv2d_transpose", "elementwise_add",
-           "elementwise_mul", "fc", "fusion_gru", "fusion_lstm", "gelu",
-           "layer_norm", "matmul", "matmul_v2", "pool2d", "relu", "reshape2",
-           "softmax", "split", "sum", "transpose2"});
+      std::unordered_set<std::string>({"concat",
+                                       "conv2d",
+                                       "conv2d_transpose",
+                                       "elementwise_add",
+                                       "elementwise_mul",
+                                       "fc",
+                                       "fusion_gru",
+                                       "fusion_lstm",
+                                       "gelu",
+                                       "layer_norm",
+                                       "matmul",
+                                       "matmul_v2",
+                                       "pool2d",
+                                       "prelu",
+                                       "relu",
+                                       "reshape2",
+                                       "softmax",
+                                       "split",
+                                       "sum",
+                                       "transpose2"});
   if (!bfloat16_enabled_op_types.empty()) {
     supported_op_types = bfloat16_enabled_op_types;
   }

diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc
@@ -0,0 +1,187 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using dnnl::memory;
+using framework::Tensor;
+using platform::GetMKLDNNFormat;
+using platform::MKLDNNDeviceContext;
+using platform::MKLDNNGetDataType;
+using platform::to_void_cast;
+
+namespace {
+// Handler that prepares the oneDNN PReLU primitive descriptors and wraps the
+// Alpha (weights) tensor and its gradient into oneDNN memory objects.
+//
+// The base handler is given a key built in the constructor, and descriptors
+// are only created when isCached() reports false, so the key has to cover
+// every input that shapes those descriptors.
+template <typename T>
+class PReluMKLDNNHandler
+    : public platform::MKLDNNHandlerT<T, dnnl::prelu_forward,
+                                      dnnl::prelu_backward> {
+ public:
+  // x         - input tensor; its dims and format define the source desc.
+  // weights   - Alpha tensor; its dims (possibly expanded, see below) define
+  //             the weights desc.
+  // uniq_name - caller-supplied string mixed into the key.
+  // mode      - value of the "mode" attribute; only "channel" is
+  //             special-cased when Alpha has a different rank than X.
+  // is_test   - when true, no backward primitive descriptor is created.
+  PReluMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+                     const dnnl::engine engine, platform::Place cpu_place,
+                     const Tensor* x, const Tensor* weights,
+                     const std::string& uniq_name, const std::string& mode,
+                     bool is_test = false)
+      : platform::MKLDNNHandlerT<T, dnnl::prelu_forward, dnnl::prelu_backward>(
+            dev_ctx, engine, cpu_place,
+            // `mode` and the Alpha dims determine weights_md below, so they
+            // belong in the key; otherwise two ops that agree only on X dims
+            // and `uniq_name` would pick up each other's descriptors.
+            platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), mode,
+                                framework::vectorize(weights->dims()),
+                                uniq_name)) {
+    if (!this->isCached()) {
+      auto x_md = memory::desc(framework::vectorize(x->dims()),
+                               MKLDNNGetDataType<T>(), x->format());
+
+      auto weights_dims = framework::vectorize(weights->dims());
+
+      // Alpha has the same rank as X only in the "element" case. Otherwise
+      // describe it as an all-ones shape of X's rank; for "channel" mode the
+      // largest Alpha dim is placed on axis 1.
+      // NOTE(review): this assumes X has rank >= 2 in "channel" mode and that
+      // Alpha has at least one dim - presumably enforced by the op's shape
+      // inference; confirm.
+      if (weights->dims().size() != x->dims().size()) {
+        auto new_weights_dims = std::vector<int64_t>(x->dims().size(), 1);
+        if (mode == "channel") {
+          new_weights_dims[1] =
+              *std::max_element(weights_dims.begin(), weights_dims.end());
+        }
+        weights_dims = std::move(new_weights_dims);
+      }
+      // format_tag::any leaves the choice of weights layout to oneDNN.
+      auto weights_md = memory::desc(weights_dims, MKLDNNGetDataType<T>(),
+                                     memory::format_tag::any);
+
+      this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training,
+                                              x_md, weights_md);
+      if (!is_test) {
+        this->AcquireBackwardPrimitiveDescriptor(x_md, weights_md, x_md,
+                                                 weights_md);
+      }
+    }
+  }
+
+  // Returns oneDNN memory for the Alpha tensor `input`, laid out as the
+  // forward primitive descriptor expects. A 1-D Alpha is wrapped directly;
+  // any other rank goes through AcquireMemoryWithReorder, to which `is_test`
+  // is forwarded.
+  std::shared_ptr<memory> AcquireWeightsMemoryPossiblyWithReorder(
+      const Tensor* input, const bool is_test) {
+    const T* input_data = input->data<T>();
+
+    // if weights are 1D, every format tag is correct, so we accept
+    // format_tag::any's output and no reorder is needed
+    if (input->dims().size() == 1) {
+      return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(),
+                                              to_void_cast<T>(input_data),
+                                              "@alpha_mem_p");
+    }
+
+    auto user_weights_md =
+        memory::desc(framework::vectorize(input->dims()),
+                     MKLDNNGetDataType<T>(), input->format());
+    return this->AcquireMemoryWithReorder(
+        user_weights_md, this->fwd_pd_->weights_desc(),
+        to_void_cast<T>(input_data), "@alpha_mem_p", is_test);
+  }
+
+  // Allocates `output` with the byte size of the backward descriptor's
+  // diff-weights desc and wraps that buffer as the dAlpha memory.
+  std::shared_ptr<memory> AcquireDiffWeightsMemory(Tensor* output) {
+    T* output_data = output->mutable_data<T>(
+        this->place_, this->bwd_pd_->diff_weights_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(this->bwd_pd_->diff_weights_desc(),
+                                            output_data, "@diff_weights_mem_p");
+  }
+};
+}  // anonymous namespace
+
+// oneDNN forward kernel for the prelu operator.
+template <typename T>
+class PReluMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  // Operator entry point; simply forwards to RunKernel().
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  // Runs the forward primitive on inputs "X" and "Alpha", writes "Out" and
+  // marks it with the kMKLDNN layout and the destination memory's format.
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const auto mode = ctx.Attr<std::string>("mode");
+
+    const auto* input = ctx.Input<Tensor>("X");
+    const auto* slope = ctx.Input<Tensor>("Alpha");
+    auto* output = ctx.Output<Tensor>("Out");
+
+    const auto& device = ctx.template device_context<MKLDNNDeviceContext>();
+    PReluMKLDNNHandler<T> handler(device, device.GetEngine(), ctx.GetPlace(),
+                                  input, slope, ctx.InputName("X"), mode,
+                                  is_test);
+
+    // Acquisition order: source, weights, destination, then the primitive.
+    const auto src_mem = handler.AcquireSrcMemory(input);
+    const auto alpha_mem =
+        handler.AcquireWeightsMemoryPossiblyWithReorder(slope, is_test);
+    const auto dst_mem = handler.AcquireDstMemory(output);
+    const auto forward_prim = handler.AcquireForwardPrimitive();
+
+    auto& stream = MKLDNNDeviceContext::tls().get_stream();
+    forward_prim->execute(stream, {{DNNL_ARG_SRC, *src_mem},
+                                   {DNNL_ARG_WEIGHTS, *alpha_mem},
+                                   {DNNL_ARG_DST, *dst_mem}});
+    stream.wait();
+
+    output->set_layout(framework::DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(*dst_mem));
+  }
+};
+
+// oneDNN backward kernel for the prelu operator: computes the gradients with
+// respect to X and Alpha from X, Alpha and the gradient of Out.
+template <typename T>
+class PReluGradMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  // Operator entry point; simply forwards to RunKernel().
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  // Executes the backward primitive and tags dX with the kMKLDNN layout and
+  // the format of the diff-src memory.
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& onednn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dalpha = ctx.Output<Tensor>(framework::GradVarName("Alpha"));
+    auto* alpha = ctx.Input<Tensor>("Alpha");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const auto mode = ctx.Attr<std::string>("mode");
+
+    // Build the handler's unique name from the actual X variable rather than
+    // from the slot name: GradVarName("X") is the same string for every
+    // prelu_grad op, so ops with equal X dims but a different mode or Alpha
+    // shape would otherwise share one handler key.
+    PReluMKLDNNHandler<T> handler(dev_ctx, onednn_engine, ctx.GetPlace(), x,
+                                  alpha,
+                                  framework::GradVarName(ctx.InputName("X")),
+                                  mode);
+
+    auto src_memory_p = handler.AcquireSrcMemory(x);
+    auto weights_memory_p =
+        handler.AcquireWeightsMemoryPossiblyWithReorder(alpha, is_test);
+    auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx);
+    auto diff_weights_memory_p = handler.AcquireDiffWeightsMemory(dalpha);
+    auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout);
+    auto prelu_p = handler.AcquireBackwardPrimitive();
+
+    auto& astream = MKLDNNDeviceContext::tls().get_stream();
+    prelu_p->execute(astream,
+                     {{DNNL_ARG_SRC, *src_memory_p},
+                      {DNNL_ARG_WEIGHTS, *weights_memory_p},
+                      {DNNL_ARG_DIFF_DST, *diff_dst_memory_p},
+                      {DNNL_ARG_DIFF_SRC, *diff_src_memory_p},
+                      {DNNL_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}});
+    astream.wait();
+
+    dx->set_layout(framework::DataLayout::kMKLDNN);
+    dx->set_format(GetMKLDNNFormat(*diff_src_memory_p));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Forward prelu kernels for the MKLDNN library on CPU: float and bfloat16.
+REGISTER_OP_KERNEL(prelu, MKLDNN, plat::CPUPlace,
+                   ops::PReluMKLDNNKernel<float>,
+                   ops::PReluMKLDNNKernel<plat::bfloat16>);
+
+// Backward prelu kernels, registered for the same element types.
+REGISTER_OP_KERNEL(prelu_grad, MKLDNN, plat::CPUPlace,
+                   ops::PReluGradMKLDNNKernel<float>,
+                   ops::PReluGradMKLDNNKernel<plat::bfloat16>);
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
@@ -95,9 +95,17 @@ class PReluOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-        ctx.device_context());
+    // Kernel data type follows input X; the MKLDNN kernel wins when usable.
+    const auto x_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+#ifdef PADDLE_WITH_MKLDNN
+    const bool run_on_mkldnn = this->CanMKLDNNBeUsed(ctx, x_dtype);
+    if (run_on_mkldnn) {
+      return framework::OpKernelType(x_dtype, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(x_dtype, ctx.GetPlace());
   }
 };
 
@@ -126,6 +134,18 @@ There are modes:
 )DOC");
     AddAttr<std::string>("mode", "The mode for inputs to share weights.")
         .SetDefault("all");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "bfloat16"});
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
   }
 };
 
@@ -153,9 +173,17 @@ class PReluGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-        ctx.device_context());
+    // Grad kernel is keyed on X's data type; use MKLDNN when it is usable.
+    const auto x_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+#ifdef PADDLE_WITH_MKLDNN
+    const bool run_on_mkldnn = this->CanMKLDNNBeUsed(ctx, x_dtype);
+    if (run_on_mkldnn) {
+      return framework::OpKernelType(x_dtype, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(x_dtype, ctx.GetPlace());
   }
 };