ResnetUnitOp implemented by cuDNN fused op(backend code) (#35557)

PaddlePaddle · Sep 22, 2021 · 736a738 · 736a738
1 parent 482f062
commit 736a738
Show file tree

Hide file tree

Showing 6 changed files with 602 additions and 10 deletions.
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -78,4 +78,7 @@ if (WITH_GPU OR WITH_ROCM)
         nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
         nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory)
     endif()
+    # Register the cuDNN norm-conv unit test only for non-ROCm builds whose
+    # CUDNN_VERSION is not less than 8000.
+    if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000))
+        cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory)
+    endif()
 endif()
diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h
@@ -0,0 +1,162 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/cudnn_desc.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/dynload/cudnn.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+namespace dynload = platform::dynload;
+
+#if CUDNN_VERSION >= 8000
+
+// A wrapper for the cuDNN fused-ops API.
+//
+// An instance owns one fused-ops plan together with its const and variant
+// parameter packs. All three are created for the `op_id` passed to the
+// constructor and destroyed in the destructor.
+//
+// Call order enforced by Execute():
+//   1. SetOpConstParamDesc() / SetOpConstParamAttr() to fill the const pack,
+//   2. GetWorkspaceSizeInBytes() to build the plan,
+//   3. SetOpVariantParamAttrPtr() to bind pointers, then Execute().
+// Every const-param setter clears `plan_created_`, so the plan has to be
+// rebuilt with GetWorkspaceSizeInBytes() before the next Execute().
+class CudnnFusionOp {
+ public:
+  // Creates the plan, the const param pack and the variant param pack for
+  // the fused op identified by `op_id`.
+  explicit CudnnFusionOp(cudnnFusedOps_t op_id) : plan_created_(false) {
+    // New 'fused op' descriptor creation
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsPlan(&op_, op_id));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnCreateFusedOpsConstParamPack(&op_const_params_, op_id));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFusedOpsVariantParamPack(
+        &op_variant_params_, op_id));
+  }
+
+  // The handles below are destroyed exactly once, in the destructor. An
+  // implicit copy would duplicate the raw handles and destroy them twice, so
+  // copying is disabled.
+  CudnnFusionOp(const CudnnFusionOp &) = delete;
+  CudnnFusionOp &operator=(const CudnnFusionOp &) = delete;
+
+  // Destroys the packs and the plan in reverse order of creation.
+  // NOTE(review): if PADDLE_ENFORCE_CUDA_SUCCESS throws on failure, a failing
+  // destroy call here would terminate the process because destructors are
+  // implicitly noexcept -- confirm this is the intended behaviour.
+  ~CudnnFusionOp() {
+    // New 'fused op' descriptor destruction
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_));
+  }
+
+  // Execute fused op. Fails the enforce check unless a plan has been built
+  // by GetWorkspaceSizeInBytes() after the last const-param change.
+  void Execute(cudnnHandle_t cudnn_handle) {
+    PADDLE_ENFORCE_EQ(
+        plan_created_, true,
+        platform::errors::Fatal(
+            "CudnnFusionOp exec requested without a valid 'plan', need: "
+            "<set const params>, GetWorkspaceSizeInBytes(), Execute()."));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnFusedOpsExecute(cudnn_handle, op_, op_variant_params_));
+  }
+
+  // Set const param pack attribute given a descriptor.
+  // `param_ptr` is forwarded unchanged to cuDNN; the plan is invalidated.
+  template <typename T>
+  void SetOpConstParamDesc(cudnnFusedOpsConstParamLabel_t param_label,
+                           T *param_ptr) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnSetFusedOpsConstParamPackAttribute(
+            op_const_params_, param_label, param_ptr));
+    plan_created_ = false;
+  }
+
+  // Set multiple const param pack attributes to the same descriptor, in the
+  // order the labels are listed.
+  template <typename T>
+  void SetOpConstParamDesc(
+      const std::vector<cudnnFusedOpsConstParamLabel_t> &param_labels,
+      T *param_ptr) {
+    for (auto param_label : param_labels) {
+      SetOpConstParamDesc(param_label, param_ptr);
+    }
+  }
+
+  // Set const param pack attribute given a value of param.
+  // The address of the by-value copy `param` is what gets handed to cuDNN;
+  // the plan is invalidated.
+  template <typename T>
+  void SetOpConstParamAttr(cudnnFusedOpsConstParamLabel_t param_label,
+                           T param) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnSetFusedOpsConstParamPackAttribute(op_const_params_,
+                                                         param_label, &param));
+    plan_created_ = false;
+  }
+
+  // Set multiple const param pack attributes to the same value, in the order
+  // the labels are listed.
+  template <typename T>
+  void SetOpConstParamAttr(
+      const std::vector<cudnnFusedOpsConstParamLabel_t> &param_labels,
+      T param) {
+    for (auto param_label : param_labels) {
+      SetOpConstParamAttr(param_label, param);
+    }
+  }
+
+  // Set a variant param pack attribute given a pointer to a param.
+  // Variant params do not invalidate the plan.
+  template <typename T>
+  void SetOpVariantParamAttrPtr(cudnnFusedOpsVariantParamLabel_t param_label,
+                                T *param_ptr) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnSetFusedOpsVariantParamPackAttribute(
+            op_variant_params_, param_label, param_ptr));
+  }
+
+  // Set multiple variant param pack attributes to the same pointer, in the
+  // order the labels are listed. The pointer is taken as `T *` (not
+  // `const T *`) so that it can be forwarded to the single-label overload.
+  template <typename T>
+  void SetOpVariantParamAttrPtr(
+      const std::vector<cudnnFusedOpsVariantParamLabel_t> &param_labels,
+      T *param_ptr) {
+    for (auto param_label : param_labels) {
+      SetOpVariantParamAttrPtr(param_label, param_ptr);
+    }
+  }
+
+  // Builds the plan from the current const params and returns the workspace
+  // size (in bytes) reported by cuDNN. Must be called before Execute().
+  size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) {
+    size_t workspace_bytes = 0U;
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
+        cudnn_handle, op_, op_const_params_, &workspace_bytes));
+    plan_created_ = true;
+    return workspace_bytes;
+  }
+
+ private:
+  // True only between a successful GetWorkspaceSizeInBytes() and the next
+  // const-param change.
+  bool plan_created_;
+
+  cudnnFusedOpsPlan_t op_;
+  cudnnFusedOpsConstParamPack_t op_const_params_;
+  cudnnFusedOpsVariantParamPack_t op_variant_params_;
+};
+
+// Computes per-dimension strides for a tensor whose `shape` is listed with
+// dimension 1 second (e.g. {N, C, H, W}) but whose dimension 1 is the
+// fastest-varying one in memory (e.g. N, H, W, C order).
+//
+// The returned vector has the same length and order as `shape`:
+//   - strides[1] is always 1,
+//   - the trailing dimensions get increasing strides from last to first,
+//     starting at shape[1],
+//   - strides[0] is the product of all other extents.
+// Example: {2, 3, 4, 5} -> {60, 1, 15, 3}.
+//
+// An empty shape yields an empty result. A rank-1 shape yields {1}; it has no
+// dimension 1 to move, so it is handled up front instead of indexing past
+// the end of the vector.
+static inline std::vector<int> GetStrides(const std::vector<int> &shape) {
+  if (shape.empty()) {
+    return {};
+  }
+  const int dim = static_cast<int>(shape.size());
+  std::vector<int> strides(dim, 1);
+  if (dim < 2) {
+    return strides;
+  }
+  // Dimension 1 is innermost (stride 1); walk the trailing dimensions from
+  // last to first, accumulating the extent covered so far.
+  int extent = shape[1];
+  for (int i = dim - 1; i >= 2; --i) {
+    strides[i] = extent;
+    extent *= shape[i];
+  }
+  strides[0] = extent;
+  return strides;
+}
+
+// Returns (a + b - 1) / b using integer division. For positive `a` and `b`
+// this is the number of `b`-sized chunks needed to cover `a`.
+// Note: despite the name, the result is that chunk count, not `a` rounded up
+// to a multiple of `b`.
+static inline int64_t AlignUp(int64_t a, int64_t b) {
+  const int64_t biased = a + b - 1;
+  return biased / b;
+}
+
+#endif  // CUDNN_VERSION >= 8000
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+namespace dynload = platform::dynload;
+
+#if CUDNN_VERSION >= 8000
+// Drives a cuDNN fused op created with
+// CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS.
+//
+// Usage: call Init() once with the shapes and convolution settings, then
+// Forward() with the data pointers. `T` is the element type used for the
+// input, filter and output descriptors; the output statistics descriptor
+// uses the cuDNN data type for float. All descriptors use the
+// CUDNN_TENSOR_NHWC format.
+template <typename T>
+class CudnnNormConvolutionOp {
+ public:
+  CudnnNormConvolutionOp()
+      : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {}
+  ~CudnnNormConvolutionOp() {}
+
+  // Records the data types and tensor format, fills the fused op's const
+  // params from the given shapes / conv settings, and queries the workspace
+  // size (which also builds the plan).
+  // `pad`, `stride` and `dilate` are each applied to both spatial dimensions.
+  void Init(const platform::CUDADeviceContext &ctx,
+            const std::vector<int> &input_shape,
+            const std::vector<int> &filter_shape,
+            const std::vector<int> &output_shape, const int &pad,
+            const int &stride, const int &dilate, const int &group) {
+    format_ = CUDNN_TENSOR_NHWC;
+    dtype_ = platform::CudnnDataType<T>::type;
+    cudnn_fwd_compute_type_ = platform::CudnnDataType<float>::type;
+
+    InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride,
+                    dilate, group);
+    GetWorkspaceSize(ctx);
+  }
+
+  // Binds the given pointers as the fused op's variant params and executes it
+  // inside the context's cuDNN workspace, requesting `fwd_workspace_byte_`
+  // bytes.
+  //   input_ptr / filter_ptr      -> XDATA / WDATA
+  //   output_ptr                  -> YDATA
+  //   sum_ptr / sum_of_squares_ptr -> YSUM / YSQSUM
+  void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr,
+               T *filter_ptr, T *output_ptr, float *sum_ptr,
+               float *sum_of_squares_ptr) {
+    auto cudnn_handle = ctx.cudnn_handle();
+    auto workspace = ctx.cudnn_workspace_handle();
+
+    // Inputs, including the workspace size scalar.
+    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
+    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
+    fwd_op_.SetOpVariantParamAttrPtr(
+        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_);
+
+    // Outputs.
+    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
+    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
+    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
+
+    // The workspace pointer is only known inside RunFunc, so it is bound
+    // there, immediately before execution.
+    auto run_fused_op = [&](void *workspace_ptr) {
+      fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
+      fwd_op_.Execute(cudnn_handle);
+    };
+    workspace.RunFunc(run_fused_op, fwd_workspace_byte_);
+  }
+
+  // TBD
+  void Backward(const platform::CUDADeviceContext &ctx) {}
+
+ private:
+  // Fills the fused op's const param pack: pointer placeholders, the
+  // convolution descriptor, the input / filter / output / output-stats
+  // tensor descriptors, and the batch-norm mode.
+  void InitDescriptors(const platform::CUDADeviceContext &ctx,
+                       const std::vector<int> &input_shape,
+                       const std::vector<int> &filter_shape,
+                       const std::vector<int> &output_shape, const int &pad,
+                       const int &stride, const int &dilate, const int &group) {
+    // Every data pointer placeholder is declared as 16-byte aligned.
+    fwd_op_.SetOpConstParamAttr(
+        {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
+         CUDNN_PARAM_YDATA_PLACEHOLDER, CUDNN_PARAM_YSUM_PLACEHOLDER,
+         CUDNN_PARAM_YSQSUM_PLACEHOLDER},
+        CUDNN_PTR_16B_ALIGNED);
+
+    // Convolution: the same scalar setting is used for both spatial dims.
+    std::vector<int> paddings(2, pad);
+    std::vector<int> strides(2, stride);
+    std::vector<int> dilations(2, dilate);
+    conv_desc_.set(dtype_, paddings, strides, dilations, false, group);
+    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc());
+
+    // Input tensor.
+    in_desc_.set(input_shape, format_, dtype_);
+    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc());
+
+    // Filter.
+    filter_desc_.set(filter_shape, format_, dtype_, group);
+    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc());
+
+    // Output tensor.
+    out_desc_.set(output_shape, format_, dtype_);
+    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc());
+
+    // Output statistics: shape {1, 1, 1, filter_shape[0]}, described with the
+    // compute type rather than T.
+    std::vector<int> stats_dims = {1, 1, 1, filter_shape[0]};
+    out_stats_desc_.set(stats_dims, format_, cudnn_fwd_compute_type_);
+    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
+                                out_stats_desc_.desc());
+
+    fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL);
+  }
+
+  // Caches the workspace size reported by the fused op for this context's
+  // cuDNN handle.
+  void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) {
+    fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(ctx.cudnn_handle());
+  }
+
+  // Workspace size in bytes, as last returned by GetWorkspaceSizeInBytes().
+  size_t fwd_workspace_byte_ = 0;
+
+  // Set in Init(): cuDNN data type for T and for float respectively.
+  cudnnDataType_t dtype_;
+  cudnnDataType_t cudnn_fwd_compute_type_;
+  platform::TensorDescriptor in_desc_;
+  platform::FilterDescriptor filter_desc_;
+  platform::TensorDescriptor out_desc_;
+  platform::TensorDescriptor out_stats_desc_;
+  platform::ConvolutionDescriptor conv_desc_;
+  // Set in Init() to CUDNN_TENSOR_NHWC.
+  cudnnTensorFormat_t format_;
+
+  CudnnFusionOp fwd_op_;
+};
+#endif
+}  // namespace operators
+}  // namespace paddle