This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[master] Implemented oneDNN Backward Adaptive Pooling kernel #20825

Merged
Changes from all commits
38 commits
30886fe
feat: started working on integrating adaptive pooling merge to master
piotrwolinski-intel Nov 8, 2021
c3e8e68
feat: added onednn adaptive pooling to master
piotrwolinski-intel Nov 10, 2021
ce8f001
fix: changed SupportDNNLPooling condition to SupportDNNL in adaptive_…
piotrwolinski-intel Nov 15, 2021
54bc2a0
clang fix
piotrwolinski-intel Nov 15, 2021
8f96f32
changed typo in comment
piotrwolinski-intel Nov 15, 2021
0df5a32
finished adaptive backward pooling on master
piotrwolinski-intel Nov 22, 2021
daaa9d3
Been working on solving issue with backward adaptive pooling
piotrwolinski-intel Dec 13, 2021
d4da970
Unified oneDNN pooling implementation calls
piotrwolinski-intel Dec 13, 2021
3ef827a
Added include for vector
piotrwolinski-intel Dec 13, 2021
f16638a
feat: added onednn adaptive pooling to master
piotrwolinski-intel Nov 10, 2021
c4e173f
fix: changed SupportDNNLPooling condition to SupportDNNL in adaptive_…
piotrwolinski-intel Nov 15, 2021
debb768
finished adaptive backward pooling on master
piotrwolinski-intel Nov 22, 2021
724ee52
Been working on solving issue with backward adaptive pooling
piotrwolinski-intel Dec 13, 2021
5e176d5
Further working on implementing backward propagation to adaptive pooling
piotrwolinski-intel Dec 20, 2021
e2508c9
MXNetError: could not execute a primitive dnnl_pooling-inl.h:248
piotrwolinski-intel Dec 22, 2021
30c52b5
Finished adaptive pooling backward kernel. Only need to remove debug …
piotrwolinski-intel Dec 28, 2021
8954371
Changed 2 template functions for standard ones. Added is_adaptive_poo…
piotrwolinski-intel Dec 29, 2021
1329bf6
Tests tweaks
piotrwolinski-intel Dec 29, 2021
8b95412
Removed unnecessary comments
piotrwolinski-intel Jan 3, 2022
f5f0e01
Added clang style. Changed unnecessary param field for a member funct…
piotrwolinski-intel Jan 3, 2022
2aed1a8
Removed ceil and floor functions that were used on the integer values
piotrwolinski-intel Jan 3, 2022
689fbda
Added more shapes to test
piotrwolinski-intel Jan 3, 2022
177bfc8
Working on adaptive backward pooling test
piotrwolinski-intel Jan 10, 2022
a8bfea1
Changed SupportDNNLAveragePooling function so that calculations are n…
piotrwolinski-intel Jan 10, 2022
592a158
Added convolution before pooling in test_adaptive_pooling in test_dnn…
piotrwolinski-intel Jan 10, 2022
099cba3
Added whitespaces in test_adaptive_pooling
piotrwolinski-intel Jan 11, 2022
20dffdd
Modified SupportDNNLAveragePooling and UseAdaptivePaddingKernel to op…
piotrwolinski-intel Jan 11, 2022
401934a
Added num_filter as a parameter in test_adaptive_pooling
piotrwolinski-intel Jan 12, 2022
5a2ee58
Finished test_adaptive_pooling with added convolution before pooling
piotrwolinski-intel Jan 17, 2022
ecc2a6f
Added formatting
piotrwolinski-intel Jan 17, 2022
2c3c6f7
Added additional include
piotrwolinski-intel Jan 17, 2022
1944422
Fixed path in include
piotrwolinski-intel Jan 17, 2022
7ec285a
Fixed wrong conditions with backward storage type
piotrwolinski-intel Jan 17, 2022
f8a40a8
Changed test_adaptive_pooling to make use of pytest parameters
piotrwolinski-intel Jan 18, 2022
ae4d43a
Removed brackets and unnecessary comments
piotrwolinski-intel Jan 24, 2022
04a89ca
Implemented changes suggested in review
piotrwolinski-intel Feb 15, 2022
7ea5b99
Formatted diff
piotrwolinski-intel Feb 15, 2022
8ea761d
Reformatted diff
piotrwolinski-intel Feb 15, 2022
3 changes: 2 additions & 1 deletion src/operator/contrib/adaptive_avg_pooling-inl.h
@@ -95,6 +95,7 @@ inline void AdaptiveAvgPoolOpBackward(const nnvm::NodeAttrs& attrs,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 1U);
CHECK_EQ(outputs.size(), 1U);

mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
if (IsWriting(req[0])) {
// zero grad before backwarding
@@ -116,7 +117,7 @@ static bool AdaptiveAvgPoolOpInferShape(const nnvm::NodeAttrs& attrs,
if (mxnet::op::shape_is_none(dshape)) {
return false;
}
if (param.output_size.has_value()) {
if (param.IsAdaptivePooling()) {
if (param.output_size.value().ndim() == 1) {
dshape[2] = param.output_size.value()[0];
dshape[3] = param.output_size.value()[0];
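In the shape-inference hunk above, the explicit param.output_size.has_value() check is replaced by param.IsAdaptivePooling(). The helper itself is not part of the hunks shown in this diff; a minimal, purely hypothetical sketch, assuming the member does nothing more than wrap the old output_size check, could look like this:

#include <optional>
#include <vector>

// Hypothetical, simplified stand-in for PoolingParam; the real parameter
// struct lives in src/operator/nn/pooling-inl.h and is not shown in this PR.
struct PoolingParamSketch {
  // Desired spatial output size; present only for adaptive pooling.
  std::optional<std::vector<int>> output_size;

  // Assumption: adaptive pooling is requested exactly when an explicit
  // output size was given, i.e. the old output_size.has_value() condition.
  bool IsAdaptivePooling() const {
    return output_size.has_value();
  }
};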
150 changes: 108 additions & 42 deletions src/operator/contrib/adaptive_avg_pooling.cc
@@ -25,11 +25,13 @@
// #include "elemwise_op_common.h"
#include "../elemwise_op_common.h"
#if MXNET_USE_ONEDNN == 1
#include "../nn/dnnl/dnnl_base-inl.h"
#include "../nn/dnnl/dnnl_pooling-inl.h"
#endif // MXNET_USE_ONEDNN

#define START_IND(a, b, c) static_cast<int>(std::floor(static_cast<float>(a * c) / b))
#define END_IND(a, b, c) static_cast<int>(std::ceil(static_cast<float>((a + 1) * c) / b))
#define DIV_ROUND_UP(a, b) ((a + (b - 1)) / b)

namespace mxnet {
namespace op {
@@ -169,39 +171,6 @@ void AdaptiveAvgPoolUpdateOutput(mshadow::Stream<cpu>* s,
}
}

template <typename xpu, typename DType, typename AccReal>
void AdaptiveAvgPoolUpdateGradInput(mshadow::Stream<cpu>* s,
const std::vector<TBlob>& input,
const std::vector<TBlob>& output) {
Tensor<xpu, 4, DType> gradOut = input[0].get<xpu, 4, DType>(s);
Tensor<xpu, 4, DType> gradIn = output[0].get<xpu, 4, DType>(s);

DType* gradOutput_data = gradOut.dptr_;
DType* gradInput_data = gradIn.dptr_;

int64_t sizeB = gradIn.size(0);
int64_t sizeD = gradIn.size(1);
int64_t isizeH = gradIn.size(2);
int64_t isizeW = gradIn.size(3);

int64_t osizeH = gradOut.size(2);
int64_t osizeW = gradOut.size(3);

int64_t b;
#pragma omp parallel for private(b) \
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (b = 0; b < sizeB; b++) {
SpatialAdaptiveAveragePooling_updateGradInput_frame<DType>(
gradInput_data + b * sizeD * isizeH * isizeW,
gradOutput_data + b * sizeD * osizeH * osizeW,
sizeD,
isizeH,
isizeW,
osizeH,
osizeW);
}
}

#if MXNET_USE_ONEDNN == 1
bool SupportDNNLAveragePooling(const NDArray& in_data, const NDArray& out_data) {
for (int64_t idx = 2; idx < in_data.shape().ndim(); ++idx) {
@@ -218,15 +187,44 @@ bool SupportDNNLAveragePooling(const NDArray& in_data, const NDArray& out_data)
const int IW = in_data.shape()[3];
const int OH = out_data.shape()[2];
const int OW = out_data.shape()[3];
const int strides_H = floor((IH << 1) / OH) - floor(IH / OH);
const int strides_W = floor((IW << 1) / OW) - floor(IW / OW);
const int kernel_H = ceil((IH << 1) / OH) - floor(IH / OH);
const int kernel_W = ceil((IW << 1) / OW) - floor(IW / OW);
const int strides_H = ((IH << 1) / OH) - (IH / OH);
const int strides_W = ((IW << 1) / OW) - (IW / OW);
const int kernel_H = DIV_ROUND_UP((IH << 1) / OH, 1) - (IH / OH);
const int kernel_W = DIV_ROUND_UP((IW << 1) / OW, 1) - (IW / OW);
const int pad_l_top = (strides_H * (OH - 1) + kernel_H - IH) / 2;
const int pad_l_left = (strides_W * (OW - 1) + kernel_W - IW) / 2;
return pad_l_top == 0 && pad_l_left == 0;
}

void AdaptiveAvgPoolOpBackwardExCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 1U);

if (SupportDNNLAveragePooling(outputs[0], inputs[0])) {
DNNL_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
DNNLRun(DNNLPoolingGradCompute, attrs, ctx, inputs, req, outputs);
DNNL_OPCHECK_RUN(AdaptiveAvgPoolOpBackward<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
FallBackCompute(AdaptiveAvgPoolOpBackward<cpu>, attrs, ctx, inputs, req, outputs);
}

inline static bool BackwardAdaptivePoolingStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 1);
CHECK_EQ(out_attrs->size(), 1);

// support_dnnl is set to true, because at this point there is no way
// to check if DNNLAdaptivePooling is supported
return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}

void AdaptiveAvgPoolComputeExCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
@@ -238,18 +236,62 @@ void AdaptiveAvgPoolComputeExCPU(const nnvm::NodeAttrs& attrs,
oneDNN doesn't support adaptive pooling.
Fallback is needed when the padding is not equal to 0.
*/
const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
if (SupportDNNL(inputs[0]) && SupportDNNLAveragePooling(inputs[0], outputs[0])) {
const NDArray* workspace = nullptr;
DNNL_OPCHECK_INIT(false, 1, inputs, outputs);
DNNLPoolingCompute(ctx, param, inputs[0], req[0], outputs[0], workspace, true);
DNNLRun(DNNLPoolingCompute, attrs, ctx, inputs, req, outputs);
DNNL_OPCHECK_RUN(PoolingCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
FallBackCompute(AdaptiveAvgPoolOpForward<cpu>, attrs, ctx, inputs, req, outputs);
}

inline static bool AdaptivePoolingStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 1);
CHECK_EQ(out_attrs->size(), 1);

// support_dnnl is set to true, because at this point there is no way
// to check if DNNLAdaptivePooling is supported
return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}
#endif

template <typename xpu, typename DType, typename AccReal>
void AdaptiveAvgPoolUpdateGradInput(mshadow::Stream<cpu>* s,
const std::vector<TBlob>& input,
const std::vector<TBlob>& output) {
Tensor<xpu, 4, DType> gradOut = input[0].get<xpu, 4, DType>(s);
Tensor<xpu, 4, DType> gradIn = output[0].get<xpu, 4, DType>(s);

DType* gradOutput_data = gradOut.dptr_;
DType* gradInput_data = gradIn.dptr_;

int64_t sizeB = gradIn.size(0);
int64_t sizeD = gradIn.size(1);
int64_t isizeH = gradIn.size(2);
int64_t isizeW = gradIn.size(3);

int64_t osizeH = gradOut.size(2);
int64_t osizeW = gradOut.size(3);

int64_t b;
#pragma omp parallel for private(b) \
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (b = 0; b < sizeB; b++) {
SpatialAdaptiveAveragePooling_updateGradInput_frame<DType>(
gradInput_data + b * sizeD * isizeH * isizeW,
gradOutput_data + b * sizeD * osizeH * osizeW,
sizeD,
isizeH,
isizeW,
osizeH,
osizeW);
}
}

NNVM_REGISTER_OP(_contrib_AdaptiveAvgPooling2D)
.describe(R"code(
Applies a 2D adaptive average pooling over a 4D input with the shape of (NCHW).
@@ -262,25 +304,49 @@ The pooling kernel and stride sizes are automatically chosen for desired output
(N x C x height x width) for any input (NCHW).

)code" ADD_FILELINE)
.set_attr_parser(ParamParser<PoolingParam>)
.set_attr_parser(PoolingParamParser)
.set_num_inputs(1)
.set_num_outputs(1)
.set_attr<mxnet::FInferShape>("FInferShape", AdaptiveAvgPoolOpInferShape)
.set_attr<FCompute>("FCompute<cpu>", AdaptiveAvgPoolOpForward<cpu>)
.set_attr<nnvm::FGradient>("FGradient",
ElemwiseGradUseNone{"_backward_contrib_AdaptiveAvgPooling2D"})
#if MXNET_USE_ONEDNN == 1
.set_attr<FInferStorageType>("FInferStorageType", AdaptivePoolingStorageType)
.set_attr<bool>("TIsDNNL", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", AdaptiveAvgPoolComputeExCPU)
#endif
.add_argument("data", "NDArray-or-Symbol", "Input data")
.add_arguments(PoolingParam::__FIELDS__());

NNVM_REGISTER_OP(_backward_contrib_AdaptiveAvgPooling2D)
.set_attr_parser(ParamParser<PoolingParam>)
.set_attr_parser(PoolingParamParser)
.set_num_inputs(1)
.set_num_outputs(1)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
#if MXNET_USE_ONEDNN == 1
.set_attr<FInferStorageType>("FInferStorageType", BackwardAdaptivePoolingStorageType)
// Different backends require different FInplaceOption
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
const PoolingParam& param =
nnvm::get<PoolingParam>(attrs.parsed);
if (DNNLRequireWorkspace(param) && param.IsAdaptivePooling())
return std::vector<std::pair<int, int>>{{1, 0}};
return std::vector<std::pair<int, int>>();
})
.set_attr<FResourceRequest>("FResourceRequest",
[](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr<bool>("TIsDNNL", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", AdaptiveAvgPoolOpBackwardExCPU)
#else
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int>>();
})
#endif
.set_attr<FCompute>("FCompute<cpu>", AdaptiveAvgPoolOpBackward<cpu>);

} // namespace op
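SupportDNNLAveragePooling and UseAdaptivePaddingKernel both re-express the adaptive pooling as an ordinary average pooling: a stride, kernel and padding are derived from the input and output spatial sizes, and the oneDNN primitive is used only when the derived top/left padding is zero; otherwise the operator falls back to AdaptiveAvgPoolOpForward/Backward. Below is a standalone sketch of that arithmetic, assuming the intended math follows the START_IND/END_IND convention defined at the top of this file (output window a covers [floor(a*I/O), ceil((a+1)*I/O))); the helper names are illustrative and not part of the MXNet API.

#include <cstdio>

// Illustrative helpers only, not MXNet code. They mirror the padding check in
// SupportDNNLAveragePooling: stride = floor(2I/O) - floor(I/O),
// kernel = ceil(2I/O) - floor(I/O), pad = (stride*(O-1) + kernel - I) / 2.
struct DerivedPooling {
  int kernel;
  int stride;
  int pad;  // top/left padding a regular pooling would need
};

DerivedPooling DeriveAdaptiveAsRegular(int input_size, int output_size) {
  const int I      = input_size;
  const int O      = output_size;
  const int stride = (2 * I) / O - I / O;                  // floor(2I/O) - floor(I/O)
  const int kernel = (2 * I + O - 1) / O - I / O;          // ceil(2I/O)  - floor(I/O)
  const int pad    = (stride * (O - 1) + kernel - I) / 2;  // pad_l_top / pad_l_left
  return {kernel, stride, pad};
}

int main() {
  // 32 -> 4: stride = 16 - 8 = 8, kernel = 16 - 8 = 8, pad = (8*3 + 8 - 32)/2 = 0.
  // The padding is zero, so SupportDNNLAveragePooling accepts this shape pair
  // and the oneDNN pooling primitive is used.
  DerivedPooling p = DeriveAdaptiveAsRegular(32, 4);
  std::printf("32->4: kernel=%d stride=%d pad=%d\n", p.kernel, p.stride, p.pad);

  // 10 -> 4: stride = 5 - 2 = 3, kernel = 5 - 2 = 3, pad = (3*3 + 3 - 10)/2 = 1.
  // Non-zero padding, so the operator falls back to the native
  // AdaptiveAvgPoolOpForward / AdaptiveAvgPoolOpBackward implementation.
  p = DeriveAdaptiveAsRegular(10, 4);
  std::printf("10->4: kernel=%d stride=%d pad=%d\n", p.kernel, p.stride, p.pad);
  return 0;
}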
68 changes: 38 additions & 30 deletions src/operator/nn/dnnl/dnnl_pooling-inl.h
@@ -28,10 +28,13 @@

#include <dnnl.hpp>
#include <utility>
#include <vector>

#include "../pooling-inl.h"
#include "./dnnl_base-inl.h"

#define DIV_ROUND_UP(a, b) ((a + (b - 1)) / b)

namespace mxnet {
namespace op {

@@ -54,7 +57,8 @@ class DNNLPoolingFwd {
void Execute(const NDArray& in_data,
const OpReqType req,
const NDArray& out_data,
const NDArray* workspace);
const NDArray* workspace,
const bool use_adaptive_pooling);

private:
bool with_workspace_;
@@ -92,19 +96,19 @@ void UseAdaptivePaddingKernel(T* kernel,
T* strides,
T* pad_l,
T* pad_r,
const NDArray& in_data,
const NDArray& out_data) {
const int IH = in_data.shape()[2];
const int IW = in_data.shape()[3];
const int OH = out_data.shape()[2];
const int OW = out_data.shape()[3];

strides->at(0) = floor((IH << 1) / OH) - floor(IH / OH);
strides->at(1) = floor((IW << 1) / OW) - floor(IW / OW);
kernel->at(0) = ceil((IH << 1) / OH) - floor(IH / OH);
kernel->at(1) = ceil((IW << 1) / OW) - floor(IW / OW);
pad_l->at(0) = (strides->at(0) * (OH - 1) + kernel->at(0) - IH) >> 1;
pad_l->at(1) = (strides->at(1) * (OW - 1) + kernel->at(1) - IW) >> 1;
const mxnet::TShape& input_shape,
const mxnet::TShape& output_shape) {
const int IH = input_shape[2];
const int IW = input_shape[3];
const int OH = output_shape[2];
const int OW = output_shape[3];

strides->at(0) = ((IH << 1) / OH) - (IH / OH);
strides->at(1) = ((IW << 1) / OW) - (IW / OW);
kernel->at(0) = DIV_ROUND_UP((IH << 1) / OH, 1) - (IH / OH);
kernel->at(1) = DIV_ROUND_UP((IW << 1) / OW, 1) - (IW / OW);
pad_l->at(0) = (strides->at(0) * (OH - 1) + kernel->at(0) - IH) / 2;
pad_l->at(1) = (strides->at(1) * (OW - 1) + kernel->at(1) - IW) / 2;
}

inline int GetPaddingSizeFull(dim_t x, int padl, int padr, int k, int s) {
@@ -168,31 +172,35 @@ inline bool SupportDNNLPooling(const PoolingParam& param, const NDArray& input)
}

inline bool DNNLRequireWorkspace(const PoolingParam& param) {
return param.pool_type != pool_enum::kAvgPooling;
return param.pool_type != pool_enum::kAvgPooling && !param.IsAdaptivePooling();
}

typedef ParamOpSign<PoolingParam> DNNLPoolingSignature;
void DNNLPoolingCompute(const OpContext& ctx,
const PoolingParam& param,
const NDArray& in_data,
const OpReqType req,
const NDArray& out_data,
const NDArray* workspace,
const bool use_adaptive_pooling);

void DNNLPoolingGradCompute(const OpContext& ctx,
const PoolingParam& param,
const NDArray& out_grad,
const NDArray& in_data,
const NDArray* workspace,
const OpReqType req,
const NDArray& in_grad);

DNNLPoolingFwd& GetPoolingFwd(const PoolingParam& param,
const bool is_train,
const NDArray& data,
const NDArray& output,
const bool use_adaptive_pooling);

DNNLPoolingBwd& GetPoolingBwd(const PoolingParam& param,
const NDArray& in_data,
const NDArray& in_grad,
const NDArray& out_grad,
const bool use_adaptive_pooling);

void DNNLPoolingGradCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs);

void DNNLPoolingCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& in_data,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& out_data);

} // namespace op
} // namespace mxnet
#endif // MXNET_USE_ONEDNN == 1
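At the bottom of the header, DNNLPoolingCompute and DNNLPoolingGradCompute lose their per-array signatures (ctx, param, in_data, req, out_data, workspace, use_adaptive_pooling) and are re-declared with the standard FComputeEx signature; this is what lets adaptive_avg_pooling.cc dispatch them through DNNLRun and DNNL_OPCHECK_RUN. The implementations live in dnnl_pooling.cc and are not part of this diff; a rough, hypothetical sketch of how such an entry point could map the NDArray vectors back onto the declarations above is:

// Hypothetical adapter for illustration only; the real DNNLPoolingCompute in
// src/operator/nn/dnnl/dnnl_pooling.cc is not shown in this PR's hunks.
void DNNLPoolingComputeSketch(const nnvm::NodeAttrs& attrs,
                              const OpContext& ctx,
                              const std::vector<NDArray>& inputs,
                              const std::vector<OpReqType>& req,
                              const std::vector<NDArray>& outputs) {
  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
  const bool use_adaptive   = param.IsAdaptivePooling();

  // A workspace output exists only for pooling types that need one; with this
  // PR, DNNLRequireWorkspace explicitly excludes adaptive pooling, and the
  // adaptive operator is registered with a single output anyway.
  const NDArray* workspace =
      (DNNLRequireWorkspace(param) && outputs.size() > 1) ? &outputs[1] : nullptr;

  DNNLPoolingFwd& fwd =
      GetPoolingFwd(param, ctx.is_train, inputs[0], outputs[0], use_adaptive);
  fwd.Execute(inputs[0], req[0], outputs[0], workspace, use_adaptive);
}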