-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Fix transposed convolution in CPU w/o MKLDNN. #14031
Changes from 11 commits
9fe0589
19dfcb5
747df6c
854cff2
da75280
20ae427
926cfd7
c49dbe1
afd75d1
5b59097
d1554c1
a9da95e
1268457
953dd95
fabc318
6322da9
4b36009
01e371e
51ed635
a649f67
a6b0b9e
3240833
27f2033
88892d2
aa3c4dd
1580ba0
0675b3b
f403b9c
bdbf81d
2c92980
0c44ec8
2c868fd
5dacddc
29c4488
5f3c881
db3aaef
2181f80
424f36d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,6 +36,7 @@ | |
#include <utility> | ||
#include "../operator_common.h" | ||
#include "../linalg.h" | ||
#include "./im2col.h" | ||
|
||
|
||
namespace mxnet { | ||
|
@@ -118,7 +119,7 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> { | |
} | ||
|
||
template<size_t ndim> | ||
void InferPad(TShape input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim] ) const { | ||
void InferPad(const TShape &input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim]) const { | ||
// Modified by Li.bs | ||
// Use tag to control the calculation of pad | ||
bool bCal = false; | ||
|
@@ -226,22 +227,24 @@ class DeconvolutionOp { | |
CHECK_EQ(in_data.size(), expected); | ||
CHECK_EQ(out_data.size(), 1U); | ||
Stream<xpu> *s = ctx.get_stream<xpu>(); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
auto in_data_shape = in_data[deconv::kData].shape_; | ||
Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s); | ||
Tensor<xpu, 4, DType> out = TBlobTo4DTensor(out_data[deconv::kOut], s); | ||
index_t o_pad[2], o_adj[2]; | ||
if (param_.kernel.ndim() == 2) { | ||
param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); | ||
} else { | ||
index_t o_pad_1D[1], o_adj_1D[1]; | ||
param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); | ||
o_pad[0] = 0; | ||
o_pad[1] = o_pad_1D[0]; | ||
o_adj[0] = 0; | ||
o_adj[1] = o_adj_1D[0]; | ||
param_.InferPad({in_data_shape[2]}, o_pad, o_adj); | ||
} | ||
|
||
auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); | ||
auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); | ||
auto padding = param_.kernel.ndim() == 2 ? | ||
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); | ||
auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); | ||
auto kernel_size = kernel.Size(); | ||
|
||
|
@@ -251,76 +254,57 @@ class DeconvolutionOp { | |
param_.num_filter / param_.num_group * kernel_size); | ||
Tensor<xpu, 3, DType> wmat = | ||
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
const index_t nbatch = data.size(0); | ||
Tensor<xpu, 1, DType> workspace = | ||
ctx.requested[deconv::kTempSpace].get_space_typed<xpu, 1, DType>( | ||
Shape1(this->InitTemp(out.shape_, data.shape_)), s); | ||
for (index_t i = 0; i < nbatch; i += nstep_) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you know what "nstep_" was doing earlier? It would help to understand the problem with the earlier code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
const index_t step = std::min(nstep_, nbatch - i); | ||
for (index_t i = 0; i < nbatch; ++i) { | ||
// temp_col: (N * kernel_size, OW * OH) | ||
Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>( | ||
workspace.dptr_, | ||
Shape2(shape_colunit_[0], | ||
shape_colunit_[1] * step), s); | ||
Shape2(shape_colunit_[0], shape_colunit_[1]), | ||
s); | ||
// temp_dst: (N, N/n_grup, OW * OH) | ||
Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>( | ||
workspace.dptr_ + temp_col.shape_.Size(), | ||
Shape3(shape_dstunit_[0], | ||
shape_dstunit_[1], | ||
shape_dstunit_[2] * step), s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); | ||
if (o_pad[0] == 0 && o_pad[1] == 0) { | ||
temp_col = unpack_patch2col(out.Slice(i, i + step), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} else { | ||
temp_col = unpack_patch2col(pad(out.Slice(i, i + step), | ||
o_pad[0], o_pad[1]), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} | ||
shape_dstunit_[1], | ||
shape_dstunit_[2]), | ||
s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); | ||
|
||
im2col( | ||
s, | ||
(out.Slice(i, i + 1)).dptr_, | ||
out.shape_, | ||
temp_col.shape_, | ||
kernel, | ||
padding, | ||
stride, | ||
dilate, | ||
temp_col.dptr_); | ||
|
||
const index_t gstride = temp_col.size(0) / param_.num_group; | ||
for (uint32_t gid = 0; gid < param_.num_group; ++gid) { | ||
mshadow::Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, | ||
gstride * (gid + 1)); | ||
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); | ||
// Legacy approach shown here for comparison: | ||
// tmpc = dot(wmat[gid].T(), temp_dst[gid]); | ||
// tmpc = dot(wmat[gid].T(), temp_dst[gid]); | ||
linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); | ||
} | ||
if (o_pad[0] == 0 && o_pad[1] == 0) { | ||
out.Slice(i, i + step) = pack_col2patch(temp_col, | ||
out.Slice(i, i + step).shape_, | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} else { | ||
Shape<4> pshape = out.Slice(i, i + step).shape_; | ||
pshape[2] += 2 * o_pad[0]; | ||
pshape[3] += 2 * o_pad[1]; | ||
out.Slice(i, i + step) = crop(pack_col2patch(temp_col, | ||
pshape, | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]), | ||
out[i][0].shape_); | ||
} | ||
|
||
col2im( | ||
s, | ||
temp_col.dptr_, | ||
out.Slice(i, i + 1).shape_, | ||
temp_col.shape_, | ||
kernel, | ||
padding, | ||
stride, | ||
dilate, | ||
out.Slice(i, i + 1).dptr_, | ||
req[deconv::kOut]); | ||
} | ||
|
||
if (!param_.no_bias) { | ||
// add bias, broadcast bias to dim 1: channel | ||
Tensor<xpu, 1, DType> bias = in_data[deconv::kBias].get<xpu, 1, DType>(s); | ||
|
@@ -344,24 +328,24 @@ class DeconvolutionOp { | |
CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); | ||
// get data | ||
Stream<xpu> *s = ctx.get_stream<xpu>(); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
auto in_data_shape = in_data[deconv::kData].shape_; | ||
Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s); | ||
Tensor<xpu, 4, DType> grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); | ||
Tensor<xpu, 4, DType> gdata = TBlobTo4DTensor(in_grad[deconv::kData], s); | ||
|
||
index_t o_pad[2], o_adj[2]; | ||
if (param_.kernel.ndim() == 2) { | ||
param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); | ||
} else { | ||
index_t o_pad_1D[1], o_adj_1D[1]; | ||
param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); | ||
o_pad[0] = 0; | ||
o_pad[1] = o_pad_1D[0]; | ||
o_adj[0] = 0; | ||
o_adj[1] = o_adj_1D[0]; | ||
param_.InferPad({in_data_shape[2]}, o_pad, o_adj); | ||
} | ||
auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); | ||
auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); | ||
auto padding = param_.kernel.ndim() == 2 ? | ||
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); | ||
auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); | ||
auto kernel_size = kernel.Size(); | ||
|
||
|
@@ -373,55 +357,46 @@ class DeconvolutionOp { | |
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s); | ||
Tensor<xpu, 3, DType> gwmat = | ||
in_grad[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
|
||
const index_t nbatch = data.size(0); | ||
Tensor<xpu, 1, DType> workspace = | ||
ctx.requested[deconv::kTempSpace].get_space_typed<xpu, 1, DType>( | ||
Shape1(this->InitTemp(grad.shape_, data.shape_)), s); | ||
for (index_t i = 0; i < nbatch; i += nstep_) { | ||
const index_t step = std::min(nstep_, nbatch - i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, can you tell what the purpose of "step" was in the previous code? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it was used to convert multiple batches of image data into columns in the previous library. However, it is not supported in the |
||
for (index_t i = 0; i < nbatch; ++i) { | ||
Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>( | ||
workspace.dptr_, | ||
Shape2(shape_colunit_[0], | ||
shape_colunit_[1] * step), s); | ||
Shape2(shape_colunit_[0], shape_colunit_[1]), | ||
s); | ||
Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>( | ||
workspace.dptr_ + temp_col.shape_.Size(), | ||
Shape3(shape_dstunit_[0], | ||
shape_dstunit_[1], | ||
shape_dstunit_[2] * step), s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); | ||
if (o_pad[0] == 0 && o_pad[1] == 0) { | ||
temp_col = unpack_patch2col(grad.Slice(i, i + step), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} else { | ||
temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), o_pad[0], o_pad[1]), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} | ||
shape_dstunit_[1], | ||
shape_dstunit_[2]), | ||
s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); | ||
|
||
im2col( | ||
s, | ||
(grad.Slice(i, i + 1)).dptr_, | ||
grad.shape_, | ||
temp_col.shape_, | ||
kernel, | ||
padding, | ||
stride, | ||
dilate, | ||
temp_col.dptr_); | ||
|
||
const index_t gstride = temp_col.size(0) / param_.num_group; | ||
for (uint32_t gid = 0; gid < param_.num_group; ++gid) { | ||
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); | ||
if (i == 0) { | ||
Tensor<xpu, 2, DType> tmp_gwmat = gwmat[gid]; | ||
// Legacy approach shown here for comparison: | ||
// Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); | ||
// Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); | ||
linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]); | ||
} else { | ||
// Legacy approach shown here for comparison: | ||
// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); | ||
// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); | ||
linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); | ||
} | ||
} | ||
|
@@ -431,16 +406,16 @@ class DeconvolutionOp { | |
for (uint32_t gid = 0; gid < param_.num_group; ++gid) { | ||
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); | ||
// Legacy approach shown here for comparison: | ||
// temp_dst[gid] = dot(wmat[gid], tmpc); | ||
// temp_dst[gid] = dot(wmat[gid], tmpc); | ||
linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); | ||
} | ||
Assign(gdata.Slice(i, i + step), | ||
Assign(gdata.Slice(i, i + 1), | ||
req[deconv::kData], | ||
(swapaxis<1, 0>(reshape(temp_dst, | ||
mshadow::Shape4(gdata.shape_[1], | ||
step, | ||
gdata.size(2), | ||
gdata.size(3)))))); | ||
Shape4(gdata.shape_[1], | ||
1, | ||
gdata.size(2), | ||
gdata.size(3)))))); | ||
} | ||
} | ||
if (!param_.no_bias) { | ||
|
@@ -458,17 +433,12 @@ class DeconvolutionOp { | |
shape_dstunit_ = mshadow::Shape3(param_.num_group, | ||
oshape[1] / param_.num_group, | ||
oshape[2] * oshape[3]); | ||
// See convolution for workspace calculations. nstep_ will be the effective batch size | ||
nstep_ = std::max<index_t>( | ||
std::min(static_cast<index_t>(param_.workspace) / | ||
(shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), | ||
1); | ||
|
||
mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], | ||
shape_colunit_[1] * nstep_); | ||
shape_colunit_[1]); | ||
mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], | ||
shape_dstunit_[1], | ||
shape_dstunit_[2] * nstep_); | ||
shape_dstunit_[2]); | ||
index_t required_size = scol.Size() + sdst.Size(); | ||
return required_size; | ||
} | ||
|
@@ -485,7 +455,6 @@ class DeconvolutionOp { | |
DeconvolutionParam param_; | ||
mshadow::Shape<2> shape_colunit_; | ||
mshadow::Shape<3> shape_dstunit_; | ||
index_t nstep_; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please tell me why was this removed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
}; // class DeconvolutionOp | ||
|
||
template<typename xpu> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -503,6 +503,40 @@ def test_deconv(): | |
# layer = nn.Conv3DTranspose(16, (3, 3, 3), layout='NDHWC', in_channels=4) | ||
# # check_layer_forward(layer, (1, 10, 10, 10, 4)) | ||
|
||
@with_seed() | ||
def test_deconv_dilation(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since deconv is a really important OP, I suggest to visit the original deconv test cases and add dilation > 1 cases alongside the old tests. This ensures better coverage than this single test case. |
||
data = mx.nd.array((((0,0,0), | ||
(0,1,0), | ||
(0,0,0)), | ||
((0,0,0), | ||
(0,2,0), | ||
(0,0,0)))) | ||
|
||
kernel = mx.nd.array(((1,2,3), | ||
(4,5,6), | ||
(7,8,9))) | ||
|
||
data_batch = data.expand_dims(1) | ||
weight = kernel.expand_dims(0).expand_dims(0) | ||
layer = nn.Conv2DTranspose(in_channels=1, channels=1, | ||
kernel_size=(3,3), padding=(1,1), | ||
strides=(1,1), dilation=(2,2)) | ||
layer.initialize() | ||
layer.weight.set_data(weight) | ||
out = layer(data_batch).asnumpy() | ||
expected = np.array([[[[1.,0.,2.,0.,3.], | ||
[0.,0.,0.,0.,0.], | ||
[4.,0.,5.,0.,6.], | ||
[0.,0.,0.,0.,0.], | ||
[7.,0.,8.,0.,9.]]], | ||
[[[2.,0.,4.,0.,6.], | ||
[0.,0.,0.,0.,0.], | ||
[8.,0.,10.,0.,12.], | ||
[0.,0.,0.,0.,0.], | ||
[14.,0.,16.,0.,18.]]] | ||
]) | ||
assert_almost_equal(out, expected) | ||
|
||
|
||
@with_seed() | ||
def test_pool(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"cuBLAS" is the official abbreviation :)