This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Fix transposed convolution in CPU w/o MKLDNN. #14031

Closed
wants to merge 38 commits
Changes from 11 commits · 38 commits total
9fe0589
replace with im2col/col2im functions
apeforest Jan 30, 2019
19dfcb5
fixed padding problem in transpose conv forward
apeforest Jan 30, 2019
747df6c
fix backward deconvolution
apeforest Jan 30, 2019
854cff2
refactor
apeforest Jan 31, 2019
da75280
Merge branch 'master' into bugfix/conv2dtran
apeforest Jan 31, 2019
20ae427
fix lint
apeforest Jan 31, 2019
926cfd7
fix unit test, remove step in deconv
apeforest Jan 31, 2019
c49dbe1
add unit test
apeforest Jan 31, 2019
afd75d1
refactor
apeforest Jan 31, 2019
5b59097
fix build error
apeforest Jan 31, 2019
d1554c1
Revert "Aggregate SGD (#13346)"
apeforest Jan 28, 2019
a9da95e
Merge remote-tracking branch 'upstream/master' into bugfix/conv2dtran
apeforest Jan 31, 2019
1268457
Merge remote-tracking branch 'upstream/master'
apeforest Feb 5, 2019
953dd95
Merge remote-tracking branch 'upstream/master'
apeforest Feb 12, 2019
fabc318
Revert "Aggregate SGD (#13346)"
apeforest Jan 28, 2019
6322da9
Merge branch 'master' of https://github.com/apeforest/incubator-mxnet
apeforest Feb 13, 2019
4b36009
Merge branch 'master' into bugfix/conv2dtran
Feb 13, 2019
01e371e
Merge remote-tracking branch 'upstream/master' into bugfix/conv2dtran
apeforest Feb 13, 2019
51ed635
Merge remote-tracking branch 'upstream/master' into bugfix/conv2dtran
apeforest Feb 14, 2019
a649f67
Revert "Revert "Aggregate SGD (#13346)""
apeforest Feb 14, 2019
a6b0b9e
add comments
apeforest Feb 16, 2019
3240833
fix lint
apeforest Feb 16, 2019
27f2033
Merge remote-tracking branch 'upstream/master' into bugfix/conv2dtran
apeforest Mar 15, 2019
88892d2
fix lint error
apeforest Mar 15, 2019
aa3c4dd
Merge remote-tracking branch 'upstream/master' into bugfix/conv2dtran
apeforest Mar 20, 2019
1580ba0
Merge branch 'master' into bugfix/conv2dtran
apeforest Apr 24, 2019
0675b3b
fix a bug in calling im2col (col_shape should be 3)
apeforest Apr 29, 2019
f403b9c
fix im2col parameter mismatch
apeforest May 3, 2019
bdbf81d
add debug
apeforest May 6, 2019
2c92980
set col_buffer_shape
apeforest May 10, 2019
0c44ec8
dump data from gpu to cpu to debug
apeforest May 10, 2019
2c868fd
debug
apeforest May 10, 2019
5dacddc
debug
apeforest May 10, 2019
29c4488
update function call to col2im
apeforest May 13, 2019
5f3c881
fix backward pass
apeforest May 13, 2019
db3aaef
comment out debug message
apeforest May 13, 2019
2181f80
Merge remote-tracking branch 'upstream/master' into bugfix/conv2dtran
apeforest May 13, 2019
424f36d
fix bug in backward
apeforest May 16, 2019
195 changes: 82 additions & 113 deletions src/operator/nn/deconvolution-inl.h
@@ -36,6 +36,7 @@
#include <utility>
#include "../operator_common.h"
#include "../linalg.h"
#include "./im2col.h"


namespace mxnet {
@@ -118,7 +119,7 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> {
}

template<size_t ndim>
void InferPad(TShape input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim] ) const {
void InferPad(const TShape &input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim]) const {
// Modified by Li.bs
// Use tag to control the calculation of pad
bool bCal = false;
@@ -226,22 +227,24 @@ class DeconvolutionOp {
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
Stream<xpu> *s = ctx.get_stream<xpu>();
#if defined(__CUDACC__)
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
<< "Must init CuBLAS handle in stream";
Contributor: "cuBLAS" is the official abbreviation :)

#endif
auto in_data_shape = in_data[deconv::kData].shape_;
Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s);
Tensor<xpu, 4, DType> out = TBlobTo4DTensor(out_data[deconv::kOut], s);
index_t o_pad[2], o_adj[2];
if (param_.kernel.ndim() == 2) {
param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj);
} else {
index_t o_pad_1D[1], o_adj_1D[1];
param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D);
o_pad[0] = 0;
o_pad[1] = o_pad_1D[0];
o_adj[0] = 0;
o_adj[1] = o_adj_1D[0];
param_.InferPad({in_data_shape[2]}, o_pad, o_adj);
}

auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
auto padding = param_.kernel.ndim() == 2 ?
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]});
auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]});
auto kernel_size = kernel.Size();

@@ -251,76 +254,57 @@ class DeconvolutionOp {
param_.num_filter / param_.num_group * kernel_size);
Tensor<xpu, 3, DType> wmat =
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
#if defined(__CUDACC__)
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
<< "Must init CuBLAS handle in stream";
#endif
const index_t nbatch = data.size(0);
Tensor<xpu, 1, DType> workspace =
ctx.requested[deconv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(this->InitTemp(out.shape_, data.shape_)), s);
for (index_t i = 0; i < nbatch; i += nstep_) {
Contributor: Do you know what "nstep_" was doing earlier? It would help to understand the problem with the earlier code.
apeforest (author, Mar 6, 2019): The col2im method does not support such a step.

const index_t step = std::min(nstep_, nbatch - i);
for (index_t i = 0; i < nbatch; ++i) {
// temp_col: (N * kernel_size, OW * OH)
Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>(
workspace.dptr_,
Shape2(shape_colunit_[0],
shape_colunit_[1] * step), s);
Shape2(shape_colunit_[0], shape_colunit_[1]),
s);
// temp_dst: (N, N/num_group, OW * OH)
Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>(
workspace.dptr_ + temp_col.shape_.Size(),
Shape3(shape_dstunit_[0],
shape_dstunit_[1],
shape_dstunit_[2] * step), s);
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
if (o_pad[0] == 0 && o_pad[1] == 0) {
temp_col = unpack_patch2col(out.Slice(i, i + step),
kernel[0],
kernel[1],
stride[0],
stride[1],
dilate[0],
dilate[1]);
} else {
temp_col = unpack_patch2col(pad(out.Slice(i, i + step),
o_pad[0], o_pad[1]),
kernel[0],
kernel[1],
stride[0],
stride[1],
dilate[0],
dilate[1]);
}
shape_dstunit_[1],
shape_dstunit_[2]),
s);
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_);

im2col(
s,
(out.Slice(i, i + 1)).dptr_,
out.shape_,
temp_col.shape_,
kernel,
padding,
stride,
dilate,
temp_col.dptr_);

const index_t gstride = temp_col.size(0) / param_.num_group;
for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
mshadow::Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid,
gstride * (gid + 1));
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1));
// Legacy approach shown here for comparison:
// tmpc = dot(wmat[gid].T(), temp_dst[gid]);
linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s);
}
if (o_pad[0] == 0 && o_pad[1] == 0) {
out.Slice(i, i + step) = pack_col2patch(temp_col,
out.Slice(i, i + step).shape_,
kernel[0],
kernel[1],
stride[0],
stride[1],
dilate[0],
dilate[1]);
} else {
Shape<4> pshape = out.Slice(i, i + step).shape_;
pshape[2] += 2 * o_pad[0];
pshape[3] += 2 * o_pad[1];
out.Slice(i, i + step) = crop(pack_col2patch(temp_col,
pshape,
kernel[0],
kernel[1],
stride[0],
stride[1],
dilate[0],
dilate[1]),
out[i][0].shape_);
}

col2im(
s,
temp_col.dptr_,
out.Slice(i, i + 1).shape_,
temp_col.shape_,
kernel,
padding,
stride,
dilate,
out.Slice(i, i + 1).dptr_,
req[deconv::kOut]);
}

if (!param_.no_bias) {
// add bias, broadcast bias to dim 1: channel
Tensor<xpu, 1, DType> bias = in_data[deconv::kBias].get<xpu, 1, DType>(s);
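The forward hunk above swaps mshadow's unpack_patch2col/pack_col2patch for an explicit per-image, per-group GEMM followed by col2im. To see the whole pipeline in one place, here is a minimal numpy sketch — the im2col/col2im helpers, names, and shapes below are illustrative assumptions, not MXNet's actual kernels (those live in ./im2col.h and also handle N-D shapes, groups, and req modes):

```python
import numpy as np

def im2col(img, k, stride=1, pad=0, dilate=1):
    """Unfold a (C, H, W) image into a (C*k*k, OH*OW) column matrix."""
    C, H, W = img.shape
    dk = dilate * (k - 1) + 1                      # dilated kernel extent
    OH = (H + 2 * pad - dk) // stride + 1
    OW = (W + 2 * pad - dk) // stride + 1
    p = np.pad(img, ((0, 0), (pad, pad), (pad, pad)))
    cols = np.zeros((C * k * k, OH * OW))
    for c in range(C):
        for i in range(k):
            for j in range(k):
                patch = p[c,
                          i * dilate:i * dilate + stride * OH:stride,
                          j * dilate:j * dilate + stride * OW:stride]
                cols[(c * k + i) * k + j] = patch.ravel()
    return cols

def col2im(cols, out_shape, k, stride=1, pad=0, dilate=1):
    """Adjoint of im2col: scatter-add columns back into a (C, H, W) image."""
    C, H, W = out_shape
    dk = dilate * (k - 1) + 1
    OH = (H + 2 * pad - dk) // stride + 1
    OW = (W + 2 * pad - dk) // stride + 1
    p = np.zeros((C, H + 2 * pad, W + 2 * pad))
    for c in range(C):
        for i in range(k):
            for j in range(k):
                p[c,
                  i * dilate:i * dilate + stride * OH:stride,
                  j * dilate:j * dilate + stride * OW:stride] += \
                    cols[(c * k + i) * k + j].reshape(OH, OW)
    return p[:, pad:pad + H, pad:pad + W]

# Deconvolution forward for one image, num_group == 1 (hypothetical sizes):
cin, cout, k, stride, pad, dilate = 2, 1, 3, 1, 1, 2
H = W = 3
OH = stride * (H - 1) + dilate * (k - 1) + 1 - 2 * pad   # output size, here 5
x = np.random.randn(cin, H, W)                 # one input image
w = np.random.randn(cin, cout * k * k)         # like wmat[gid]: (Cin, Cout*k*k)
temp_col = w.T @ x.reshape(cin, -1)            # the linalg_gemm(..., true, false)
out = col2im(temp_col, (cout, OH, OH), k, stride, pad, dilate)
assert out.shape == (cout, OH, OH)
```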
@@ -344,24 +328,24 @@ class DeconvolutionOp {
CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true);
// get data
Stream<xpu> *s = ctx.get_stream<xpu>();
#if defined(__CUDACC__)
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
<< "Must init CuBLAS handle in stream";
#endif
auto in_data_shape = in_data[deconv::kData].shape_;
Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s);
Tensor<xpu, 4, DType> grad = TBlobTo4DTensor(out_grad[deconv::kOut], s);
Tensor<xpu, 4, DType> gdata = TBlobTo4DTensor(in_grad[deconv::kData], s);

index_t o_pad[2], o_adj[2];
if (param_.kernel.ndim() == 2) {
param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj);
} else {
index_t o_pad_1D[1], o_adj_1D[1];
param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D);
o_pad[0] = 0;
o_pad[1] = o_pad_1D[0];
o_adj[0] = 0;
o_adj[1] = o_adj_1D[0];
param_.InferPad({in_data_shape[2]}, o_pad, o_adj);
}
auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
auto padding = param_.kernel.ndim() == 2 ?
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]});
auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]});
auto kernel_size = kernel.Size();

@@ -373,55 +357,46 @@ class DeconvolutionOp {
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
Tensor<xpu, 3, DType> gwmat =
in_grad[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
#if defined(__CUDACC__)
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
<< "Must init CuBLAS handle in stream";
#endif

const index_t nbatch = data.size(0);
Tensor<xpu, 1, DType> workspace =
ctx.requested[deconv::kTempSpace].get_space_typed<xpu, 1, DType>(
Shape1(this->InitTemp(grad.shape_, data.shape_)), s);
for (index_t i = 0; i < nbatch; i += nstep_) {
const index_t step = std::min(nstep_, nbatch - i);
Contributor: Again, can you tell what the purpose of "step" was in the previous code?
apeforest (author): I think it was used to convert multiple batches of image data into columns in the previous library. However, that is not supported by the col2im method.

for (index_t i = 0; i < nbatch; ++i) {
Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>(
workspace.dptr_,
Shape2(shape_colunit_[0],
shape_colunit_[1] * step), s);
Shape2(shape_colunit_[0], shape_colunit_[1]),
s);
Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>(
workspace.dptr_ + temp_col.shape_.Size(),
Shape3(shape_dstunit_[0],
shape_dstunit_[1],
shape_dstunit_[2] * step), s);
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
if (o_pad[0] == 0 && o_pad[1] == 0) {
temp_col = unpack_patch2col(grad.Slice(i, i + step),
kernel[0],
kernel[1],
stride[0],
stride[1],
dilate[0],
dilate[1]);
} else {
temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), o_pad[0], o_pad[1]),
kernel[0],
kernel[1],
stride[0],
stride[1],
dilate[0],
dilate[1]);
}
shape_dstunit_[1],
shape_dstunit_[2]),
s);
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_);

im2col(
s,
(grad.Slice(i, i + 1)).dptr_,
grad.shape_,
temp_col.shape_,
kernel,
padding,
stride,
dilate,
temp_col.dptr_);

const index_t gstride = temp_col.size(0) / param_.num_group;
for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1));
if (i == 0) {
Tensor<xpu, 2, DType> tmp_gwmat = gwmat[gid];
// Legacy approach shown here for comparison:
// Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T()));
linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]);
} else {
// Legacy approach shown here for comparison:
// gwmat[gid] += dot(temp_dst[gid], tmpc.T());
linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo);
}
}
@@ -431,16 +406,16 @@ class DeconvolutionOp {
for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1));
// Legacy approach shown here for comparison:
// temp_dst[gid] = dot(wmat[gid], tmpc);
linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s);
}
Assign(gdata.Slice(i, i + step),
Assign(gdata.Slice(i, i + 1),
req[deconv::kData],
(swapaxis<1, 0>(reshape(temp_dst,
mshadow::Shape4(gdata.shape_[1],
step,
gdata.size(2),
gdata.size(3))))));
Shape4(gdata.shape_[1],
1,
gdata.size(2),
gdata.size(3))))));
}
}
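The backward hunk is the mirror image: each image's output gradient is unfolded once with im2col, and that single column matrix feeds both the weight-gradient GEMM (temp_dst · tmpcᵀ, accumulated with kAddTo after the first image) and the data-gradient GEMM (wmat · tmpc). Continuing the numpy sketch from the forward section, under the same assumptions and with batch accumulation and req handling omitted:

```python
# Upstream gradient has the deconvolution *output* shape (Cout, OH, OH).
grad = np.random.randn(cout, OH, OH)
g_col = im2col(grad, k, stride, pad, dilate)   # (Cout*k*k, H*W) columns
gw = x.reshape(cin, -1) @ g_col.T              # weight gradient, shaped like w
gx = (w @ g_col).reshape(cin, H, W)            # data gradient: a plain convolution
```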
if (!param_.no_bias) {
@@ -458,17 +433,12 @@ class DeconvolutionOp {
shape_dstunit_ = mshadow::Shape3(param_.num_group,
oshape[1] / param_.num_group,
oshape[2] * oshape[3]);
// See convolution for workspace calculations. nstep_ will be the effective batch size
nstep_ = std::max<index_t>(
std::min(static_cast<index_t>(param_.workspace) /
(shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]),
1);

mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0],
shape_colunit_[1] * nstep_);
shape_colunit_[1]);
mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0],
shape_dstunit_[1],
shape_dstunit_[2] * nstep_);
shape_dstunit_[2]);
index_t required_size = scol.Size() + sdst.Size();
return required_size;
}
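Since col2im consumes one image at a time, InitTemp no longer scales the column and destination buffers by nstep_; the workspace now covers a single image. A rough back-of-envelope with made-up layer sizes (all numbers hypothetical) shows the saving:

```python
# Elements in one image's buffers, loosely mirroring scol.Size() + sdst.Size():
col_unit = (16 * 3 * 3) * (32 * 32)   # (C/g * k*k) x (OH*OW) column buffer
dst_unit = 16 * (32 * 32)             # g x (C_out/g) x (OH*OW) dst buffer
old_nstep = 4                         # old effective batch chunk, now removed
print("old:", old_nstep * (col_unit + dst_unit), "new:", col_unit + dst_unit)
```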
@@ -485,7 +455,6 @@ class DeconvolutionOp {
DeconvolutionParam param_;
mshadow::Shape<2> shape_colunit_;
mshadow::Shape<3> shape_dstunit_;
index_t nstep_;
Contributor: Can you please tell me why this was removed?
apeforest (author, Mar 6, 2019): The col2im method does not support such a step.

}; // class DeconvolutionOp

template<typename xpu>
34 changes: 34 additions & 0 deletions tests/python/unittest/test_gluon.py
@@ -503,6 +503,40 @@ def test_deconv():
# layer = nn.Conv3DTranspose(16, (3, 3, 3), layout='NDHWC', in_channels=4)
# # check_layer_forward(layer, (1, 10, 10, 10, 4))

@with_seed()
def test_deconv_dilation():
Member: Since deconv is a really important op, I suggest revisiting the original deconv test cases and adding dilation > 1 cases alongside the old tests; that ensures better coverage than this single test case. Feel free to keep this unit test, which LGTM as well.

data = mx.nd.array((((0,0,0),
(0,1,0),
(0,0,0)),
((0,0,0),
(0,2,0),
(0,0,0))))

kernel = mx.nd.array(((1,2,3),
(4,5,6),
(7,8,9)))

data_batch = data.expand_dims(1)
weight = kernel.expand_dims(0).expand_dims(0)
layer = nn.Conv2DTranspose(in_channels=1, channels=1,
kernel_size=(3,3), padding=(1,1),
strides=(1,1), dilation=(2,2))
layer.initialize()
layer.weight.set_data(weight)
out = layer(data_batch).asnumpy()
expected = np.array([[[[1.,0.,2.,0.,3.],
[0.,0.,0.,0.,0.],
[4.,0.,5.,0.,6.],
[0.,0.,0.,0.,0.],
[7.,0.,8.,0.,9.]]],
[[[2.,0.,4.,0.,6.],
[0.,0.,0.,0.,0.],
[8.,0.,10.,0.,12.],
[0.,0.,0.,0.,0.],
[14.,0.,16.,0.,18.]]]
])
assert_almost_equal(out, expected)
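The expected array can be checked by hand: with stride 1 and a one-hot input, a transposed convolution deposits a single dilated copy of the kernel scaled by the non-zero input value, and padding (1, 1) trims exactly the border introduced by the center placement. A small numpy sanity check under those assumptions:

```python
import numpy as np

kernel = np.arange(1, 10, dtype=float).reshape(3, 3)

def dilate2d(kern, d):
    """Insert d-1 zeros between taps: a 3x3 kernel with d=2 becomes 5x5."""
    out = np.zeros(((kern.shape[0] - 1) * d + 1, (kern.shape[1] - 1) * d + 1))
    out[::d, ::d] = kern
    return out

# Center taps are 1 and 2 in the two input planes above:
for v in (1.0, 2.0):
    print(v * dilate2d(kernel, 2))   # reproduces the rows of `expected`
```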


@with_seed()
def test_pool():
3 changes: 1 addition & 2 deletions tests/python/unittest/test_operator.py
@@ -1256,7 +1256,7 @@ def test_abs():
assert_almost_equal(out, npout)

out_grad = mx.nd.empty(shape)
out_grad[:] = 2;
out_grad[:] = 2
npout_grad = out_grad.asnumpy()
npout_grad = npout_grad * np.sign(data_tmp)
exe_test.backward(out_grad)
@@ -1380,7 +1380,6 @@ def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, targ
assert out_shapes[0] == (input_shape[0], 5) + target_shape


@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/10973")
@with_seed()
def test_deconvolution():
# 2D