-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Fix transposed convolution in CPU w/o MKLDNN. #14031
Changes from 11 commits
9fe0589
19dfcb5
747df6c
854cff2
da75280
20ae427
926cfd7
c49dbe1
afd75d1
5b59097
d1554c1
a9da95e
1268457
953dd95
fabc318
6322da9
4b36009
01e371e
51ed635
a649f67
a6b0b9e
3240833
27f2033
88892d2
aa3c4dd
1580ba0
0675b3b
f403b9c
bdbf81d
2c92980
0c44ec8
2c868fd
5dacddc
29c4488
5f3c881
db3aaef
2181f80
424f36d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,6 +36,7 @@ | |
#include <utility> | ||
#include "../operator_common.h" | ||
#include "../linalg.h" | ||
#include "./im2col.h" | ||
|
||
|
||
namespace mxnet { | ||
|
@@ -118,7 +119,7 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> { | |
} | ||
|
||
template<size_t ndim> | ||
void InferPad(TShape input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim] ) const { | ||
void InferPad(const TShape &input, index_t (&o_pad)[ndim], index_t (&o_adj)[ndim]) const { | ||
// Modified by Li.bs | ||
// Use tag to control the calculation of pad | ||
bool bCal = false; | ||
|
@@ -226,22 +227,24 @@ class DeconvolutionOp { | |
CHECK_EQ(in_data.size(), expected); | ||
CHECK_EQ(out_data.size(), 1U); | ||
Stream<xpu> *s = ctx.get_stream<xpu>(); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
auto in_data_shape = in_data[deconv::kData].shape_; | ||
Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s); | ||
Tensor<xpu, 4, DType> out = TBlobTo4DTensor(out_data[deconv::kOut], s); | ||
index_t o_pad[2], o_adj[2]; | ||
if (param_.kernel.ndim() == 2) { | ||
param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); | ||
} else { | ||
index_t o_pad_1D[1], o_adj_1D[1]; | ||
param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); | ||
o_pad[0] = 0; | ||
o_pad[1] = o_pad_1D[0]; | ||
o_adj[0] = 0; | ||
o_adj[1] = o_adj_1D[0]; | ||
param_.InferPad({in_data_shape[2]}, o_pad, o_adj); | ||
} | ||
|
||
auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); | ||
auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); | ||
auto padding = param_.kernel.ndim() == 2 ? | ||
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); | ||
auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); | ||
auto kernel_size = kernel.Size(); | ||
|
||
|
@@ -251,76 +254,57 @@ class DeconvolutionOp { | |
param_.num_filter / param_.num_group * kernel_size); | ||
Tensor<xpu, 3, DType> wmat = | ||
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
const index_t nbatch = data.size(0); | ||
Tensor<xpu, 1, DType> workspace = | ||
ctx.requested[deconv::kTempSpace].get_space_typed<xpu, 1, DType>( | ||
Shape1(this->InitTemp(out.shape_, data.shape_)), s); | ||
for (index_t i = 0; i < nbatch; i += nstep_) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you know what "nstep_" was doing earlier? It would help to understand the problem with the earlier code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
const index_t step = std::min(nstep_, nbatch - i); | ||
for (index_t i = 0; i < nbatch; ++i) { | ||
// temp_col: (N * kernel_size, OW * OH) | ||
Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>( | ||
workspace.dptr_, | ||
Shape2(shape_colunit_[0], | ||
shape_colunit_[1] * step), s); | ||
Shape2(shape_colunit_[0], shape_colunit_[1]), | ||
s); | ||
// temp_dst: (N, N/n_grup, OW * OH) | ||
Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>( | ||
workspace.dptr_ + temp_col.shape_.Size(), | ||
Shape3(shape_dstunit_[0], | ||
shape_dstunit_[1], | ||
shape_dstunit_[2] * step), s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); | ||
if (o_pad[0] == 0 && o_pad[1] == 0) { | ||
temp_col = unpack_patch2col(out.Slice(i, i + step), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} else { | ||
temp_col = unpack_patch2col(pad(out.Slice(i, i + step), | ||
o_pad[0], o_pad[1]), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} | ||
shape_dstunit_[1], | ||
shape_dstunit_[2]), | ||
s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); | ||
|
||
im2col( | ||
s, | ||
(out.Slice(i, i + 1)).dptr_, | ||
out.shape_, | ||
temp_col.shape_, | ||
kernel, | ||
padding, | ||
stride, | ||
dilate, | ||
temp_col.dptr_); | ||
|
||
const index_t gstride = temp_col.size(0) / param_.num_group; | ||
for (uint32_t gid = 0; gid < param_.num_group; ++gid) { | ||
mshadow::Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, | ||
gstride * (gid + 1)); | ||
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); | ||
// Legacy approach shown here for comparison: | ||
// tmpc = dot(wmat[gid].T(), temp_dst[gid]); | ||
// tmpc = dot(wmat[gid].T(), temp_dst[gid]); | ||
linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s); | ||
} | ||
if (o_pad[0] == 0 && o_pad[1] == 0) { | ||
out.Slice(i, i + step) = pack_col2patch(temp_col, | ||
out.Slice(i, i + step).shape_, | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} else { | ||
Shape<4> pshape = out.Slice(i, i + step).shape_; | ||
pshape[2] += 2 * o_pad[0]; | ||
pshape[3] += 2 * o_pad[1]; | ||
out.Slice(i, i + step) = crop(pack_col2patch(temp_col, | ||
pshape, | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]), | ||
out[i][0].shape_); | ||
} | ||
|
||
col2im( | ||
s, | ||
temp_col.dptr_, | ||
out.Slice(i, i + 1).shape_, | ||
temp_col.shape_, | ||
kernel, | ||
padding, | ||
stride, | ||
dilate, | ||
out.Slice(i, i + 1).dptr_, | ||
req[deconv::kOut]); | ||
} | ||
|
||
if (!param_.no_bias) { | ||
// add bias, broadcast bias to dim 1: channel | ||
Tensor<xpu, 1, DType> bias = in_data[deconv::kBias].get<xpu, 1, DType>(s); | ||
|
@@ -344,24 +328,24 @@ class DeconvolutionOp { | |
CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); | ||
// get data | ||
Stream<xpu> *s = ctx.get_stream<xpu>(); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
auto in_data_shape = in_data[deconv::kData].shape_; | ||
Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s); | ||
Tensor<xpu, 4, DType> grad = TBlobTo4DTensor(out_grad[deconv::kOut], s); | ||
Tensor<xpu, 4, DType> gdata = TBlobTo4DTensor(in_grad[deconv::kData], s); | ||
|
||
index_t o_pad[2], o_adj[2]; | ||
if (param_.kernel.ndim() == 2) { | ||
param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj); | ||
} else { | ||
index_t o_pad_1D[1], o_adj_1D[1]; | ||
param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D); | ||
o_pad[0] = 0; | ||
o_pad[1] = o_pad_1D[0]; | ||
o_adj[0] = 0; | ||
o_adj[1] = o_adj_1D[0]; | ||
param_.InferPad({in_data_shape[2]}, o_pad, o_adj); | ||
} | ||
auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]}); | ||
auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]}); | ||
auto padding = param_.kernel.ndim() == 2 ? | ||
TShape({o_pad[0], o_pad[1]}) : TShape({0, o_pad[0]}); | ||
auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]}); | ||
auto kernel_size = kernel.Size(); | ||
|
||
|
@@ -373,55 +357,46 @@ class DeconvolutionOp { | |
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s); | ||
Tensor<xpu, 3, DType> gwmat = | ||
in_grad[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s); | ||
#if defined(__CUDACC__) | ||
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle) | ||
<< "Must init CuBLAS handle in stream"; | ||
#endif | ||
|
||
const index_t nbatch = data.size(0); | ||
Tensor<xpu, 1, DType> workspace = | ||
ctx.requested[deconv::kTempSpace].get_space_typed<xpu, 1, DType>( | ||
Shape1(this->InitTemp(grad.shape_, data.shape_)), s); | ||
for (index_t i = 0; i < nbatch; i += nstep_) { | ||
const index_t step = std::min(nstep_, nbatch - i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, can you tell what the purpose of "step" was in the previous code? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it was used to convert multiple batches of image data into columns in the previous library. However, it is not supported in the |
||
for (index_t i = 0; i < nbatch; ++i) { | ||
Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>( | ||
workspace.dptr_, | ||
Shape2(shape_colunit_[0], | ||
shape_colunit_[1] * step), s); | ||
Shape2(shape_colunit_[0], shape_colunit_[1]), | ||
s); | ||
Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>( | ||
workspace.dptr_ + temp_col.shape_.Size(), | ||
Shape3(shape_dstunit_[0], | ||
shape_dstunit_[1], | ||
shape_dstunit_[2] * step), s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); | ||
if (o_pad[0] == 0 && o_pad[1] == 0) { | ||
temp_col = unpack_patch2col(grad.Slice(i, i + step), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} else { | ||
temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), o_pad[0], o_pad[1]), | ||
kernel[0], | ||
kernel[1], | ||
stride[0], | ||
stride[1], | ||
dilate[0], | ||
dilate[1]); | ||
} | ||
shape_dstunit_[1], | ||
shape_dstunit_[2]), | ||
s); | ||
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + 1)), temp_dst.shape_); | ||
|
||
im2col( | ||
s, | ||
(grad.Slice(i, i + 1)).dptr_, | ||
grad.shape_, | ||
temp_col.shape_, | ||
kernel, | ||
padding, | ||
stride, | ||
dilate, | ||
temp_col.dptr_); | ||
|
||
const index_t gstride = temp_col.size(0) / param_.num_group; | ||
for (uint32_t gid = 0; gid < param_.num_group; ++gid) { | ||
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); | ||
if (i == 0) { | ||
Tensor<xpu, 2, DType> tmp_gwmat = gwmat[gid]; | ||
// Legacy approach shown here for comparison: | ||
// Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); | ||
// Assign(tmp_gwmat, req[deconv::kWeight], dot(temp_dst[gid], tmpc.T())); | ||
linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[deconv::kWeight]); | ||
} else { | ||
// Legacy approach shown here for comparison: | ||
// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); | ||
// gwmat[gid] += dot(temp_dst[gid], tmpc.T()); | ||
linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo); | ||
} | ||
} | ||
|
@@ -431,16 +406,16 @@ class DeconvolutionOp { | |
for (uint32_t gid = 0; gid < param_.num_group; ++gid) { | ||
Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1)); | ||
// Legacy approach shown here for comparison: | ||
// temp_dst[gid] = dot(wmat[gid], tmpc); | ||
// temp_dst[gid] = dot(wmat[gid], tmpc); | ||
linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s); | ||
} | ||
Assign(gdata.Slice(i, i + step), | ||
Assign(gdata.Slice(i, i + 1), | ||
req[deconv::kData], | ||
(swapaxis<1, 0>(reshape(temp_dst, | ||
mshadow::Shape4(gdata.shape_[1], | ||
step, | ||
gdata.size(2), | ||
gdata.size(3)))))); | ||
Shape4(gdata.shape_[1], | ||
1, | ||
gdata.size(2), | ||
gdata.size(3)))))); | ||
} | ||
} | ||
if (!param_.no_bias) { | ||
|
@@ -458,17 +433,12 @@ class DeconvolutionOp { | |
shape_dstunit_ = mshadow::Shape3(param_.num_group, | ||
oshape[1] / param_.num_group, | ||
oshape[2] * oshape[3]); | ||
// See convolution for workspace calculations. nstep_ will be the effective batch size | ||
nstep_ = std::max<index_t>( | ||
std::min(static_cast<index_t>(param_.workspace) / | ||
(shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), | ||
1); | ||
|
||
mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0], | ||
shape_colunit_[1] * nstep_); | ||
shape_colunit_[1]); | ||
mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0], | ||
shape_dstunit_[1], | ||
shape_dstunit_[2] * nstep_); | ||
shape_dstunit_[2]); | ||
index_t required_size = scol.Size() + sdst.Size(); | ||
return required_size; | ||
} | ||
|
@@ -485,7 +455,6 @@ class DeconvolutionOp { | |
DeconvolutionParam param_; | ||
mshadow::Shape<2> shape_colunit_; | ||
mshadow::Shape<3> shape_dstunit_; | ||
index_t nstep_; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please tell me why was this removed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
}; // class DeconvolutionOp | ||
|
||
template<typename xpu> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -503,6 +503,40 @@ def test_deconv(): | |
# layer = nn.Conv3DTranspose(16, (3, 3, 3), layout='NDHWC', in_channels=4) | ||
# # check_layer_forward(layer, (1, 10, 10, 10, 4)) | ||
|
||
@with_seed() | ||
def test_deconv_dilation(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since deconv is a really important OP, I suggest to visit the original deconv test cases and add dilation > 1 cases alongside the old tests. This ensures better coverage than this single test case. |
||
data = mx.nd.array((((0,0,0), | ||
(0,1,0), | ||
(0,0,0)), | ||
((0,0,0), | ||
(0,2,0), | ||
(0,0,0)))) | ||
|
||
kernel = mx.nd.array(((1,2,3), | ||
(4,5,6), | ||
(7,8,9))) | ||
|
||
data_batch = data.expand_dims(1) | ||
weight = kernel.expand_dims(0).expand_dims(0) | ||
layer = nn.Conv2DTranspose(in_channels=1, channels=1, | ||
kernel_size=(3,3), padding=(1,1), | ||
strides=(1,1), dilation=(2,2)) | ||
layer.initialize() | ||
layer.weight.set_data(weight) | ||
out = layer(data_batch).asnumpy() | ||
expected = np.array([[[[1.,0.,2.,0.,3.], | ||
[0.,0.,0.,0.,0.], | ||
[4.,0.,5.,0.,6.], | ||
[0.,0.,0.,0.,0.], | ||
[7.,0.,8.,0.,9.]]], | ||
[[[2.,0.,4.,0.,6.], | ||
[0.,0.,0.,0.,0.], | ||
[8.,0.,10.,0.,12.], | ||
[0.,0.,0.,0.,0.], | ||
[14.,0.,16.,0.,18.]]] | ||
]) | ||
assert_almost_equal(out, expected) | ||
|
||
|
||
@with_seed() | ||
def test_pool(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"cuBLAS" is the official abbreviation :)