diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index dbbb73b1361a..843ad8a056fc 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -137,3 +137,8 @@ List of Contributors * [Roshani Nagmote](https://github.com/Roshrini) * [Chetan Khatri](https://github.com/chetkhatri/) * [James Liu](https://github.com/jamesliu/) +* [Yuwen Xiong](https://github.com/Orpine/) +* [Haozhi Qi](https://github.com/Oh233/) +* [Yi Li](https://github.com/liyi14/) +* [Guodong Zhang](https://github.com/gd-zhang/) +* [Xizhou Zhu](https://github.com/einsiedler0408/) diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h new file mode 100644 index 000000000000..da979e707aee --- /dev/null +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -0,0 +1,488 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution-inl.h + * \brief + * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../nn/im2col.h" +#include "./nn/deformable_im2col.h" + + +namespace mxnet { +namespace op { + +namespace conv { + enum DeformableConvolutionOpInputs { kData, kOffset, kWeight, kBias }; + enum DeformableConvolutionOpOutputs { kOut }; + enum DeformableConvolutionOpResource { kTempSpace }; +} + +struct DeformableConvolutionParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape dilate; + TShape pad; + uint32_t num_filter; + uint32_t num_group; + uint32_t num_deformable_group; + uint64_t workspace; + bool no_bias; + dmlc::optional layout; + DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { + DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .describe("convolution stride: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(dilate).set_default(TShape()) + .describe("convolution dilate: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for convolution: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) + .describe("convolution filter(channel) number"); + DMLC_DECLARE_FIELD(num_group).set_default(1) + .describe("Number of group partitions."); + DMLC_DECLARE_FIELD(num_deformable_group).set_default(1) + .describe("Number of deformable group partitions."); + DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) + .describe("Maximum temperal workspace allowed for convolution (MB)."); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(layout) + .add_enum("NCW", mshadow::kNCW) + .add_enum("NCHW", mshadow::kNCHW) + .add_enum("NCDHW", mshadow::kNCDHW) + .set_default(dmlc::optional()) + .describe("Set layout for input, output and weight. Empty for\n " + "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); + } +}; + +template +class DeformableConvolutionOp : public Operator { + public: + explicit DeformableConvolutionOp(DeformableConvolutionParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. 
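+    // For example, with the default workspace of 1024 MB and DType = float
+    // (4 bytes), this gives (1024 << 20) / 4 = 268435456 elements.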
+ param_.workspace = (param_.workspace << 20) / sizeof(DType); + CHECK(param_.layout.value() == mshadow::kNCW || + param_.layout.value() == mshadow::kNCHW || + param_.layout.value() == mshadow::kNCDHW) + << "Only support NCW, NCHW and NCDHW layout"; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[conv::kOut], kWriteTo); + size_t expected = param_.no_bias ? 3 : 4; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + LayerSetUp(in_data[conv::kData].shape_, + in_data[conv::kOffset].shape_, + out_data[conv::kOut].shape_); + Stream* s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_data[0].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + index_t M = conv_out_channels_ / group_; + index_t N = conv_out_spatial_dim_; + index_t K = kernel_dim_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, M, K), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, K, N), s); + Tensor output_4d = out_data[conv::kOut].get_with_shape( + Shape4(num_, group_, M, N), s); + for (index_t n = 0; n < num_; ++n) { + // transform image to col_buffer in order to use gemm + deformable_im2col(s, in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + param_.num_deformable_group, col_buffer.dptr()); + Tensor output_3d = output_4d[n]; + for (index_t g = 0; g < group_; ++g) { + ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); + } + } + if (bias_term_) { + Tensor bias = in_data[conv::kBias].get(s); + Tensor output_3d = out_data[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + // has bias term, broadcast it to the same shape of output_3d in channel dim + output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + size_t expected = param_.no_bias == 0 ? 
4 : 3; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); + LayerSetUp(in_grad[conv::kData].shape_, + in_grad[conv::kOffset].shape_, + out_grad[conv::kOut].shape_); + Stream *s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + // For computing dLoss/d(in_data[kData]) + index_t M = kernel_dim_; + index_t N = conv_out_spatial_dim_; + index_t K = conv_out_channels_ / group_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + Tensor out_grad_4d = out_grad[conv::kOut].get_with_shape( + Shape4(num_, group_, K, N), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, M, N), s); + // For computing dLoss/dWeight + Tensor dweight_3d = in_grad[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + + Tensor data_grad = in_grad[conv::kData].FlatTo1D(s); + data_grad = 0; + + + for (index_t n = 0; n < num_; ++n) { + Tensor out_grad_3d = out_grad_4d[n]; + for (index_t g = 0; g < group_; ++g) { + col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); + } + + // gradient w.r.t. input coordinate data + deformable_col2im_coord(s, col_buffer.dptr(), + in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kOffset].dptr() + n*input_offset_dim_, + req[conv::kData]); + + // gradient w.r.t. input data + deformable_col2im(s, col_buffer.dptr(), + in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kData].dptr() + n*input_dim_, + req[conv::kData]); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + im2col(s, in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + col_buffer.dptr()); + for (index_t g = 0; g < group_; ++g) { + if (0 == n) { + ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], + dot(out_grad_3d[g], col_buffer_3d[g].T())); + } else { + dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); + } + } + } + + // gradient w.r.t bias + if (bias_term_) { + Tensor dbias = in_grad[conv::kBias].get(s); + Tensor dout = out_grad[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); + } + } + + private: + void LayerSetUp(const TShape& ishape, const TShape& offset_shape, const TShape& oshape) { + channel_axis_ = 1; // hard code channel axis + const index_t first_spatial_axis = channel_axis_ + 1; + const index_t num_axes = param_.kernel.ndim() + 2; + num_spatial_axes_ = num_axes - first_spatial_axis; + is_1x1_ = true; + for (index_t i = 0; i < param_.kernel.ndim(); ++i) { + is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; + if (!is_1x1_) break; + } + + // batch size + num_ = ishape[0]; + // number of input channels + channels_ = ishape[1]; + group_ = param_.num_group; + conv_out_channels_ = param_.num_filter; + conv_in_channels_ = channels_; + bias_term_ = !param_.no_bias; + kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; + conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); + col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // size of the column buffer used for storing im2col-ed pixels + col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; + // input/output image size (#channels * height * width) + input_dim_ = ishape.ProdShape(1, ishape.ndim()); + input_offset_dim_ = offset_shape.ProdShape(1, offset_shape.ndim()); + output_dim_ = oshape.ProdShape(1, oshape.ndim()); + num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; + num_kernels_col2im_ = input_dim_; + } + + private: + DeformableConvolutionParam param_; + index_t channel_axis_; // channel axis of the input + index_t channels_; // number of channels of input image + index_t num_spatial_axes_; // number of spatial axes + index_t num_; // batch size + index_t group_; // number of groups + index_t conv_out_channels_; // number of output channels (num_filter) + index_t conv_out_spatial_dim_; // number of pixels of output images per channel + index_t conv_in_channels_; // number of input channels + index_t kernel_dim_; // number of input channels per group * kernel size + index_t weight_offset_; // number of output channels per group * kernel_dim_ + index_t col_offset_; + index_t output_offset_; + index_t col_buffer_size_; + index_t input_dim_; + index_t input_offset_dim_; + index_t output_dim_; + index_t num_kernels_im2col_; + index_t num_kernels_col2im_; + bool bias_term_; // has bias term? 
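+  // Illustrative example of the bookkeeping above: for data of shape
+  // (1, 64, 32, 32), a 3x3 kernel, pad 1, stride 1, dilate 1, num_filter = 128
+  // and num_group = 1, the output is (1, 128, 32, 32), so
+  //   kernel_dim_           = 64 * 3 * 3     = 576
+  //   conv_out_spatial_dim_ = 32 * 32        = 1024
+  //   col_buffer_size_      = 576 * 1 * 1024 = 589824 elements,
+  // and each forward GEMM multiplies a (128 x 576) weight slice by a
+  // (576 x 1024) column buffer to give a (128 x 1024) output slice.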
+ bool is_1x1_; +}; // class ConvolutionOp + +template +Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); + +#if DMLC_USE_CXX11 +class DeformableConvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return{ "data", "offset", "weight", "bias" }; + } else { + return{ "data", "offset", "weight" }; + } + } + + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + LOG(FATAL) << "not implemented"; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 4U) << "Input:[data, offset, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, offset, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + const TShape &oshp = (*in_shape)[conv::kOffset]; + if (dshp.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + CHECK_EQ(oshp.ndim(), 4U) \ + << "Input offset should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \ + << "input num_filter must divide deformable group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = (dshape[2] + 2 * param_.pad[0] - + (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; + oshape[3] = (dshape[3] + 2 * param_.pad[1] - + (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + CHECK_EQ(oshape[1] % param_.num_deformable_group, 0U) \ + << "output num_filter must divide deformable group size"; + CHECK_EQ(oshape[2], offsetshape[2]) \ + << "output height must equal to offset map height"; + 
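+      // For example, with a 3x3 kernel and num_deformable_group = 1, the
+      // offset input must have 2 * 1 * 3 * 3 = 18 channels and the same
+      // spatial size as the output, i.e. (batch_size, 18, out_height, out_width).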
CHECK_EQ(oshape[3], offsetshape[3]) \ + << "output width must equal to offset map width"; + CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ + << "offset filter must divide deformable group size"; + CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), \ + param_.num_deformable_group) \ + << "offset filter must divide deformable group size"; + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (param_.stride[0] == 1) { + dshape[2] = oshape[2] + param_.dilate[0] * (ksize_y - 1) - 2 * param_.pad[0]; + } + if (param_.stride[1] == 1) { + dshape[3] = oshape[3] + param_.dilate[1] * (ksize_x - 1) - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(ksize_y, dshape[2] + 2 * param_.pad[0]) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(ksize_x, dshape[3] + 2 * param_.pad[1]) << "kernel size exceed input"; + } + return true; + } else { + LOG(FATAL) << "not implemented"; + return false; + } + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DeformableConvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "_contrib_DeformableConvolution"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return{ out_grad[conv::kOut], in_data[conv::kData], + in_data[conv::kOffset], in_data[conv::kWeight] }; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + DeformableConvolutionParam param_; +}; // class ConvolutionProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ diff --git a/src/operator/contrib/deformable_convolution.cc b/src/operator/contrib/deformable_convolution.cc new file mode 100644 index 000000000000..5af91a0aa407 --- /dev/null +++ b/src/operator/contrib/deformable_convolution.cc @@ -0,0 +1,89 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution.cc + * \brief + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ + +#include "./deformable_convolution-inl.h" + +namespace mxnet { +namespace op { +DMLC_REGISTER_PARAMETER(DeformableConvolutionParam); + +template<> +Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformableConvolutionOp(param); + }) + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *DeformableConvolutionProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); +} + +MXNET_REGISTER_OP_PROPERTY(_contrib_DeformableConvolution, DeformableConvolutionProp) +.describe(R"code(Compute 2-D deformable convolution on 4-D input. + +The deformable convolution operation is described in https://arxiv.org/abs/1703.06211 + +For 2-D deformable convolution, the shapes are + +- **data**: *(batch_size, channel, height, width)* +- **offset**: *(batch_size, num_deformable_group * kernel[0] * kernel[1], height, width)* +- **weight**: *(num_filter, channel, kernel[0], kernel[1])* +- **bias**: *(num_filter,)* +- **out**: *(batch_size, num_filter, out_height, out_width)*. + +Define:: + + f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1 + +then we have:: + + out_height=f(height, kernel[0], pad[0], stride[0], dilate[0]) + out_width=f(width, kernel[1], pad[1], stride[1], dilate[1]) + +If ``no_bias`` is set to be true, then the ``bias`` term is ignored. + +The default data ``layout`` is *NCHW*, namely *(batch_size, channle, height, +width)*. + +If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` +evenly into *g* parts along the channel axis, and also evenly split ``weight`` +along the first dimension. Next compute the convolution on the *i*-th part of +the data with the *i*-th weight part. The output is obtained by concating all +the *g* results. + +If ``num_deformable_group`` is larger than 1, denoted by *dg*, then split the +input ``offset`` evenly into *dg* parts along the channel axis, and also evenly +split ``out`` evenly into *dg* parts along the channel axis. Next compute the +deformable convolution, apply the *i*-th part of the offset part on the *i*-th +out. + + +Both ``weight`` and ``bias`` are learnable parameters. + + +)code" ADD_FILELINE) +.add_argument("data", "NDArray-or-Symbol", "Input data to the DeformableConvolutionOp.") +.add_argument("offset", "NDArray-or-Symbol", "Input offset to the DeformableConvolutionOp.") +.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") +.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") +.add_arguments(DeformableConvolutionParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/deformable_convolution.cu b/src/operator/contrib/deformable_convolution.cu new file mode 100644 index 000000000000..f690cc1ce24c --- /dev/null +++ b/src/operator/contrib/deformable_convolution.cu @@ -0,0 +1,29 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution.cu + * \brief + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ + +#include "./deformable_convolution-inl.h" +#include + +namespace mxnet { +namespace op { + + template<> + Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformableConvolutionOp(param); + }) + return op; + } + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/contrib/deformable_psroi_pooling-inl.h b/src/operator/contrib/deformable_psroi_pooling-inl.h new file mode 100644 index 000000000000..16a98f76bcff --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling-inl.h @@ -0,0 +1,286 @@ +/*! +* Copyright (c) 2017 Microsoft +* Licensed under The Apache-2.0 License [see LICENSE for details] +* \file deformable_psroi_pooling-inl.h +* \brief deformable psroi pooling operator and symbol +* \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "../mshadow_op.h" +#include "../operator_common.h" + + +namespace mxnet { +namespace op { + + // Declare enumeration of input order to make code more intuitive. + // These enums are only visible within this header +namespace deformablepsroipool { + enum DeformablePSROIPoolingOpInputs { kData, kBox, kTrans }; + enum DeformablePSROIPoolingOpOutputs { kOut, kTopCount }; +} // deformablepsroipool + +struct DeformablePSROIPoolingParam : public dmlc::Parameter { + // TShape pooled_size; + float spatial_scale; + int output_dim; + int group_size; + int pooled_size; + int part_size; + int sample_per_part; + float trans_std; + bool no_trans; + DMLC_DECLARE_PARAMETER(DeformablePSROIPoolingParam) { + DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) + .describe("Ratio of input feature map height (or w) to raw image height (or w). " + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(group_size).describe("fix group size"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(part_size).set_default(0).describe("fix part size"); + DMLC_DECLARE_FIELD(sample_per_part).set_default(1).describe("fix samples per part"); + DMLC_DECLARE_FIELD(trans_std).set_default(0.0).set_range(0.0, 1.0) + .describe("fix transition std"); + DMLC_DECLARE_FIELD(no_trans).set_default(false) + .describe("Whether to disable trans parameter."); + } +}; + +template +class DeformablePSROIPoolingOp : public Operator { + public: + explicit DeformablePSROIPoolingOp(DeformablePSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans? 
2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data[deformablepsroipool::kOut].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor out = out_data[deformablepsroipool::kOut].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount] + .get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + out = -FLT_MAX; + top_count = 0.0f; + + Tensor trans; + if (!param_.no_trans) { + trans = in_data[deformablepsroipool::kTrans].get(s); + } + DeformablePSROIPoolForward(out, data, bbox, trans, top_count, param_.no_trans, + param_.spatial_scale, param_.output_dim, param_.group_size, param_.pooled_size, + param_.part_size, param_.sample_per_part, param_.trans_std); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans ? 2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_grad[deformablepsroipool::kOut].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_NE(req[deformablepsroipool::kData], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[deformablepsroipool::kBox], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + // CHECK_NE(req[deformablepsroipool::kTrans], kWriteInplace) << + // "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount] + .get(s); + Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); + Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); + Tensor grad_trans; + Tensor trans; + if (!param_.no_trans) { + CHECK_EQ(in_grad.size(), 3); + trans = in_data[deformablepsroipool::kTrans].get(s); + grad_trans = in_grad[deformablepsroipool::kTrans].get(s); + } + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[deformablepsroipool::kData], 0); + if (!param_.no_trans) { + Assign(grad_trans, req[deformablepsroipool::kTrans], 0); + } + DeformablePSROIPoolBackwardAcc(grad_in, grad_trans, grad_out, data, bbox, trans, + top_count, param_.no_trans, param_.spatial_scale, param_.output_dim, param_.group_size, + param_.pooled_size, param_.part_size, param_.sample_per_part, param_.trans_std); + Assign(grad_roi, req[deformablepsroipool::kBox], 0); + } + + private: + DeformablePSROIPoolingParam 
param_; +}; // class DeformablePSROIPoolingOp + +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype); + +#if DMLC_USE_CXX11 +class DeformablePSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.no_trans) { + return{ "data", "rois" }; + } else { + return{ "data", "rois", "trans" }; + } + } + + std::vector ListOutputs() const override { + return{ "output", "top_count" }; + } + + int NumOutputs() const override { + return 2; + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.part_size == 0) { + param_.part_size = param_.pooled_size; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.no_trans) { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, rois, trans]"; + // trans: [num_rois, 2, pooled_h, pooled_w] + TShape tshape = in_shape->at(deformablepsroipool::kTrans); + CHECK_EQ(tshape.ndim(), 4) << "trans should be a 4D tensor of shape"; + } + + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(deformablepsroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(deformablepsroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + // top_count: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + DeformablePSROIPoolingProp* deformable_psroi_pooling_sym = new DeformablePSROIPoolingProp(); + deformable_psroi_pooling_sym->param_ = this->param_; + return deformable_psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_DeformablePSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.no_trans) { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], + in_data[deformablepsroipool::kBox], out_data[deformablepsroipool::kTopCount] }; + } else { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], + in_data[deformablepsroipool::kBox], in_data[deformablepsroipool::kTrans], + out_data[deformablepsroipool::kTopCount] }; + } + } + + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* 
CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + DeformablePSROIPoolingParam param_; +}; // class DeformablePSROIPoolingProp +#endif +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ diff --git a/src/operator/contrib/deformable_psroi_pooling.cc b/src/operator/contrib/deformable_psroi_pooling.cc new file mode 100644 index 000000000000..290bad2a76cd --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling.cc @@ -0,0 +1,96 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_psroi_pooling.cc + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./deformable_psroi_pooling-inl.h" +#include +#include +#include +#include +#include + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace mshadow { + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // NOT_IMPLEMENTED; + return; + } + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // NOT_IMPLEMENTED; + return; + } +} // namespace mshadow + +namespace mxnet { +namespace op { + + template<> + Operator *CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } + + Operator *DeformablePSROIPoolingProp::CreateOperatorEx( + Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); + } + + DMLC_REGISTER_PARAMETER(DeformablePSROIPoolingParam); + + MXNET_REGISTER_OP_PROPERTY(_contrib_DeformablePSROIPooling, DeformablePSROIPoolingProp) + .describe("Performs deformable position-sensitive region-of-interest pooling on inputs." + "The DeformablePSROIPooling operation is described in https://arxiv.org/abs/1703.06211." + "batch_size will change to the number of region bounding boxes after DeformablePSROIPooling") + .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") + .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " + "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " + "of designated region of interest. 
batch_index indicates the index of corresponding image " + "in the input data") + .add_argument("trans", "Symbol", "transition parameter") + .add_arguments(DeformablePSROIPoolingParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu new file mode 100644 index 000000000000..f9eb01a26e38 --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -0,0 +1,415 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./deformable_psroi_pooling-inl.h" +#include +#include +#include +#include +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" + +#define DeformablePSROIPOOLING_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) +#define CUDA_KERNEL_LOOP(i, n) \ +for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +namespace mshadow { +namespace cuda { + template + __device__ DType bilinear_interp( + const DType* data, + const DType x, + const DType y, + const int width, + const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + DType dist_x = static_cast(x - x1); + DType dist_y = static_cast(y - y1); + DType value11 = data[y1*width + x1]; + DType value12 = data[y2*width + x1]; + DType value21 = data[y1*width + x2]; + DType value22 = data[y2*width + x2]; + DType value = (1 - dist_x)*(1 - dist_y)*value11 + (1 - dist_x)*dist_y*value12 + + dist_x*(1 - dist_y)*value21 + dist_x*dist_y*value22; + return value; + } + + template + __global__ void DeformablePSROIPoolForwardKernel( + const int count, + const DType* bottom_data, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const DType* bottom_rois, const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + DType* top_data, + DType* top_count) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w] * trans_std; + DType trans_y = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + DType sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + DType val = bilinear_interp(offset_bottom_data + c*height*width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } + } + + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolForward"; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *top_data = out.dptr_; + DType *top_count_data = top_count.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans.size(1) / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(out.stream_); + DeformablePSROIPoolForwardKernel << > >( + count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } + + + template + __global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* top_count, + const int num_rois, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + DType* bottom_data_diff, DType* bottom_trans_diff, + const DType* bottom_data, + const DType* bottom_rois, + const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w] * trans_std; + DType trans_y = no_trans ? 
static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + DType diff_val = top_diff[index] / top_count[index]; + const DType* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + DType* offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + DType dist_x = w - x0, dist_y = h - y0; + DType q00 = (1 - dist_x)*(1 - dist_y); + DType q01 = (1 - dist_x)*dist_y; + DType q10 = dist_x*(1 - dist_y); + DType q11 = dist_x*dist_y; + int bottom_index_base = c * height *width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x0, q00*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x0, q01*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x1, q10*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x1, q11*diff_val); + + if (no_trans) { + continue; + } + DType U00 = offset_bottom_data[bottom_index_base + y0*width + x0]; + DType U01 = offset_bottom_data[bottom_index_base + y1*width + x0]; + DType U10 = offset_bottom_data[bottom_index_base + y0*width + x1]; + DType U11 = offset_bottom_data[bottom_index_base + y1*width + x1]; + DType diff_x = (U11*dist_y + U10*(1 - dist_y) - U01*dist_y - U00*(1 - dist_y)) + *trans_std*diff_val; + diff_x *= roi_width; + DType diff_y = (U11*dist_x + U01*(1 - dist_x) - U10*dist_x - U00*(1 - dist_x)) + *trans_std*diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w, diff_y); + } + } + } + } + + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const DType *top_diff = out_grad.dptr_; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *bottom_data_diff = in_grad.dptr_; + DType *bottom_trans_diff = no_trans ? 
NULL : trans_grad.dptr_; + const DType *top_count_data = top_count.dptr_; + const int count = out_grad.shape_.Size(); + const int num_rois = bbox.size(0); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans_grad.size(1) / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + DeformablePSROIPoolBackwardAccKernel << > >( + count, top_diff, top_count_data, num_rois, spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } + +} // namespace cuda + + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + cuda::DeformablePSROIPoolForward(out, data, bbox, trans, top_count, no_trans, spatial_scale, + output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); + } + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + cuda::DeformablePSROIPoolBackwardAcc(in_grad, trans_grad, out_grad, data, bbox, trans, + top_count, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); + } + +} // namespace mshadow + + +namespace mxnet { +namespace op { + + template<> + Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h new file mode 100644 index 000000000000..48f3535b5eab --- /dev/null +++ b/src/operator/contrib/multi_proposal-inl.h @@ -0,0 +1,301 @@ +/*! + * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file multi_proposal-inl.h + * \brief MultiProposal Operator + * \author Piotr Teterwak, Bing Xu, Jian Guo, Xizhou Zhu +*/ +#ifndef MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_ +#define MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" + +// extend NumericalParam +namespace mxnet { +namespace op { + +/*! 
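+* Example: the string "(0.5, 1, 2)" is parsed by operator>> below into
+* info = {0.5, 1, 2}; operator<< prints a one-element tuple as "(2,)",
+* mirroring Python tuple syntax.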
+* \brief structure for numerical tuple input +* \tparam VType data type of param +*/ +template +struct NumericalParam { + NumericalParam() {} + explicit NumericalParam(VType *begin, VType *end) { + int32_t size = static_cast(end - begin); + info.resize(size); + for (int i = 0; i < size; ++i) { + info[i] = *(begin + i); + } + } + inline size_t ndim() const { + return info.size(); + } + std::vector info; +}; + +template +inline std::istream &operator>>(std::istream &is, NumericalParam ¶m) { + while (true) { + char ch = is.get(); + if (ch == '(') break; + if (!isspace(ch)) { + is.setstate(std::ios::failbit); + return is; + } + } + VType idx; + std::vector tmp; + // deal with empty case + size_t pos = is.tellg(); + char ch = is.get(); + if (ch == ')') { + param.info = tmp; + return is; + } + is.seekg(pos); + // finish deal + while (is >> idx) { + tmp.push_back(idx); + char ch; + do { + ch = is.get(); + } while (isspace(ch)); + if (ch == ',') { + while (true) { + ch = is.peek(); + if (isspace(ch)) { + is.get(); continue; + } + if (ch == ')') { + is.get(); break; + } + break; + } + if (ch == ')') break; + } else if (ch == ')') { + break; + } else { + is.setstate(std::ios::failbit); + return is; + } + } + param.info = tmp; + return is; +} + +template +inline std::ostream &operator<<(std::ostream &os, const NumericalParam ¶m) { + os << '('; + for (index_t i = 0; i < param.info.size(); ++i) { + if (i != 0) os << ','; + os << param.info[i]; + } + // python style tuple + if (param.info.size() == 1) os << ','; + os << ')'; + return os; +} + +} // namespace op +} // namespace mxnet + +namespace mxnet { +namespace op { + +namespace proposal { +enum MultiProposalOpInputs {kClsProb, kBBoxPred, kImInfo}; +enum MultiProposalOpOutputs {kOut, kScore}; +enum MultiProposalForwardResource {kTempResource}; +} // proposal + +struct MultiProposalParam : public dmlc::Parameter { + int rpn_pre_nms_top_n; + int rpn_post_nms_top_n; + float threshold; + int rpn_min_size; + NumericalParam scales; + NumericalParam ratios; + int feature_stride; + bool output_score; + bool iou_loss; + DMLC_DECLARE_PARAMETER(MultiProposalParam) { + float tmp[] = {0, 0, 0, 0}; + DMLC_DECLARE_FIELD(rpn_pre_nms_top_n).set_default(6000) + .describe("Number of top scoring boxes to keep after applying NMS to RPN proposals"); + DMLC_DECLARE_FIELD(rpn_post_nms_top_n).set_default(300) + .describe("Overlap threshold used for non-maximum" + "suppresion(suppress boxes with IoU >= this threshold"); + DMLC_DECLARE_FIELD(threshold).set_default(0.7) + .describe("NMS value, below which to suppress."); + DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) + .describe("Minimum height or width in proposal"); + tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; + DMLC_DECLARE_FIELD(scales).set_default(NumericalParam(tmp, tmp + 4)) + .describe("Used to generate anchor windows by enumerating scales"); + tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; + DMLC_DECLARE_FIELD(ratios).set_default(NumericalParam(tmp, tmp + 3)) + .describe("Used to generate anchor windows by enumerating ratios"); + DMLC_DECLARE_FIELD(feature_stride).set_default(16) + .describe("The size of the receptive field each unit in the convolution layer of the rpn," + "for example the product of all stride's prior to this layer."); + DMLC_DECLARE_FIELD(output_score).set_default(false) + .describe("Add score to outputs"); + DMLC_DECLARE_FIELD(iou_loss).set_default(false) + .describe("Usage of IoU Loss"); + } +}; + +template +Operator *CreateOp(MultiProposalParam param); + +#if DMLC_USE_CXX11 
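+// Shape example, derived from InferShape below and assuming the default
+// 4 scales x 3 ratios = 12 anchors per location: cls_prob is
+// (batch, 2 * 12, H, W), bbox_pred is (batch, 4 * 12, H, W), im_info is
+// (batch, 3), and the outputs are (batch * rpn_post_nms_top_n, 5) proposals
+// and (batch * rpn_post_nms_top_n, 1) scores.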
+class MultiProposalProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3) << "Input:[cls_prob, bbox_pred, im_info]"; + const TShape &dshape = in_shape->at(proposal::kClsProb); + if (dshape.ndim() == 0) return false; + Shape<4> bbox_pred_shape; + bbox_pred_shape = Shape4(dshape[0], dshape[1] * 2, dshape[2], dshape[3]); + SHAPE_ASSIGN_CHECK(*in_shape, proposal::kBBoxPred, + bbox_pred_shape); + Shape<2> im_info_shape; + im_info_shape = Shape2(dshape[0], 3); + SHAPE_ASSIGN_CHECK(*in_shape, proposal::kImInfo, im_info_shape); + out_shape->clear(); + // output + out_shape->push_back(Shape2(dshape[0] * param_.rpn_post_nms_top_n, 5)); + // score + out_shape->push_back(Shape2(dshape[0] * param_.rpn_post_nms_top_n, 1)); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new MultiProposalProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "_contrib_MultiProposal"; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {}; + } + + int NumVisibleOutputs() const override { + if (param_.output_score) { + return 2; + } else { + return 1; + } + } + + int NumOutputs() const override { + return 2; + } + + std::vector ListArguments() const override { + return {"cls_prob", "bbox_pred", "im_info"}; + } + + std::vector ListOutputs() const override { + return {"output", "score"}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + MultiProposalParam param_; +}; // class MultiProposalProp + +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +//======================== +// Anchor Generation Utils +//======================== +namespace mxnet { +namespace op { +namespace utils { + +inline void _MakeAnchor(float w, + float h, + float x_ctr, + float y_ctr, + std::vector *out_anchors) { + out_anchors->push_back(x_ctr - 0.5f * (w - 1.0f)); + out_anchors->push_back(y_ctr - 0.5f * (h - 1.0f)); + out_anchors->push_back(x_ctr + 0.5f * (w - 1.0f)); + out_anchors->push_back(y_ctr + 0.5f * (h - 1.0f)); + out_anchors->push_back(0.0f); +} + +inline void _Transform(float scale, + float ratio, + const std::vector& base_anchor, + std::vector *out_anchors) { + float w = base_anchor[2] - base_anchor[1] + 1.0f; + float h = base_anchor[3] - base_anchor[1] + 1.0f; + float x_ctr = base_anchor[0] + 0.5 * (w - 1.0f); + float y_ctr = base_anchor[1] + 0.5 * (h - 1.0f); + float size = w * h; + float size_ratios = std::floor(size / ratio); + float new_w = std::floor(std::sqrt(size_ratios) + 0.5f) * scale; + float new_h = std::floor((new_w / scale * ratio) + 0.5f) * scale; + + _MakeAnchor(new_w, new_h, x_ctr, + y_ctr, out_anchors); +} + +// out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() +inline void GenerateAnchors(const std::vector& base_anchor, + const std::vector& ratios, + const std::vector& scales, + std::vector *out_anchors) { + for (size_t j = 0; j < ratios.size(); ++j) { + for (size_t k = 0; k < scales.size(); ++k) { + _Transform(scales[k], ratios[j], 
+                 base_anchor, out_anchors);
+    }
+  }
+}
+
+}  // namespace utils
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_
diff --git a/src/operator/contrib/multi_proposal.cc b/src/operator/contrib/multi_proposal.cc
new file mode 100644
index 000000000000..c8f75eaec547
--- /dev/null
+++ b/src/operator/contrib/multi_proposal.cc
@@ -0,0 +1,63 @@
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file multi_proposal.cc
+ * \brief MultiProposal Operator
+ * \author Xizhou Zhu
+*/
+
+#include "./multi_proposal-inl.h"
+
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu>
+class MultiProposalOp : public Operator {
+ public:
+  explicit MultiProposalOp(MultiProposalParam param) {
+    this->param_ = param;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    LOG(FATAL) << "not implemented";
+  }
+
+ private:
+  MultiProposalParam param_;
+};  // class MultiProposalOp
+
+template<>
+Operator *CreateOp<cpu>(MultiProposalParam param) {
+  return new MultiProposalOp<cpu>(param);
+}
+
+Operator* MultiProposalProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(MultiProposalParam);
+
+MXNET_REGISTER_OP_PROPERTY(_contrib_MultiProposal, MultiProposalProp)
+.describe("Generate region proposals via RPN")
+.add_argument("cls_prob", "NDArray-or-Symbol", "Score of how likely proposal is object.")
+.add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals")
+.add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.")
+.add_arguments(MultiProposalParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu
new file mode 100644
index 000000000000..052d777d5fac
--- /dev/null
+++ b/src/operator/contrib/multi_proposal.cu
@@ -0,0 +1,593 @@
+/*!
+ * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file multi_proposal.cu + * \brief MultiProposal Operator + * \author Shaoqing Ren, Xizhou Zhu, Jian Guo +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../operator_common.h" +#include "../mshadow_op.h" +#include "./multi_proposal-inl.h" + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#define FRCNN_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ +} while (0) + +namespace mshadow { +namespace cuda { +namespace multi_proposal { + +// scores are (b, 2 * anchor, h, w) +// workspace_proposals are (b, h * w * anchor, 5) +// w defines "x" and h defines "y" +// count should be total anchors numbers, h * w * anchors +template +__global__ void ProposalGridKernel(const int count, + const int num_anchors, + const int height, + const int width, + const int feature_stride, + const Dtype* scores, + Dtype* workspace_proposals) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % width; + int h = (index / num_anchors / width) % height; + int b = index / num_anchors / width / height; + + workspace_proposals[index * 5 + 0] = workspace_proposals[a * 5 + 0] + w * feature_stride; + workspace_proposals[index * 5 + 1] = workspace_proposals[a * 5 + 1] + h * feature_stride; + workspace_proposals[index * 5 + 2] = workspace_proposals[a * 5 + 2] + w * feature_stride; + workspace_proposals[index * 5 + 3] = workspace_proposals[a * 5 + 3] + h * feature_stride; + workspace_proposals[index * 5 + 4] = + scores[((b * (2 * num_anchors) + a + num_anchors) * height + h) * width + w]; + } +} + +// boxes are (b, h * w * anchor, 5) +// deltas are (b, 4 * anchor, h, w) +// out_pred_boxes are (b, h * w * anchor, 5) +// count should be total anchors numbers, b * h * w * anchors +// in-place write: boxes and out_pred_boxes are the same location +template +__global__ void BBoxPredKernel(const int count, + const int num_anchors, + const int feat_height, + const int feat_width, + const int feature_stride, + const Dtype* im_infos, + const Dtype* boxes, + const Dtype* deltas, + Dtype* out_pred_boxes) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % feat_width; + int h = (index / num_anchors / feat_width) % feat_height; + int b = index / num_anchors / feat_width / feat_height; + + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); + + float width = boxes[index * 5 + 2] - boxes[index * 5 + 0] + 1.0f; + float height = boxes[index * 5 + 3] - boxes[index * 5 + 1] + 1.0f; + float ctr_x = boxes[index * 5 + 0] + 0.5f * (width - 1.0f); + float ctr_y = boxes[index * 5 + 1] + 0.5f * (height - 1.0f); + + int ba = (b * num_anchors + a); + float dx = deltas[((ba * 4) * feat_height + h) * feat_width + w]; + float dy = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; + float dw = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; + float 
dh = deltas[((ba * 4 + 3) * feat_height + h) * feat_width + w]; + + float pred_ctr_x = dx * width + ctr_x; + float pred_ctr_y = dy * height + ctr_y; + float pred_w = exp(dw) * width; + float pred_h = exp(dh) * height; + + float pred_x1 = pred_ctr_x - 0.5f * (pred_w - 1.0f); + float pred_y1 = pred_ctr_y - 0.5f * (pred_h - 1.0f); + float pred_x2 = pred_ctr_x + 0.5f * (pred_w - 1.0f); + float pred_y2 = pred_ctr_y + 0.5f * (pred_h - 1.0f); + + pred_x1 = max(min(pred_x1, im_width - 1.0f), 0.0f); + pred_y1 = max(min(pred_y1, im_height - 1.0f), 0.0f); + pred_x2 = max(min(pred_x2, im_width - 1.0f), 0.0f); + pred_y2 = max(min(pred_y2, im_height - 1.0f), 0.0f); + + out_pred_boxes[index * 5 + 0] = pred_x1; + out_pred_boxes[index * 5 + 1] = pred_y1; + out_pred_boxes[index * 5 + 2] = pred_x2; + out_pred_boxes[index * 5 + 3] = pred_y2; + + if (h >= real_height || w >= real_width) { + out_pred_boxes[index * 5 + 4] = -1.0f; + } + } +} + +// boxes are (b, h * w * anchor, 5) +// deltas are (b, 4 * anchor, h, w) +// out_pred_boxes are (b, h * w * anchor, 5) +// count should be total anchors numbers, b * h * w * anchors +// in-place write: boxes and out_pred_boxes are the same location +template +__global__ void IoUPredKernel(const int count, + const int num_anchors, + const int feat_height, + const int feat_width, + const int feature_stride, + const Dtype* im_infos, + const Dtype* boxes, + const Dtype* deltas, + Dtype* out_pred_boxes) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % feat_width; + int h = (index / num_anchors / feat_width) % feat_height; + int b = index / num_anchors / feat_width / feat_height; + + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); + + float x1 = boxes[index * 5 + 0]; + float y1 = boxes[index * 5 + 1]; + float x2 = boxes[index * 5 + 2]; + float y2 = boxes[index * 5 + 3]; + + int ba = (b * num_anchors + a); + float dx1 = deltas[((ba * 4) * feat_height + h) * feat_width + w]; + float dy1 = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; + float dx2 = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; + float dy2 = deltas[((ba * 4 + 3) * feat_height + h) * feat_width + w]; + + float pred_x1 = max(min(x1 + dx1, im_width - 1.0f), 0.0f); + float pred_y1 = max(min(y1 + dy1, im_height - 1.0f), 0.0f); + float pred_x2 = max(min(x2 + dx2, im_width - 1.0f), 0.0f); + float pred_y2 = max(min(y2 + dy2, im_height - 1.0f), 0.0f); + + out_pred_boxes[index * 5 + 0] = pred_x1; + out_pred_boxes[index * 5 + 1] = pred_y1; + out_pred_boxes[index * 5 + 2] = pred_x2; + out_pred_boxes[index * 5 + 3] = pred_y2; + + if (h >= real_height || w >= real_width) { + out_pred_boxes[index * 5 + 4] = -1.0f; + } + } +} + +// filter box with stride less than rpn_min_size +// filter: set score to zero +// dets (b, n, 5) +template +__global__ void FilterBoxKernel(const int count, + const int count_anchors, + const float original_min_size, + const Dtype* im_infos, + Dtype* dets) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int b = index / count_anchors; + float iw = dets[index * 5 + 2] - dets[index * 5 + 0] + 1.0f; + float ih = dets[index * 5 + 3] - dets[index * 5 + 1] + 1.0f; + float min_size = original_min_size * im_infos[b * 3 + 2]; + if (iw < min_size || ih 
< min_size) { + dets[index * 5 + 0] -= min_size / 2; + dets[index * 5 + 1] -= min_size / 2; + dets[index * 5 + 2] += min_size / 2; + dets[index * 5 + 3] += min_size / 2; + dets[index * 5 + 4] = -1.0f; + } + } +} + +// copy score and init order +// dets (n, 5); score (n, ); order (n, ) +// count should be n (total anchors or proposals) +template +__global__ void CopyScoreKernel(const int count, + const Dtype* dets, + Dtype* score, + int* order) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + score[index] = dets[index * 5 + 4]; + order[index] = index; + } +} + +// reorder proposals according to order and keep the top_n proposals +// prev_dets (n, 5); order (n, ); dets (n, 5) +// count should be output anchor numbers (top_n) +template +__global__ void ReorderProposalsKernel(const int count, + const Dtype* prev_dets, + const int* order, + Dtype* dets) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + const int order_i = order[index]; + for (int j = 0; j < 5; j ++) { + dets[index * 5 + j] = prev_dets[order_i * 5 + j]; + } + } +} + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int threadsPerBlock = sizeof(uint64_t) * 8; + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +void _nms(const mshadow::Tensor& boxes, + const float nms_overlap_thresh, + int *keep, + int *num_out) { + const int threadsPerBlock = sizeof(uint64_t) * 8; + const int boxes_num = boxes.size(0); + const int boxes_dim = boxes.size(1); + + float* boxes_dev = boxes.dptr_; + uint64_t* mask_dev = NULL; + + const 
int col_blocks = DIVUP(boxes_num, threadsPerBlock); + FRCNN_CUDA_CHECK(cudaMalloc(&mask_dev, + boxes_num * col_blocks * sizeof(uint64_t))); + + dim3 blocks(DIVUP(boxes_num, threadsPerBlock), + DIVUP(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + std::vector mask_host(boxes_num * col_blocks); + FRCNN_CUDA_CHECK(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(uint64_t) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep[num_to_keep++] = i; + uint64_t *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + *num_out = num_to_keep; + + FRCNN_CUDA_CHECK(cudaFree(mask_dev)); +} + +// copy proposals to output +// dets (top_n, 5); keep (top_n, ); out (top_n, ) +// count should be top_n (total anchors or proposals) +template +__global__ void PrepareOutput(const int count, + const Dtype* dets, + const int* keep, + const int out_size, + const int image_index, + Dtype* out, + Dtype* score) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + out[index * 5] = image_index; + if (index < out_size) { + int keep_i = keep[index]; + for (int j = 0; j < 4; ++j) { + out[index * 5 + j + 1] = dets[keep_i * 5 + j]; + } + score[index] = dets[keep_i * 5 + 4]; + } else { + int keep_i = keep[index % out_size]; + for (int j = 0; j < 4; ++j) { + out[index * 5 + j + 1] = dets[keep_i * 5 + j]; + } + score[index] = dets[keep_i * 5 + 4]; + } + } +} +} // namespace multi_proposal +} // namespace cuda +} // namespace mshadow + +namespace mxnet { +namespace op { + +template +class MultiProposalGPUOp : public Operator{ + public: + explicit MultiProposalGPUOp(MultiProposalParam param) { + this->param_ = param; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow::cuda; + using namespace mshadow::cuda::multi_proposal; + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 2); + CHECK_GT(req.size(), 1); + CHECK_EQ(req[proposal::kOut], kWriteTo); + /*CHECK_EQ(in_data[proposal::kClsProb].shape_[0], 1) + << "Sorry, multiple images each device is not implemented.";*/ + + Stream *s = ctx.get_stream(); + + Tensor scores = in_data[proposal::kClsProb].get(s); + Tensor bbox_deltas = in_data[proposal::kBBoxPred].get(s); + Tensor im_info = in_data[proposal::kImInfo].get(s); + + Tensor out = out_data[proposal::kOut].get(s); + Tensor out_score = out_data[proposal::kScore].get(s); + + int num_images = scores.size(0); + int num_anchors = scores.size(1) / 2; + int height = scores.size(2); + int width = scores.size(3); + int count_anchors = num_anchors * height * width; // count of total anchors + int count = num_images * count_anchors; + // set to -1 for max + int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? 
param_.rpn_pre_nms_top_n + : count_anchors; + rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors); + int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n); + + // Generate first anchors based on base anchor + std::vector base_anchor(4); + base_anchor[0] = 0.0; + base_anchor[1] = 0.0; + base_anchor[2] = param_.feature_stride - 1.0; + base_anchor[3] = param_.feature_stride - 1.0; + CHECK_EQ(num_anchors, param_.ratios.info.size() * param_.scales.info.size()); + std::vector anchors; + utils::GenerateAnchors(base_anchor, + param_.ratios.info, + param_.scales.info, + &anchors); + + // Copy generated anchors to GPU + float* workspace_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, + sizeof(float) * num_images * count_anchors * 5)); + Tensor workspace_proposals(workspace_proposals_ptr, + Shape3(num_images, count_anchors, 5)); + FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_, &anchors[0], + sizeof(float) * anchors.size(), cudaMemcpyHostToDevice)); + + // Copy proposals to a mesh grid + dim3 dimGrid((count + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); + dim3 dimBlock(kMaxThreadsPerBlock); + CheckLaunchParam(dimGrid, dimBlock, "ProposalGrid"); + ProposalGridKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, + scores.dptr_, workspace_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // Transform anchors and bbox_deltas into bboxes + CheckLaunchParam(dimGrid, dimBlock, "BBoxPred"); + if (param_.iou_loss) { + IoUPredKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); + } else { + BBoxPredKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); + } + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // filter boxes with less than rpn_min_size + CheckLaunchParam(dimGrid, dimBlock, "FilterBox"); + FilterBoxKernel<<>>( + count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + + + dimGrid = dim3((count_anchors + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); + dimBlock = dim3(kMaxThreadsPerBlock); + // Copy score to a continuous memory + float* score_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&score_ptr, sizeof(float) * count_anchors)); + Tensor score(score_ptr, Shape1(count_anchors)); + int* order_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&order_ptr, sizeof(int) * count_anchors)); + Tensor order(order_ptr, Shape1(count_anchors)); + + float* workspace_ordered_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_ordered_proposals_ptr, + sizeof(float) * rpn_pre_nms_top_n * 5)); + Tensor workspace_ordered_proposals(workspace_ordered_proposals_ptr, + Shape2(rpn_pre_nms_top_n, 5)); + + int* keep; + FRCNN_CUDA_CHECK(cudaMalloc(&keep, sizeof(int) * rpn_pre_nms_top_n)); + + for (int b = 0; b < num_images; b++) { + CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); + CopyScoreKernel << > >( + count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, + score.dptr_, order.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // argsort score, save order + thrust::stable_sort_by_key(thrust::device, + score.dptr_, + score.dptr_ + score.size(0), + order.dptr_, + thrust::greater()); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // Reorder proposals according to order + + dimGrid.x = (rpn_pre_nms_top_n + 
kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals"); + ReorderProposalsKernel << > >( + rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, + order.dptr_, workspace_ordered_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // perform nms + std::vector _keep(workspace_ordered_proposals.size(0)); + int out_size = 0; + _nms(workspace_ordered_proposals, + param_.threshold, + &_keep[0], + &out_size); + + // copy nms result to gpu + FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(), + cudaMemcpyHostToDevice)); + + // copy results after nms + dimGrid.x = (rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput"); + PrepareOutput << > >( + rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b, + out.dptr_ + b * rpn_post_nms_top_n * 5, out_score.dptr_ + b * rpn_post_nms_top_n); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + } + // free temporary memory + FRCNN_CUDA_CHECK(cudaFree(keep)); + FRCNN_CUDA_CHECK(cudaFree(workspace_ordered_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(workspace_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(score_ptr)); + FRCNN_CUDA_CHECK(cudaFree(order_ptr)); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), 3); + + Stream *s = ctx.get_stream(); + Tensor gscores = in_grad[proposal::kClsProb].get(s); + Tensor gbbox = in_grad[proposal::kBBoxPred].get(s); + Tensor ginfo = in_grad[proposal::kImInfo].get(s); + + // can not assume the grad would be zero + Assign(gscores, req[proposal::kClsProb], 0); + Assign(gbbox, req[proposal::kBBoxPred], 0); + Assign(ginfo, req[proposal::kImInfo], 0); + } + + private: + MultiProposalParam param_; +}; // class MultiProposalGPUOp + +template<> +Operator* CreateOp(MultiProposalParam param) { + return new MultiProposalGPUOp(param); +} +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/nn/deformable_im2col.cuh b/src/operator/contrib/nn/deformable_im2col.cuh new file mode 100644 index 000000000000..d9e7b970ca84 --- /dev/null +++ b/src/operator/contrib/nn/deformable_im2col.cuh @@ -0,0 +1,525 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
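+ * Illustrative example (assumed square 3x3 kernel, pad 1, stride 1, dilation 1):
+ * for a (C, H, W) input, deformable_im2col fills a column buffer of shape
+ * (C * 9, H, W); each of the 9 taps of an output location is read at its
+ * regular grid position plus the learned (offset_h, offset_w) taken from the
+ * offset map, using bilinear interpolation of the four surrounding pixels.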
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai + */ + +#ifndef MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ +#define MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ + +#include +#include +#include +#include +#include +#include "../../mxnet_op.h" +#include "../../../common/cuda_utils.h" + + + +namespace mxnet { +namespace op { + +template +__device__ DType deformable_im2col_bilinear(const DType* bottom_data, const int data_width, + const int height, const int width, DType h, DType w) { + + int h_low = floor(h); + int w_low = floor(w); + int h_high; + int w_high; + if (h_low >= height - 1) { + h_high = h_low = height - 1; + h = (DType)h_low; + } + else { + h_high = h_low + 1; + } + + if (w_low >= width - 1) { + w_high = w_low = width - 1; + w = (DType)w_low; + } + else { + w_high = w_low + 1; + } + + DType lh = h - h_low; + DType lw = w - w_low; + DType hh = 1 - lh, hw = 1 - lw; + + DType v1 = bottom_data[h_low * data_width + w_low]; + DType v2 = bottom_data[h_low * data_width + w_high]; + DType v3 = bottom_data[h_high * data_width + w_low]; + DType v4 = bottom_data[h_high * data_width + w_high]; + DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ DType get_gradient_weight(DType argmax_h, DType argmax_w, + const int h, const int w, const int height, const int width) { + + if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) { + //empty + return 0; + } + + argmax_h = max(argmax_h, (DType)0.0f); + argmax_w = max(argmax_w, (DType)0.0f); + + int argmax_h_low = (int)argmax_h; + int argmax_w_low = (int)argmax_w; + int argmax_h_high; + int argmax_w_high; + if (argmax_h_low >= height - 1) { + argmax_h_high = argmax_h_low = height - 1; + argmax_h = (DType)argmax_h_low; + } else { + argmax_h_high = argmax_h_low + 1; + } + if (argmax_w_low >= width - 1) + { + argmax_w_high = argmax_w_low = width - 1; + argmax_w = (DType)argmax_w_low; + } else { + argmax_w_high = argmax_w_low + 1; + } + DType weight = 0; + if (h == argmax_h_low) { + if (w == argmax_w_low) { + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + } else if (w == argmax_w_high) { + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + } + } else if (h == argmax_h_high) { + if (w == argmax_w_low) { + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + } else if (w == argmax_w_high) { + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + } + } + return weight; +} + + +template +__device__ DType get_coordinate_weight(DType argmax_h, DType argmax_w, + const int height, const int width, const DType* im_data, + const int data_width, const int bp_dir) { + + if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) + { + //empty + return 0; + } + + if (argmax_h < 0) argmax_h = 0; + if (argmax_w < 0) argmax_w = 0; + + int argmax_h_low = (int)argmax_h; + int argmax_w_low = (int)argmax_w; + int argmax_h_high; + int argmax_w_high; + if (argmax_h_low >= height - 1) { + argmax_h_high = argmax_h_low = height - 1; + argmax_h = (DType)argmax_h_low; + } else { + argmax_h_high = argmax_h_low + 1; + } + if (argmax_w_low >= width - 1) { + argmax_w_high = argmax_w_low = width - 1; + argmax_w = (DType)argmax_w_low; + } else { + argmax_w_high = argmax_w_low + 1; + } + DType weight = 0; + + if (bp_dir == 0) { + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low 
* data_width + argmax_w_high]; + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + + +/*! + * \brief deformable_im2col gpu kernel. + * DO NOT call this directly. Use wrapper function im2col() instead; + */ +template +__global__ void deformable_im2col_gpu_kernel(const int n, const DType* data_im, const DType* data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* data_col) { + CUDA_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int c_im = (index / width_col) / height_col; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + DType* data_col_ptr = data_col + (c_col * height_col + h_col) * width_col + w_col; + const DType* data_im_ptr = data_im + (c_im * height + h_in) * width + w_in; + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + DType val = static_cast(0); + const DType h_im = h_in + i * dilation_h + offset_h; + const DType w_im = w_in + j * dilation_w + offset_w; + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + const DType map_h = i * dilation_h + offset_h; + const DType map_w = j * dilation_w + offset_w; + const int cur_height = height - h_in; + const int cur_width = width - w_in; + val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + } + *data_col_ptr = val; + data_col_ptr += height_col * width_col; + } + } + } +} + + + + + + +/*!\brief + * cpu function of deformable_im2col algorithm + * \param s device stream + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) 
+ * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param data_col column buffer pointer + */ +template +inline void deformable_im2col(mshadow::Stream* s, + const DType* data_im, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, const TShape& dilation, + const uint32_t deformable_group, DType* data_col) { + // num_axes should be smaller than block size + index_t num_spatial_axes = kernel_shape.ndim(); + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + index_t channel_per_deformable_group = im_shape[1] / deformable_group; + index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim()); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + deformable_im2col_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) + <<::GetStream(s)>>>( + num_kernels, data_im, data_offset, im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], + pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], channel_per_deformable_group, + col_shape[1], col_shape[2], data_col); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_im2col_gpu_kernel); + break; + default: + LOG(FATAL) << "im2col_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +/*! +* \brief deformable_col2im gpu kernel. +* \brief DO NOT call this directly. Use wrapper function deformable_col2im() instead; +*/ +template +__global__ void deformable_col2im_gpu_kernel(const int n, const DType* data_col, const DType* data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* grad_im, OpReqType req) { + CUDA_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col) % kernel_w; + const int i = (index / width_col / height_col / kernel_w) % kernel_h; + const int c = index / width_col / height_col / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; + const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const DType cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1 
+ ) { + int cur_bottom_grad_pos = (c * height + cur_h + dy) * width + cur_w + dx; + DType weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + + +/*!\brief + * gpu function of deformable_col2im algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_im pointer of a image (C, H, W,...) in the image batch + */ +template +inline void deformable_col2im(mshadow::Stream* s, + const DType* data_col, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, + DType* grad_im, OpReqType req) { + index_t num_spatial_axes = kernel_shape.ndim(); + index_t im_size = im_shape.ProdShape(1, im_shape.ndim()); + index_t channel_per_deformable_group = im_shape[1] / deformable_group; + index_t num_kernels = col_shape.ProdShape(0, col_shape.ndim()); + // num_axes should be smaller than block size + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + deformable_col2im_gpu_kernel<<::GetStream(s)>>>( + num_kernels, data_col, data_offset, im_shape[1], im_shape[2], im_shape[3], + kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_im, req); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +/*! + * \brief deformable_col2im_coord gpu kernel. + * \brief DO NOT call this directly. 
Use wrapper function deformable_col2im_coord() instead; + */ +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const DType* data_col, + const DType* data_im, const DType* data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* grad_offset, OpReqType req) { + CUDA_KERNEL_LOOP(index, n) { + DType val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = index / width_col / height_col; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const DType* data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * width_col * height_col; + const DType* data_im_ptr = data_im + deformable_group_index * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { + const int col_pos = ((col_c * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col) % kernel_w; + int i = (col_pos / width_col / height_col / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + DType inv_h = h_in + i * dilation_h + offset_h; + DType inv_w = w_in + j * dilation_w + offset_w; + if (inv_h < 0 || inv_w < 0 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -1; + } + const DType weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +/*!\brief + * gpu function of deformable_col2im_coord algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_offset pointer of the offset (C, H, W,...) 
in the offset batch + */ +template +inline void deformable_col2im_coord(mshadow::Stream* s, + const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape, + const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) { + index_t num_spatial_axes = kernel_shape.ndim(); + index_t num_kernels = col_shape[1] * col_shape[2] * 2 * kernel_shape[0] * kernel_shape[1] * deformable_group; + index_t channel_per_deformable_group = col_shape[0] / deformable_group; + // num_axes should be smaller than block size + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + + deformable_col2im_coord_gpu_kernel << ::GetStream(s) >> >( + num_kernels, data_col, data_im, data_offset, im_shape[1], im_shape[2], im_shape[3], + kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_offset, req); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ diff --git a/src/operator/contrib/nn/deformable_im2col.h b/src/operator/contrib/nn/deformable_im2col.h new file mode 100644 index 000000000000..9d6180034c71 --- /dev/null +++ b/src/operator/contrib/nn/deformable_im2col.h @@ -0,0 +1,157 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai + */ + +#ifndef MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ +#define MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ + +#include +#include +#include +#include +#include "../../mxnet_op.h" + +namespace mxnet { +namespace op { + +/*!\brief + * cpu function of deformable_im2col algorithm + * \param s device stream + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param data_col column buffer pointer + */ +template +inline void deformable_im2col(mshadow::Stream* s, + const DType* data_im, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, const TShape& dilation, + const uint32_t deformable_group, DType* data_col) { + if (2 == kernel_shape.ndim()) { + LOG(FATAL) << "only implemented in GPU"; + } else { + LOG(FATAL) << "not implemented"; + } +} + + +/*!\brief + * cpu function of deformable_col2im algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_im pointer of a image (C, H, W,...) 
in the image batch + */ +template +inline void deformable_col2im(mshadow::Stream* s, + const DType* data_col, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, + DType* grad_im, OpReqType req) { + index_t num_spatial_axes = kernel_shape.ndim(); + LOG(FATAL) << "only implemented in GPU"; +} + + +/*!\brief + * cpu function of deformable_col2im_coord algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_offset pointer of the offset (C, H, W,...) in the offset batch + */ + +template +inline void deformable_col2im_coord(mshadow::Stream* s, + const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape, + const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) { + LOG(FATAL) << "only implemented in GPU"; +} + +} // namespace op +} // namespace mxnet +#ifdef __CUDACC__ +#include "./deformable_im2col.cuh" +#endif +#endif // MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ diff --git a/src/operator/contrib/psroi_pooling-inl.h b/src/operator/contrib/psroi_pooling-inl.h new file mode 100644 index 000000000000..3a3a9c34927c --- /dev/null +++ b/src/operator/contrib/psroi_pooling-inl.h @@ -0,0 +1,222 @@ +/*! + * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling-inl.h + * \brief psroi pooling operator and symbol + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "../mshadow_op.h" +#include "../operator_common.h" + + +namespace mxnet { +namespace op { + +// Declare enumeration of input order to make code more intuitive. +// These enums are only visible within this header +namespace psroipool { +enum PSROIPoolingOpInputs {kData, kBox}; +enum PSROIPoolingOpOutputs {kOut}; +} // psroipool + +struct PSROIPoolingParam : public dmlc::Parameter { + // TShape pooled_size; + float spatial_scale; + int output_dim; + int pooled_size; + int group_size; + DMLC_DECLARE_PARAMETER(PSROIPoolingParam) { + DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) + .describe("Ratio of input feature map height (or w) to raw image height (or w). 
" + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(group_size).set_default(0).describe("fix group size"); + } +}; + +template +class PSROIPoolingOp : public Operator { + public: + explicit PSROIPoolingOp(PSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_data[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[psroipool::kData].get(s); + Tensor bbox = in_data[psroipool::kBox].get(s); + Tensor out = out_data[psroipool::kOut].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + out = -FLT_MAX; + PSROIPoolForward(out, data, bbox, param_.spatial_scale, param_.output_dim, param_.group_size); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_grad[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); + CHECK_NE(req[psroipool::kData], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[psroipool::kBox], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[psroipool::kOut].get(s); + Tensor bbox = in_data[psroipool::kBox].get(s); + Tensor grad_in = in_grad[psroipool::kData].get(s); + Tensor grad_roi = in_grad[psroipool::kBox].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + if (kAddTo == req[psroipool::kData] || kWriteTo == req[psroipool::kData]) { + if (kWriteTo == req[psroipool::kData]) { + grad_in = 0.0f; + } + PSROIPoolBackwardAcc(grad_in, grad_out, bbox, param_.spatial_scale, + param_.output_dim, param_.group_size); + } + if (kWriteTo == req[psroipool::kBox]) { + grad_roi = 0.0f; + } + } + + private: + PSROIPoolingParam param_; +}; // class PSROIPoolingOp + +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(PSROIPoolingParam param, int dtype); + +#if DMLC_USE_CXX11 +class PSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + return {"data", "rois"}; + } + + std::vector ListOutputs() const override { + return {"output"}; + } + + int NumOutputs() const override { + return 1; + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.group_size == 0) { + param_.group_size = param_.pooled_size; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + + // data: [batch_size, c, h, w] + TShape dshape = 
in_shape->at(psroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(psroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + PSROIPoolingProp* psroi_pooling_sym = new PSROIPoolingProp(); + psroi_pooling_sym->param_ = this->param_; + return psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_PSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[psroipool::kOut], in_data[psroipool::kBox]}; + } + + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + PSROIPoolingParam param_; +}; // class PSROIPoolingProp +#endif +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ diff --git a/src/operator/contrib/psroi_pooling.cc b/src/operator/contrib/psroi_pooling.cc new file mode 100644 index 000000000000..ad25aec8eee8 --- /dev/null +++ b/src/operator/contrib/psroi_pooling.cc @@ -0,0 +1,80 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling.cc + * \brief psroi pooling operator + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#include "./psroi_pooling-inl.h" +#include +#include +#include +#include +#include + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace mshadow { +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale_, + const int output_dim_, + const int group_size_) { + // NOT_IMPLEMENTED; + return; +} + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale_, + const int output_dim_, + const int group_size_) { + // NOT_IMPLEMENTED; + return; +} +} // namespace mshadow + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(PSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new PSROIPoolingOp(param); + }); + return op; +} + +Operator *PSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +} + +DMLC_REGISTER_PARAMETER(PSROIPoolingParam); + +MXNET_REGISTER_OP_PROPERTY(_contrib_PSROIPooling, PSROIPoolingProp) +.describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " +"spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " +"by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " +"the number of region bounding boxes after PSROIPooling") +.add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") +.add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " +"[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " +"of designated region of interest. batch_index indicates the index of corresponding image " +"in the input data") +.add_arguments(PSROIPoolingParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu new file mode 100644 index 000000000000..962c874c6d1a --- /dev/null +++ b/src/operator/contrib/psroi_pooling.cu @@ -0,0 +1,260 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling.cu + * \brief psroi pooling operator + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#include "./psroi_pooling-inl.h" +#include +#include +#include +#include +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" + +#define PSROIPOOLING_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) +#define CUDA_KERNEL_LOOP(i, n) \ +for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +namespace mshadow { +namespace cuda { + +template +__global__ void PSROIPoolForwardKernel( + const int count, + const DType* bottom_data, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const DType* bottom_rois, + const int output_dim, + const int group_size, + DType* top_data) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph) * bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + int c = (ctop*group_size + gh)*group_size + gw; + + const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; + DType out_sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h*width + w; + out_sum += offset_bottom_data[bottom_index]; + } + } + + DType bin_area = (hend - hstart)*(wend - wstart); + top_data[index] = is_empty? (DType)0. 
: out_sum/bin_area; + } +} + +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + DType *top_data = out.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int pooled_height = out.size(2); + const int pooled_width = out.size(3); + cudaStream_t stream = Stream::GetStream(out.stream_); + PSROIPoolForwardKernel << > >( + count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, output_dim_, group_size_, top_data); + PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); +} + + +template +__global__ void PSROIPoolBackwardAccKernel( + const int count, + const DType* top_diff, + const int num_rois, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int group_size, + const int output_dim, + DType* bottom_diff, + const DType* bottom_rois) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph)* bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Compute c at bottom + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + int c = (ctop*group_size + gh)*group_size + gw; + DType* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; + DType bin_area = (hend - hstart)*(wend - wstart); + DType diff_val = is_empty ? (DType)0. 
: top_diff[index] / bin_area; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h*width + w; + atomicAdd(offset_bottom_diff + bottom_index, diff_val); + } + } + } +} + + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + // LOG(INFO) << "PSROIPoolBackward"; + const DType *top_diff = out_grad.dptr_; + const DType *bottom_rois = bbox.dptr_; + DType *bottom_diff = in_grad.dptr_; + const int count = out_grad.shape_.Size(); + const int num_rois = bbox.size(0); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int pooled_height = out_grad.size(2); + const int pooled_width = out_grad.size(3); + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + PSROIPoolBackwardAccKernel << > >( + count, top_diff, num_rois, spatial_scale, channels, height, width, + pooled_height, pooled_width, group_size_, output_dim_, bottom_diff, bottom_rois); + PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); +} + +} // namespace cuda + +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + cuda::PSROIPoolForward(out, data, bbox, spatial_scale, output_dim_, group_size_); +} + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + cuda::PSROIPoolBackwardAcc(in_grad, out_grad, bbox, spatial_scale, output_dim_, group_size_); +} + +} // namespace mshadow + + +namespace mxnet { +namespace op { + +template<> +Operator* CreateOp(PSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new PSROIPoolingOp(param); + }); + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 48e44133216b..4b884f523789 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1093,6 +1093,166 @@ def test_unfuse(): check_rnn_consistency(fused, stack) check_rnn_consistency(stack, fused) +def test_psroipooling_with_type(): + np.random.seed(1234) + arg_params = { + 'psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} + + # plain psroipooling + sym = mx.contrib.sym.PSROIPooling(spatial_scale=0.0625, output_dim=2, pooled_size=3, name='psroipool') + ctx_list = [{'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float64, 'psroipool_rois': np.float64}}, + {'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float32, 'psroipool_rois': np.float32}}, + {'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float16, 'psroipool_rois': np.float16}}, + ] + + check_consistency(sym, ctx_list, grad_req={'psroipool_data': 'write', + 'psroipool_rois': 'null'}, arg_params=arg_params) + +def test_deformable_psroipooling_with_type(): + np.random.seed(1234) + arg_params = { + 'deformable_psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} + + # deformable psroipooling + sym = 
mx.contrib.sym.DeformablePSROIPooling(spatial_scale=0.0625, sample_per_part=4, group_size=3, pooled_size=3, + output_dim=2, trans_std=0.1, no_trans=False, name='deformable_psroipool') + + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float64, 'deformable_psroipool_rois': np.float64, + 'deformable_psroipool_trans': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float32, 'deformable_psroipool_rois': np.float32, + 'deformable_psroipool_trans': np.float32}}, + {'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float16, 'deformable_psroipool_rois': np.float16, + 'deformable_psroipool_trans': np.float16}}, + ] + + check_consistency(sym, ctx_list, grad_req={'deformable_psroipool_data': 'write', + 'deformable_psroipool_rois': 'null', + 'deformable_psroipool_trans': 'write'}, arg_params=arg_params) + +def test_deformable_convolution_with_type(): + np.random.seed(1234) + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), name='deformable_conv') + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 10, 10), + 'deformable_conv_offset': (2, 18, 8, 8), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 10, 10), + 'deformable_conv_offset': (2, 18, 8, 8), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 10, 10), + # 'deformable_conv_offset': (2, 18, 8, 8), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_conv_offset': np.float16}}, + ] + # wider tolerance needed for true-fp16 NCHW test above + tol = {np.dtype(np.float16): 0.5, + np.dtype(np.float32): 1e-3, + np.dtype(np.float64): 1e-5, + np.dtype(np.uint8): 0, + np.dtype(np.int32): 0} + check_consistency(sym, ctx_list, tol=tol) + # test ability to turn off training on bias + check_consistency(sym, ctx_list, grad_req={'deformable_conv_data': 'write', + 'deformable_conv_offset': 'write', + 'deformable_conv_weight': 'write', + 'deformable_conv_bias': 'null'}, tol=tol) +def test_deformable_convolution_options(): + # 2D convolution + + # Pad > 0 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 7, 7), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 7, 7), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_offset': (2, 18, 7, 7), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), pad=(1,1), 
name='deformable_conv') + check_consistency(sym, ctx_list) + + # Stride > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 18, 3, 3), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), stride=(2,2), name='deformable_conv') + check_consistency(sym, ctx_list) + + # Dilate > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 18, 3, 3), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), dilate=(2,2), name='deformable_conv') + check_consistency(sym, ctx_list) + + # Deformable group > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 36, 5, 5), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 36, 5, 5), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 36, 5, 5), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=4, kernel=(3,3), num_deformable_group=2, + name='deformable_conv') + check_consistency(sym, ctx_list) def test_residual_fused(): cell = mx.rnn.ResidualCell( mx.rnn.FusedRNNCell(50, num_layers=3, mode='lstm', @@ -1144,3 +1304,8 @@ def test_residual_fused(): test_take_with_type() test_bilinear_sampler_with_type() test_grid_generator_with_type() + test_psroipooling_with_type() + test_deformable_psroipooling_with_type() + test_deformable_convolution_options() + test_deformable_convolution_with_type() + diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 79795e9386b6..2cdac27c3edb 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -2,6 +2,7 @@ import numpy as np import mxnet as mx import random +import itertools from numpy.testing import assert_allclose from mxnet.test_utils import * @@ -3160,6 +3161,101 @@ def create_operator(self, ctx, shapes, dtypes):
check_numeric_gradient(op, [x]) +def test_psroipooling(): + for num_rois in [1, 2]: + for num_classes, num_group in itertools.product([2, 3], [2, 3]): + for image_height, image_width in itertools.product([168, 224], [168, 224]): + for grad_nodes in [['im_data']]: + spatial_scale = 0.0625 + feat_height = np.int(image_height * spatial_scale) + feat_width = np.int(image_width * spatial_scale) + im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) + rois_data = np.zeros([num_rois, 5]) + rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) + rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1)) + + im_data_var = mx.symbol.Variable(name="im_data") + rois_data_var = mx.symbol.Variable(name="rois_data") + op = mx.contrib.sym.PSROIPooling(data=im_data_var, rois=rois_data_var, spatial_scale=spatial_scale, + group_size=num_group, pooled_size=num_group, + output_dim=num_classes, name='test_op') + rtol, atol = 1e-2, 1e-4 + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, rois_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + +def test_deformable_convolution(): + for num_batch in [1, 2]: + for num_channel_data, num_deformable_group in itertools.product([4, 8], [1, 2]): + for input_height, input_width in itertools.product([5, 6], [5, 6]): + for dilate in [(1, 1), (2, 2)]: + for grad_nodes in [['im_data'], ['offset_data']]: + output_height = input_height + output_width = input_width + im_data = np.random.rand(num_batch, num_channel_data, input_height, input_width) + offset_data = \ + np.random.rand(num_batch, num_deformable_group * 3 * 3 * 2, output_height, output_width)\ + * 0.8 + 0.1 + + weight = np.random.normal(0, 0.001, (num_channel_data, num_channel_data, 3, 3)) + bias = np.zeros(num_channel_data) + + im_data_var = mx.symbol.Variable(name="im_data") + offset_data_var = mx.symbol.Variable(name="offset_data") + weight_var = mx.symbol.Variable(name="weight") + bias_var = mx.symbol.Variable(name="bias") + op = mx.contrib.sym.DeformableConvolution(name='test_op', data=im_data_var, + offset=offset_data_var, + weight=weight_var, bias=bias_var, + num_filter=num_channel_data, pad=dilate, + kernel=(3, 3), stride=(1, 1), dilate=dilate, + num_deformable_group=num_deformable_group) + if grad_nodes[0] == 'offset_data': + # wider tolerance needed for coordinate differential + rtol, atol = 1.0, 1e-2 + else: + rtol, atol = 0.05, 1e-4 + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + + +def test_deformable_psroipooling(): + for num_rois in [1, 2]: + for num_classes, num_group in itertools.product([2, 3], [2, 3]): + for image_height, image_width in itertools.product([168, 224], [168, 224]): + for grad_nodes in [['im_data'], ['offset_data']]: + spatial_scale = 0.0625 + feat_height = np.int(image_height * spatial_scale) + feat_width = np.int(image_width * spatial_scale) + im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) + rois_data = np.zeros([num_rois, 5]) + rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) + rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1)) + offset_data = np.random.rand(num_rois, 2*num_classes, num_group, num_group) * 0.1 + + im_data_var = 
mx.symbol.Variable(name="im_data") + rois_data_var = mx.symbol.Variable(name="rois_data") + offset_data_var = mx.symbol.Variable(name="offset_data") + op = mx.contrib.sym.DeformablePSROIPooling(data=im_data_var, rois=rois_data_var, + trans=offset_data_var, spatial_scale=spatial_scale, + sample_per_part=4, group_size=num_group, + pooled_size=num_group, output_dim=num_classes, + trans_std=0.1, no_trans=False, name='test_op') + if grad_nodes[0] == 'offset_data': + # wider tolerance needed for coordinate differential + rtol, atol = 1.0, 1e-2 + else: + rtol, atol = 1e-2, 1e-4 + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + + + def test_laop(): # Temporarily disabled until lapack is enabled by default return @@ -3409,7 +3505,7 @@ def test_laop(): if grad_check == 1: check_numeric_gradient(test_sumlogdiag, [a]) - + if __name__ == '__main__': import nose nose.runmodule()
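For reviewers who want to try the patch by hand, the following minimal sketch (not part of the diff) runs the new PSROIPooling contrib operator forward once and checks the output shape implied by its InferShape rule, [num_rois, output_dim, pooled_size, pooled_size]. It assumes an MXNet build that contains these contrib operators and a GPU context, since only the GPU kernels are implemented here; the input names im_data/rois_data and the shapes simply mirror the tests above.

# Minimal forward-only sketch for the new PSROIPooling contrib operator (illustrative, not part of the patch).
# Assumes a GPU build of MXNet with the contrib operators from this change.
import numpy as np
import mxnet as mx

data = mx.symbol.Variable(name='im_data')
rois = mx.symbol.Variable(name='rois_data')
pool = mx.contrib.sym.PSROIPooling(data=data, rois=rois, spatial_scale=0.0625,
                                   group_size=3, pooled_size=3, output_dim=2, name='psroipool')

im_data = np.random.rand(1, 2 * 3 * 3, 14, 14)       # (N, output_dim * group_size^2, H, W)
rois_data = np.array([[0, 10, 22, 161, 173],          # [batch_index, x1, y1, x2, y2]
                      [0, 20, 15, 154, 160]])

exe = pool.simple_bind(ctx=mx.gpu(0), im_data=im_data.shape, rois_data=rois_data.shape)
exe.arg_dict['im_data'][:] = im_data
exe.arg_dict['rois_data'][:] = rois_data
out = exe.forward(is_train=False)[0].asnumpy()
assert out.shape == (2, 2, 3, 3)                      # [num_rois, output_dim, pooled_size, pooled_size]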
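A second sketch, under the same assumptions, shows where the offset shapes used in the tests come from: the offset input of DeformableConvolution carries 2 * kernel_h * kernel_w * num_deformable_group channels and has the spatial size of the convolution output, which is why a (2, 2, 10, 10) input with a 3x3 kernel and no padding pairs with a (2, 18, 8, 8) offset.

# Illustrative shape check for the new DeformableConvolution contrib operator (not part of the patch).
import mxnet as mx

data = mx.symbol.Variable(name='data')
offset = mx.symbol.Variable(name='offset')
conv = mx.contrib.sym.DeformableConvolution(data=data, offset=offset, num_filter=3,
                                            kernel=(3, 3), num_deformable_group=1,
                                            name='deformable_conv')
# offset channels = 2 * 3 * 3 * num_deformable_group = 18; spatial dims match the conv output (8, 8)
arg_shapes, out_shapes, _ = conv.infer_shape(data=(2, 2, 10, 10), offset=(2, 18, 8, 8))
print(out_shapes)   # [(2, 3, 8, 8)]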