From 6a43ec4bd730d677a45c9dcfb9300cec473920be Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Wed, 17 May 2017 16:10:30 +0800 Subject: [PATCH 01/18] Add operators for Deformable ConvNets/FCIS/DFF --- CONTRIBUTORS.md | 6 +- src/operator/contrib/channel_operator-inl.h | 417 ++++++++++++ src/operator/contrib/channel_operator.cc | 96 +++ src/operator/contrib/channel_operator.cu | 303 +++++++++ .../contrib/deformable_convolution-inl.h | 487 +++++++++++++++ .../contrib/deformable_convolution.cc | 89 +++ .../contrib/deformable_convolution.cu | 29 + .../contrib/deformable_psroi_pooling-inl.h | 280 +++++++++ .../contrib/deformable_psroi_pooling.cc | 96 +++ .../contrib/deformable_psroi_pooling.cu | 402 ++++++++++++ src/operator/contrib/multi_proposal-inl.h | 301 +++++++++ src/operator/contrib/multi_proposal.cc | 63 ++ src/operator/contrib/multi_proposal.cu | 591 ++++++++++++++++++ src/operator/contrib/nn/deformable_im2col.cuh | 525 ++++++++++++++++ src/operator/contrib/nn/deformable_im2col.h | 157 +++++ src/operator/contrib/psroi_pooling-inl.h | 222 +++++++ src/operator/contrib/psroi_pooling.cc | 80 +++ src/operator/contrib/psroi_pooling.cu | 260 ++++++++ 18 files changed, 4403 insertions(+), 1 deletion(-) create mode 100644 src/operator/contrib/channel_operator-inl.h create mode 100644 src/operator/contrib/channel_operator.cc create mode 100644 src/operator/contrib/channel_operator.cu create mode 100644 src/operator/contrib/deformable_convolution-inl.h create mode 100644 src/operator/contrib/deformable_convolution.cc create mode 100644 src/operator/contrib/deformable_convolution.cu create mode 100644 src/operator/contrib/deformable_psroi_pooling-inl.h create mode 100644 src/operator/contrib/deformable_psroi_pooling.cc create mode 100644 src/operator/contrib/deformable_psroi_pooling.cu create mode 100644 src/operator/contrib/multi_proposal-inl.h create mode 100644 src/operator/contrib/multi_proposal.cc create mode 100644 src/operator/contrib/multi_proposal.cu create mode 100644 src/operator/contrib/nn/deformable_im2col.cuh create mode 100644 src/operator/contrib/nn/deformable_im2col.h create mode 100644 src/operator/contrib/psroi_pooling-inl.h create mode 100644 src/operator/contrib/psroi_pooling.cc create mode 100644 src/operator/contrib/psroi_pooling.cu diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3de0e1b61638..4ce449c1c130 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -135,4 +135,8 @@ List of Contributors * [David Salinas](https://github.com/geoalgo) * [Asmus Hetzel](https://github.com/asmushetzel) * [Roshani Nagmote](https://github.com/Roshrini) -* [Chetan Khatri](https://github.com/chetkhatri/) +* [Yuwen Xiong](https://github.com/Orpine/) +* [Haozhi Qi](https://github.com/Oh233/) +* [Yi Li](https://github.com/liyi14/) +* [Guodong Zhang](https://github.com/gd-zhang/) +* [Xizhou Zhu](https://github.com/einsiedler0408/) diff --git a/src/operator/contrib/channel_operator-inl.h b/src/operator/contrib/channel_operator-inl.h new file mode 100644 index 000000000000..a5cd9a9172e4 --- /dev/null +++ b/src/operator/contrib/channel_operator-inl.h @@ -0,0 +1,417 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file channel_operator-inl.h + * \brief + * \author Haozhi Qi, Yi Li, Guodong Zhang, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ +#define MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" + +namespace mxnet { + namespace op { + // Declare enumeration of input order to make code more intuitive. + // // These enums are only visible within this header + namespace channel_operator { + enum ChannelOperatorOpInputs { kData, kPick }; + enum ChannelOperatorOpOutputs { kOut, kTemp }; + enum ChannelOperatorOpType { kGroupMax, kGroupPick, kGroupSoftmax }; + enum ChannelOperatorOpPickType { kLabel, kScore }; + } // activation + + struct ChannelOperatorParam : public dmlc::Parameter { + // use int for enumeration + int op_type; + int group; + int pick_type; + DMLC_DECLARE_PARAMETER(ChannelOperatorParam) { + DMLC_DECLARE_FIELD(op_type) + .add_enum("Group_Max", channel_operator::kGroupMax) + .add_enum("Group_Pick", channel_operator::kGroupPick) + .add_enum("Group_Softmax", channel_operator::kGroupSoftmax) + .describe("Channel operator to be applied."); + DMLC_DECLARE_FIELD(group).describe("group size"); + DMLC_DECLARE_FIELD(pick_type) + .add_enum("Label_Pick", channel_operator::kLabel) + .add_enum("Score_Pick", channel_operator::kScore) + .set_default(channel_operator::kLabel) + .describe("pick type"); + } + }; + + /** + * \brief This is the implementation of channel operator. + * \tparam xpu The device that the op will be executed on. + */ + template + class ChannelOperatorOp : public Operator { + public: + explicit ChannelOperatorOp(ChannelOperatorParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + size_t in_expected; + size_t out_expected; + if (param_.op_type == channel_operator::kGroupMax) { + in_expected = 1; + out_expected = 2; + } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + in_expected = 1; + out_expected = 1; + } + else if (param_.op_type == channel_operator::kGroupPick) { + in_expected = 2; + if (param_.pick_type == channel_operator::kLabel) + out_expected = 1; + else + out_expected = 2; + } + else { + LOG(FATAL) << "No that operation type."; + } + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + Stream *s = ctx.get_stream(); + if (param_.op_type == channel_operator::kGroupSoftmax) { + int total_size = in_data[channel_operator::kData].Size(); + int batch_size = in_data[channel_operator::kData].shape_[0]; + int channel_num = in_data[channel_operator::kData].shape_[1]; + int rest_size = total_size / (batch_size * channel_num); + const Shape<3> data_shape = Shape3(batch_size*param_.group, channel_num / param_.group, rest_size); + + Tensor data = in_data[channel_operator::kData].get_with_shape(data_shape, s); + Tensor out = out_data[channel_operator::kOut].get_with_shape(data_shape, s); + Softmax(out, data); + } + else if (param_.op_type == channel_operator::kGroupMax) { + Tensor data = in_data[channel_operator::kData].get(s); + Tensor out = out_data[channel_operator::kOut].get(s); + Tensor max_idx = out_data[channel_operator::kTemp].get(s); + CHECK_EQ(data.CheckContiguous(), true); + 
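+        // Group_Max splits the channel axis into `group` equal groups and, at every
+        // spatial position, keeps the maximum response within each group; max_idx
+        // records the index of the winning channel so the backward pass can route
+        // the gradient (see GroupMaxForwardKernel in channel_operator.cu).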
CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(max_idx.CheckContiguous(), true); + + GroupMaxForward(out, data, max_idx, param_.group); + } + else if (param_.op_type == channel_operator::kGroupPick) { + Tensor data = in_data[channel_operator::kData].get(s); + Tensor out = out_data[channel_operator::kOut].get(s); + Tensor pick_idx = in_data[channel_operator::kPick].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(pick_idx.CheckContiguous(), true); + + if (param_.pick_type == channel_operator::kScore) { + Tensor argmax_data = out_data[channel_operator::kTemp].get(s); + GetMaxIdx(pick_idx, argmax_data, param_.group); + GroupPickForward(out, data, argmax_data, param_.group); + } + else { + GroupPickForward(out, data, pick_idx, param_.group); + } + } + else { + LOG(FATAL) << "No that operation type."; + } + + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + + size_t in_expected; + size_t out_expected; + if (param_.op_type == channel_operator::kGroupMax) { + in_expected = 1; + out_expected = 2; + } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + in_expected = 1; + out_expected = 1; + } + else if (param_.op_type == channel_operator::kGroupPick) { + in_expected = 2; + if (param_.pick_type == channel_operator::kLabel) + out_expected = 1; + else + out_expected = 2; + } + else { + LOG(FATAL) << "No that operation type."; + } + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + Stream *s = ctx.get_stream(); + + if (param_.op_type == channel_operator::kGroupMax) { + Tensor grad_out = out_grad[channel_operator::kOut].get(s); + Tensor max_idx = out_data[channel_operator::kTemp].get(s); + Tensor grad_in = in_grad[channel_operator::kData].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(max_idx.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[channel_operator::kData], 0); + GroupMaxBackwardAcc(grad_in, grad_out, max_idx, param_.group); + } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + LOG(FATAL) << "Not Implemented."; + } + else if (param_.op_type == channel_operator::kGroupPick) { + Tensor grad_out = out_grad[channel_operator::kOut].get(s); + Tensor pick_idx = in_data[channel_operator::kPick].get(s); + Tensor grad_in = in_grad[channel_operator::kData].get(s); + Tensor pick_diff = in_grad[channel_operator::kPick].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(pick_idx.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[channel_operator::kData], 0); + Assign(pick_diff, req[channel_operator::kPick], 0); + if (param_.pick_type == channel_operator::kScore) { + LOG(FATAL) << "Not Implemented."; + } + else { + GroupPickBackwardAcc(grad_in, grad_out, pick_idx, param_.group); + } + + } + else { + LOG(FATAL) << "No that operation type."; + } + + } + private: + ChannelOperatorParam param_; + }; // class ChannelOperatorOp + + // Decalre Factory function, used for dispatch specialization + template + Operator* CreateOp(ChannelOperatorParam type, int dtype); + +#if DMLC_USE_CXX11 + class ChannelOperatorProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.op_type == 
channel_operator::kGroupMax || + param_.op_type == channel_operator::kGroupSoftmax) { + return{ "data" }; + } + else if (param_.op_type == channel_operator::kGroupPick) { + return{ "data", "pick_idx" }; + } + else { + LOG(FATAL) << "No that operation type."; + return{}; + } + } + + std::vector ListOutputs() const override { + if (param_.op_type == channel_operator::kGroupSoftmax) { + return{ "output" }; + } + else if (param_.op_type == channel_operator::kGroupPick) { + if (param_.pick_type == channel_operator::kLabel) + return{ "output" }; + else + return{ "output", "argmax_data" }; + } + else if (param_.op_type == channel_operator::kGroupMax) { + return{ "output", "max_idx" }; + } + else { + LOG(FATAL) << "No that operation type."; + return{}; + } + } + + int NumOutputs() const override { + if (param_.op_type == channel_operator::kGroupSoftmax) { + return 1; + } + else if (param_.op_type == channel_operator::kGroupPick) { + if (param_.pick_type == channel_operator::kLabel) + return 1; + else + return 2; + } + else if (param_.op_type == channel_operator::kGroupMax) { + return 2; + } + else { + LOG(FATAL) << "No that operation type."; + return NULL; + } + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + + + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(channel_operator::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + if (param_.op_type == channel_operator::kGroupMax) { + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + // out: [num_rois, group, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], param_.group, dshape[2], dshape[3])); + out_shape->push_back( + Shape4(dshape[0], param_.group, dshape[2], dshape[3])); + return true; + } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + // out: [num_rois, c, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], dshape[1], dshape[2], dshape[3])); + return true; + } + else if (param_.op_type == channel_operator::kGroupPick) { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, pick_idx]"; + // out: [num_rois, c/group, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], dshape[1] / param_.group, dshape[2], dshape[3])); + if (param_.pick_type == channel_operator::kScore) { + out_shape->push_back( + Shape4(dshape[0], 1, 1, 1)); + } + return true; + } + else { + LOG(FATAL) << "No that operation type."; + return false; + } + + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "Input must have specified type"; + + if (param_.op_type == channel_operator::kGroupMax) { + CHECK_EQ(in_type->size(), 1); + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + CHECK_EQ(in_type->size(), 1); + out_type->clear(); + out_type->push_back(dtype); + return true; + } + else if (param_.op_type == channel_operator::kGroupPick) { + CHECK_EQ(in_type->size(), 2); + out_type->clear(); + out_type->push_back(dtype); + if (param_.pick_type == channel_operator::kScore) { + 
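+          // Score_Pick also exposes the argmax_data output (see ListOutputs and
+          // InferShape), so a second dtype entry is pushed for it here.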
out_type->push_back(dtype); + } + return true; + } + else { + LOG(FATAL) << "No that operation type."; + return false; + } + + } + + OperatorProperty* Copy() const override { + ChannelOperatorProp* channel_operator_sym = new ChannelOperatorProp(); + channel_operator_sym->param_ = this->param_; + return channel_operator_sym; + } + + std::string TypeString() const override { + return "_contrib_ChannelOperator"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.op_type == channel_operator::kGroupMax) { + return{ out_grad[channel_operator::kOut], + out_data[channel_operator::kTemp] }; + } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + return{ out_grad[channel_operator::kOut], + out_data[channel_operator::kOut] }; + } + else if (param_.op_type == channel_operator::kGroupPick) { + return{ out_grad[channel_operator::kOut], + in_data[channel_operator::kPick] }; + } + else { + LOG(FATAL) << "No that operation type."; + return{}; + } + } + + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + ChannelOperatorParam param_; + }; // class PSROIPoolingAlignProp +#endif + } // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ \ No newline at end of file diff --git a/src/operator/contrib/channel_operator.cc b/src/operator/contrib/channel_operator.cc new file mode 100644 index 000000000000..0c9c82fa554d --- /dev/null +++ b/src/operator/contrib/channel_operator.cc @@ -0,0 +1,96 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file channel_operator.cc + * \brief + * \author Haozhi Qi, Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./channel_operator-inl.h" +#include +#include +#include +#include +#include + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace mshadow { + template + inline void GroupMaxForward(const Tensor &out, + const Tensor &data, + const Tensor &max_idx, + const int group) { + // NOT_IMPLEMENTED; + return; + } + template + inline void GroupPickForward(const Tensor &out, + const Tensor &data, + const Tensor &pick_idx, + const int group) { + // NOT_IMPLEMENTED; + return; + } + + template + inline void GroupMaxBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &max_idx, + const int group) { + // NOT_IMPLEMENTED; + return; + } + + template + inline void GroupPickBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &pick_idx, + const int group) { + // NOT_IMPLEMENTED; + return; + } + + template + inline void GetMaxIdx(const Tensor &pick_score, + const Tensor &argmax, + const int group) { + // NOT_IMPLEMENTED; + return; + } +} // namespace mshadow + +namespace mxnet { + namespace op { + + template<> + Operator *CreateOp(ChannelOperatorParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ChannelOperatorOp(param); + }); + return op; + } + + Operator *ChannelOperatorProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); + } + + DMLC_REGISTER_PARAMETER(ChannelOperatorParam); + + MXNET_REGISTER_OP_PROPERTY(_contrib_ChannelOperator, ChannelOperatorProp) + .describe("Performs channel operation on inputs, including GroupMax, GroupSoftmax, GroupPick " + "and ChannelPick. This layer is designed for FCIS ") + .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") + .add_argument("pick_idx", "Symbol", "In GroupPick or ChannelPick mode, pick_idx is used to" + "pick specific group or channel") + .add_arguments(ChannelOperatorParam::__FIELDS__()); + } // namespace op +} // namespace mxnet \ No newline at end of file diff --git a/src/operator/contrib/channel_operator.cu b/src/operator/contrib/channel_operator.cu new file mode 100644 index 000000000000..bbf9ad2b43db --- /dev/null +++ b/src/operator/contrib/channel_operator.cu @@ -0,0 +1,303 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file channel_operator.cu + * \brief + * \author Haozhi Qi, Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./channel_operator-inl.h" +#include +#include +#include +#include +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" + +#define ChannelOperator_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) +#define CUDA_KERNEL_LOOP(i, n) \ +for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +namespace mshadow { + namespace cuda { + template + __global__ void GroupMaxForwardKernel( + const int count, + const DType* bottom_data, + const int channels, + const int group, + const int channels_in_group, + const int spatial_dim, + DType* top_data, + DType* max_idx_data) { + CUDA_KERNEL_LOOP(index, count) { + + int s = index % spatial_dim; + int g = (index / spatial_dim) % group; + int n = index / spatial_dim / group; + + DType max_val = -FLT_MAX; + int max_idx = -1; + for (int i = 0; i < channels_in_group; ++i) { + int c = g*channels_in_group + i; + int bottom_index = (n*channels + c)*spatial_dim + s; + if (bottom_data[bottom_index]>max_val) { + max_val = bottom_data[bottom_index]; + max_idx = c; + } + } + top_data[index] = max_val; + max_idx_data[index] = max_idx; + } + } + + template + __global__ void GroupPickForwardKernel( + const int count, + const DType* bottom_data, + const int channels, + const int group, + const int channels_in_group, + const int spatial_dim, + DType* top_data, + const DType* pick_idx_data) { + CUDA_KERNEL_LOOP(index, count) { + + int s = index % spatial_dim; + int c = (index / spatial_dim) % channels_in_group; + int n = index / spatial_dim / channels_in_group; + int g = pick_idx_data[n]; + int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; + + top_data[index] = (g < group && g >= 0) ? 
bottom_data[bottom_index] : DType(0); + } + } + + template + inline void GroupMaxForward(const Tensor &out, + const Tensor &data, + const Tensor &max_idx, + const int group) { + // LOG(INFO) << "GroupMaxForward"; + const DType *bottom_data = data.dptr_; + DType *top_data = out.dptr_; + DType *max_idx_data = max_idx.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int spatial_dim = height * width; + const int channels_in_group = channels / group; + cudaStream_t stream = Stream::GetStream(out.stream_); + GroupMaxForwardKernel << > >( + count, bottom_data, channels, group, + channels_in_group, spatial_dim, top_data, max_idx_data); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } + + template + inline void GroupPickForward(const Tensor &out, + const Tensor &data, + const Tensor &pick_idx, + const int group) { + // LOG(INFO) << "GroupPickForward"; + const DType *bottom_data = data.dptr_; + DType *top_data = out.dptr_; + const DType *pick_idx_data = pick_idx.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int spatial_dim = height * width; + const int channels_in_group = channels / group; + cudaStream_t stream = Stream::GetStream(out.stream_); + GroupPickForwardKernel << > >( + count, bottom_data, channels, group, + channels_in_group, spatial_dim, top_data, pick_idx_data); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } + + + template + __global__ void GroupMaxBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* max_idx_data, + const int channels, + const int group, + const int spatial_dim, + DType* bottom_diff) { + CUDA_KERNEL_LOOP(index, count) { + int s = index % spatial_dim; + int n = index / spatial_dim / group; + + int c = max_idx_data[index]; + int bottom_index = (n*channels + c)*spatial_dim + s; + bottom_diff[bottom_index] = top_diff[index]; + } + } + + template + __global__ void GroupPickBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* pick_idx_data, + const int channels, + const int group, + const int channels_in_group, + const int spatial_dim, + DType* bottom_diff) { + CUDA_KERNEL_LOOP(index, count) { + int s = index % spatial_dim; + int c = (index / spatial_dim) % channels_in_group; + int n = index / spatial_dim / channels_in_group; + int g = pick_idx_data[n]; + + int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; + bottom_diff[bottom_index] = (g < group && g >= 0) ? 
top_diff[index] : DType(0); + } + } + + + template + inline void GroupMaxBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &max_idx, + const int group) { + // LOG(INFO) << "GroupMaxBackward"; + const DType *top_diff = out_grad.dptr_; + DType *bottom_diff = in_grad.dptr_; + const DType *max_idx_data = max_idx.dptr_; + const int count = out_grad.shape_.Size(); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int spatial_dim = height * width; + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + GroupMaxBackwardAccKernel << > >( + count, top_diff, max_idx_data, channels, group, spatial_dim, bottom_diff); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } + + template + inline void GroupPickBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &pick_idx, + const int group) { + // LOG(INFO) << "GroupPickBackward"; + const DType *top_diff = out_grad.dptr_; + DType *bottom_diff = in_grad.dptr_; + const DType *pick_idx_data = pick_idx.dptr_; + const int count = out_grad.shape_.Size(); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int spatial_dim = height * width; + const int channels_in_group = channels / group; + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + GroupPickBackwardAccKernel << > >( + count, top_diff, pick_idx_data, channels, group, + channels_in_group, spatial_dim, bottom_diff); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } + // GetMaxIdx + template + __global__ void GetMaxIdxKernel( + const int count, + const DType* pick_score_data, + DType* argmax_data, + const int group) { + CUDA_KERNEL_LOOP(index, count) { + const DType* offset_pick_score_data = pick_score_data + index*group; + int max_idx = -1; + DType max_val = -FLT_MAX; + for (int i = 1; i < group; ++i) { + max_idx = offset_pick_score_data[i] > max_val ? i : max_idx; + max_val = offset_pick_score_data[i] > max_val ? 
offset_pick_score_data[i] : max_val; + } + argmax_data[index] = static_cast(max_idx); + } + } + + template + inline void GetMaxIdx(const Tensor &pick_score, + const Tensor &argmax, + const int group) { + // LOG(INFO) << "GroupPickBackward"; + const DType *pick_score_data = pick_score.dptr_; + DType *argmax_data = argmax.dptr_; + const int count = argmax.shape_.Size(); + + cudaStream_t stream = Stream::GetStream(argmax.stream_); + GetMaxIdxKernel << > >( + count, pick_score_data, argmax_data, group); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } + } // namespace cuda + + template + inline void GroupMaxForward(const Tensor &out, + const Tensor &data, + const Tensor &max_idx, + const int group) { + cuda::GroupMaxForward(out, data, max_idx, group); + } + template + inline void GroupPickForward(const Tensor &out, + const Tensor &data, + const Tensor &pick_idx, + const int group) { + cuda::GroupPickForward(out, data, pick_idx, group); + } + + template + inline void GroupMaxBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &max_idx, + const int group) { + cuda::GroupMaxBackwardAcc(in_grad, out_grad, max_idx, group); + } + + template + inline void GroupPickBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &pick_idx, + const int group) { + cuda::GroupPickBackwardAcc(in_grad, out_grad, pick_idx, group); + } + + template + inline void GetMaxIdx(const Tensor &pick_score, + const Tensor &argmax, + const int group) { + cuda::GetMaxIdx(pick_score, argmax, group); + } + +} // namespace mshadow + + +namespace mxnet { + namespace op { + + template<> + Operator* CreateOp(ChannelOperatorParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ChannelOperatorOp(param); + }); + return op; + } + + } // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h new file mode 100644 index 000000000000..ccc6bb36e72b --- /dev/null +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -0,0 +1,487 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution-inl.h + * \brief + * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_DEFORMABLE_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_DEFORMABLE_CONVOLUTION_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../nn/im2col.h" +#include "./nn/deformable_im2col.h" + + +namespace mxnet { + namespace op { + + namespace conv { + enum DeformableConvolutionOpInputs { kData, kOffset, kWeight, kBias }; + enum DeformableConvolutionOpOutputs { kOut }; + enum DeformableConvolutionOpResource { kTempSpace }; + } + + struct DeformableConvolutionParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape dilate; + TShape pad; + uint32_t num_filter; + uint32_t num_group; + uint32_t num_deformable_group; + uint64_t workspace; + bool no_bias; + dmlc::optional layout; + DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { + DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .describe("convolution stride: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(dilate).set_default(TShape()) + .describe("convolution dilate: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for convolution: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) + .describe("convolution filter(channel) number"); + DMLC_DECLARE_FIELD(num_group).set_default(1) + .describe("Number of group partitions."); + DMLC_DECLARE_FIELD(num_deformable_group).set_default(1) + .describe("Number of deformable group partitions."); + DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) + .describe("Maximum temperal workspace allowed for convolution (MB)."); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(layout) + .add_enum("NCW", mshadow::kNCW) + .add_enum("NCHW", mshadow::kNCHW) + .add_enum("NCDHW", mshadow::kNCDHW) + .set_default(dmlc::optional()) + .describe("Set layout for input, output and weight. Empty for\n " + "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); + } + }; + + template + class DeformableConvolutionOp : public Operator { + public: + explicit DeformableConvolutionOp(DeformableConvolutionParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. + param_.workspace = (param_.workspace << 20) / sizeof(DType); + CHECK(param_.layout.value() == mshadow::kNCW || + param_.layout.value() == mshadow::kNCHW || + param_.layout.value() == mshadow::kNCDHW) + << "Only support NCW, NCHW and NCDHW layout"; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[conv::kOut], kWriteTo); + size_t expected = param_.no_bias ? 
3 : 4; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + LayerSetUp(in_data[conv::kData].shape_, in_data[conv::kOffset].shape_, out_data[conv::kOut].shape_); + Stream* s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_data[0].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + index_t M = conv_out_channels_ / group_; + index_t N = conv_out_spatial_dim_; + index_t K = kernel_dim_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, M, K), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, K, N), s); + Tensor output_4d = out_data[conv::kOut].get_with_shape( + Shape4(num_, group_, M, N), s); + for (index_t n = 0; n < num_; ++n) { + // transform image to col_buffer in order to use gemm + deformable_im2col(s, in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + col_buffer.dptr()); + Tensor output_3d = output_4d[n]; + for (index_t g = 0; g < group_; ++g) { + ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); + } + } + if (bias_term_) { + Tensor bias = in_data[conv::kBias].get(s); + Tensor output_3d = out_data[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + // has bias term, broadcast it to the same shape of output_3d in channel dim + output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + size_t expected = param_.no_bias == 0 ? 
4 : 3; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); + LayerSetUp(in_grad[conv::kData].shape_, in_grad[conv::kOffset].shape_, out_grad[conv::kOut].shape_); + Stream *s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + // For computing dLoss/d(in_data[kData]) + index_t M = kernel_dim_; + index_t N = conv_out_spatial_dim_; + index_t K = conv_out_channels_ / group_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + Tensor out_grad_4d = out_grad[conv::kOut].get_with_shape( + Shape4(num_, group_, K, N), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, M, N), s); + // For computing dLoss/dWeight + Tensor dweight_3d = in_grad[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + + Tensor data_grad = in_grad[conv::kData].FlatTo1D(s); + data_grad = 0; + + + for (index_t n = 0; n < num_; ++n) { + Tensor out_grad_3d = out_grad_4d[n]; + for (index_t g = 0; g < group_; ++g) { + col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); + } + + // gradient w.r.t. input coordinate data + deformable_col2im_coord(s, col_buffer.dptr(), + in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kOffset].dptr() + n*input_offset_dim_, + req[conv::kData]); + + // gradient w.r.t. input data + deformable_col2im(s, col_buffer.dptr(), + in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kData].dptr() + n*input_dim_, + req[conv::kData]); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + im2col(s, in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + col_buffer.dptr()); + for (index_t g = 0; g < group_; ++g) { + if (0 == n) { + ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], + dot(out_grad_3d[g], col_buffer_3d[g].T())); + } + else { + dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); + } + } + } + + // gradient w.r.t bias + if (bias_term_) { + Tensor dbias = in_grad[conv::kBias].get(s); + Tensor dout = out_grad[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); + } + + } + + private: + void LayerSetUp(const TShape& ishape, const TShape& offset_shape, const TShape& oshape) { + channel_axis_ = 1; // hard code channel axis + const index_t first_spatial_axis = channel_axis_ + 1; + const index_t num_axes = param_.kernel.ndim() + 2; + num_spatial_axes_ = num_axes - first_spatial_axis; + is_1x1_ = true; + for (index_t i = 0; i < param_.kernel.ndim(); ++i) { + is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; + if (!is_1x1_) break; + } + + // batch size + num_ = ishape[0]; + // number of input channels + channels_ = ishape[1]; + group_ = param_.num_group; + conv_out_channels_ = param_.num_filter; + conv_in_channels_ = channels_; + bias_term_ = !param_.no_bias; + kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; + conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); + col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // size of the column buffer used for storing im2col-ed pixels + col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; + // input/output image size (#channels * height * width) + input_dim_ = ishape.ProdShape(1, ishape.ndim()); + input_offset_dim_ = ishape.ProdShape(1, offset_shape.ndim()); + output_dim_ = oshape.ProdShape(1, oshape.ndim()); + num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; + num_kernels_col2im_ = input_dim_; + } + + private: + DeformableConvolutionParam param_; + index_t channel_axis_; // channel axis of the input + index_t channels_; // number of channels of input image + index_t num_spatial_axes_; // number of spatial axes + index_t num_; // batch size + index_t group_; // number of groups + index_t conv_out_channels_; // number of output channels (num_filter) + index_t conv_out_spatial_dim_; // number of pixels of output images per channel + index_t conv_in_channels_; // number of input channels + index_t kernel_dim_; // number of input channels per group * kernel size + index_t weight_offset_; // number of output channels per group * kernel_dim_ + index_t col_offset_; + index_t output_offset_; + index_t col_buffer_size_; + index_t input_dim_; + index_t input_offset_dim_; + index_t output_dim_; + index_t num_kernels_im2col_; + index_t num_kernels_col2im_; + bool bias_term_; // has bias term? 
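+      // These size/offset bookkeeping fields are recomputed by LayerSetUp() from the
+      // data, offset and output shapes at the start of every Forward/Backward call.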
+ bool is_1x1_; + }; // class ConvolutionOp + + template + Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); + +#if DMLC_USE_CXX11 + class DeformableConvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return{ "data", "offset", "weight", "bias" }; + } + else { + return{ "data", "offset", "weight" }; + } + } + + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } + else { + LOG(FATAL) << "not implemented"; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 4U) << "Input:[data, offset, weight, bias]"; + } + else { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, offset, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + const TShape &oshp = (*in_shape)[conv::kOffset]; + if (dshp.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + CHECK_EQ(oshp.ndim(), 4U) \ + << "Input offset should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \ + << "input num_filter must divide deformable group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = (dshape[2] + 2 * param_.pad[0] - + (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; + oshape[3] = (dshape[3] + 2 * param_.pad[1] - + (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + CHECK_EQ(oshape[1] % param_.num_deformable_group, 0U) \ + << "output num_filter must divide deformable group size"; + CHECK_EQ(oshape[2], offsetshape[2]) \ + << "output height must equal to offset map 
height"; + CHECK_EQ(oshape[3], offsetshape[3]) \ + << "output width must equal to offset map width"; + CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ + << "offset filter must divide deformable group size"; + CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), param_.num_deformable_group) \ + << "offset filter must divide deformable group size"; + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (param_.stride[0] == 1) { + dshape[2] = oshape[2] + param_.dilate[0] * (ksize_y - 1) - 2 * param_.pad[0]; + } + if (param_.stride[1] == 1) { + dshape[3] = oshape[3] + param_.dilate[1] * (ksize_x - 1) - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(ksize_y, dshape[2] + 2 * param_.pad[0]) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(ksize_x, dshape[3] + 2 * param_.pad[1]) << "kernel size exceed input"; + } + return true; + } + else { + LOG(FATAL) << "not implemented"; + return false; + } + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } + else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DeformableConvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "_contrib_DeformableConvolution"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return{ out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kOffset], in_data[conv::kWeight] }; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + DeformableConvolutionParam param_; + }; // class ConvolutionProp +#endif // DMLC_USE_CXX11 + } // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONVOLUTION_INL_H_ diff --git a/src/operator/contrib/deformable_convolution.cc b/src/operator/contrib/deformable_convolution.cc new file mode 100644 index 000000000000..a5916a578304 --- /dev/null +++ b/src/operator/contrib/deformable_convolution.cc @@ -0,0 +1,89 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution.cc + * \brief + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ + +#include "./deformable_convolution-inl.h" + +namespace mxnet { +namespace op { +DMLC_REGISTER_PARAMETER(DeformableConvolutionParam); + +template<> +Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformableConvolutionOp(param); + }) + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *DeformableConvolutionProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); +} + +MXNET_REGISTER_OP_PROPERTY(_contrib_DeformableConvolution, DeformableConvolutionProp) +.describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. + +In the 2-D convolution, given input data with shape *(batch_size, +channel, height, width)*, the output is computed by + +.. math:: + + out[n,i,:,:] = bias[i] + \sum_{j=0}^{num\_filter} data[n,j,:,:] \star + weight[i,j,:,:] + +where :math:`\star` is the 2-D cross-correlation operator. + +For general 2-D convolution, the shapes are + +- **data**: *(batch_size, channel, height, width)* +- **weight**: *(num_filter, channel, kernel[0], kernel[1])* +- **bias**: *(num_filter,)* +- **out**: *(batch_size, num_filter, out_height, out_width)*. + +Define:: + + f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1 + +then we have:: + + out_height=f(height, kernel[0], pad[0], stride[0], dilate[0]) + out_width=f(width, kernel[1], pad[1], stride[1], dilate[1]) + +If ``no_bias`` is set to be true, then the ``bias`` term is ignored. + +The default data ``layout`` is *NCHW*, namely *(batch_size, channle, height, +width)*. We can choose other layouts such as *NHWC*. + +If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` +evenly into *g* parts along the channel axis, and also evenly split ``weight`` +along the first dimension. Next compute the convolution on the *i*-th part of +the data with the *i*-th weight part. The output is obtained by concating all +the *g* results. + +Both ``weight`` and ``bias`` are learnable parameters. + + +)code" ADD_FILELINE) +.add_argument("data", "NDArray-or-Symbol", "Input data to the DeformableConvolutionOp.") +.add_argument("offset", "NDArray-or-Symbol", "Input offset to the DeformableConvolutionOp.") +.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") +.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") +.add_arguments(DeformableConvolutionParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/deformable_convolution.cu b/src/operator/contrib/deformable_convolution.cu new file mode 100644 index 000000000000..59948fd7765b --- /dev/null +++ b/src/operator/contrib/deformable_convolution.cu @@ -0,0 +1,29 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution.cu + * \brief + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ + +#include "./deformable_convolution-inl.h" +#include + +namespace mxnet { + namespace op { + + template<> + Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformableConvolutionOp(param); + }) + return op; + } + + } // namespace op +} // namespace mxnet + diff --git a/src/operator/contrib/deformable_psroi_pooling-inl.h b/src/operator/contrib/deformable_psroi_pooling-inl.h new file mode 100644 index 000000000000..7226299997af --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling-inl.h @@ -0,0 +1,280 @@ +/*! +* Copyright (c) 2017 Microsoft +* Licensed under The Apache-2.0 License [see LICENSE for details] +* \file deformable_psroi_pooling-inl.h +* \brief deformable psroi pooling operator and symbol +* \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_DEFORMABLE_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_DEFORMABLE_PSROI_POOLING_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "../mshadow_op.h" +#include "../operator_common.h" + + +namespace mxnet { + namespace op { + + // Declare enumeration of input order to make code more intuitive. + // These enums are only visible within this header + namespace deformablepsroipool { + enum DeformablePSROIPoolingOpInputs { kData, kBox, kTrans }; + enum DeformablePSROIPoolingOpOutputs { kOut, kTopCount }; + } // deformablepsroipool + + struct DeformablePSROIPoolingParam : public dmlc::Parameter { + // TShape pooled_size; + float spatial_scale; + int output_dim; + int group_size; + int pooled_size; + int part_size; + int sample_per_part; + float trans_std; + bool no_trans; + DMLC_DECLARE_PARAMETER(DeformablePSROIPoolingParam) { + DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) + .describe("Ratio of input feature map height (or w) to raw image height (or w). " + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(group_size).describe("fix group size"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(part_size).set_default(0).describe("fix part size"); + DMLC_DECLARE_FIELD(sample_per_part).set_default(1).describe("fix samples per part"); + DMLC_DECLARE_FIELD(trans_std).set_default(0.0).set_range(0.0, 1.0).describe("fix transition std"); + DMLC_DECLARE_FIELD(no_trans).set_default(false) + .describe("Whether to disable trans parameter."); + } + }; + + template + class DeformablePSROIPoolingOp : public Operator { + public: + explicit DeformablePSROIPoolingOp(DeformablePSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans? 
2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor out = out_data[deformablepsroipool::kOut].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + out = -FLT_MAX; + top_count = 0.0f; + + Tensor trans; + if (!param_.no_trans) { + trans = in_data[deformablepsroipool::kTrans].get(s); + } + DeformablePSROIPoolForward(out, data, bbox, trans, top_count, param_.no_trans, param_.spatial_scale, + param_.output_dim, param_.group_size, param_.pooled_size, param_.part_size, param_.sample_per_part, param_.trans_std); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans ? 2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_grad[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_NE(req[deformablepsroipool::kData], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[deformablepsroipool::kBox], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + // CHECK_NE(req[deformablepsroipool::kTrans], kWriteInplace) << + // "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); + Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); + Tensor grad_trans; + Tensor trans; + if (!param_.no_trans) { + CHECK_EQ(in_grad.size(), 3); + trans = in_data[deformablepsroipool::kTrans].get(s); + grad_trans = in_grad[deformablepsroipool::kTrans].get(s); + } + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[deformablepsroipool::kData], 0); + if (!param_.no_trans) { + Assign(grad_trans, req[deformablepsroipool::kTrans], 0); + } + DeformablePSROIPoolBackwardAcc(grad_in, grad_trans, grad_out, data, bbox, trans, top_count, param_.no_trans, + param_.spatial_scale, param_.output_dim, param_.group_size, param_.pooled_size, param_.part_size, + param_.sample_per_part, param_.trans_std); + Assign(grad_roi, req[deformablepsroipool::kBox], 0); + } + + private: + DeformablePSROIPoolingParam param_; + }; // 
class DeformablePSROIPoolingOp + + // Decalre Factory function, used for dispatch specialization + template + Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype); + +#if DMLC_USE_CXX11 + class DeformablePSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.no_trans) { + return{ "data", "rois" }; + } + else { + return{ "data", "rois", "trans" }; + } + } + + std::vector ListOutputs() const override { + return{ "output", "top_count" }; + } + + int NumOutputs() const override { + return 2; + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.part_size == 0) { + param_.part_size = param_.pooled_size; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.no_trans) { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + } + else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, rois, trans]"; + // trans: [num_rois, 2, pooled_h, pooled_w] + TShape tshape = in_shape->at(deformablepsroipool::kTrans); + CHECK_EQ(tshape.ndim(), 4) << "trans should be a 4D tensor of shape"; + } + + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(deformablepsroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(deformablepsroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + // top_count: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + DeformablePSROIPoolingProp* deformable_psroi_pooling_sym = new DeformablePSROIPoolingProp(); + deformable_psroi_pooling_sym->param_ = this->param_; + return deformable_psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_DeformablePSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.no_trans) { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], + out_data[deformablepsroipool::kTopCount] }; + } + else { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], + in_data[deformablepsroipool::kTrans], out_data[deformablepsroipool::kTopCount] }; + } + } + + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* 
CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + DeformablePSROIPoolingParam param_; + }; // class DeformablePSROIPoolingProp +#endif + } // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_DEFORMABLE_PSROI_POOLING_INL_H_ \ No newline at end of file diff --git a/src/operator/contrib/deformable_psroi_pooling.cc b/src/operator/contrib/deformable_psroi_pooling.cc new file mode 100644 index 000000000000..4a21a7933f44 --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling.cc @@ -0,0 +1,96 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_psroi_pooling.cc + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./deformable_psroi_pooling-inl.h" +#include +#include +#include +#include +#include + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace mshadow { + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // NOT_IMPLEMENTED; + return; + } + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // NOT_IMPLEMENTED; + return; + } +} // namespace mshadow + +namespace mxnet { + namespace op { + + template<> + Operator *CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } + + Operator *DeformablePSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); + } + + DMLC_REGISTER_PARAMETER(DeformablePSROIPoolingParam); + + MXNET_REGISTER_OP_PROPERTY(_contrib_DeformablePSROIPooling, DeformablePSROIPoolingProp) + .describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " + "spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " + "by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " + "the number of region bounding boxes after DeformablePSROIPooling") + .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") + .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " + "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " + "of designated region of interest. 
batch_index indicates the index of corresponding image " + "in the input data") + .add_argument("trans", "Symbol", "transition parameter") + .add_arguments(DeformablePSROIPoolingParam::__FIELDS__()); + } // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu new file mode 100644 index 000000000000..5b8f36138e86 --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -0,0 +1,402 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./deformable_psroi_pooling-inl.h" +#include +#include +#include +#include +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" + +#define DeformablePSROIPOOLING_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) +#define CUDA_KERNEL_LOOP(i, n) \ +for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +namespace mshadow { + namespace cuda { + template + __device__ DType bilinear_interp( + const DType* data, + const DType x, + const DType y, + const int width, + const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + DType dist_x = static_cast(x - x1); + DType dist_y = static_cast(y - y1); + DType value11 = data[y1*width + x1]; + DType value12 = data[y2*width + x1]; + DType value21 = data[y1*width + x2]; + DType value22 = data[y2*width + x2]; + DType value = (1 - dist_x)*(1 - dist_y)*value11 + (1 - dist_x)*dist_y*value12 + + dist_x*(1 - dist_y)*value21 + dist_x*dist_y*value22; + return value; + } + + template + __global__ void DeformablePSROIPoolForwardKernel( + const int count, + const DType* bottom_data, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const DType* bottom_rois, const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + DType* top_data, + DType* top_count) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; + DType trans_y = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + DType sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + DType val = bilinear_interp(offset_bottom_data + c*height*width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } + } + + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolForward"; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *top_data = out.dptr_; + DType *top_count_data = top_count.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans.size(1) / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(out.stream_); + DeformablePSROIPoolForwardKernel << > >( + count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } + + + template + __global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* top_count, + const int num_rois, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + DType* bottom_data_diff, DType* bottom_trans_diff, + const DType* bottom_data, + const DType* bottom_rois, + const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; + DType trans_y = no_trans ? 
static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + DType diff_val = top_diff[index] / top_count[index]; + const DType* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + DType* offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + DType dist_x = w - x0, dist_y = h - y0; + DType q00 = (1 - dist_x)*(1 - dist_y); + DType q01 = (1 - dist_x)*dist_y; + DType q10 = dist_x*(1 - dist_y); + DType q11 = dist_x*dist_y; + int bottom_index_base = c * height *width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x0, q00*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x0, q01*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x1, q10*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x1, q11*diff_val); + + if (no_trans) { + continue; + } + DType U00 = offset_bottom_data[bottom_index_base + y0*width + x0]; + DType U01 = offset_bottom_data[bottom_index_base + y1*width + x0]; + DType U10 = offset_bottom_data[bottom_index_base + y0*width + x1]; + DType U11 = offset_bottom_data[bottom_index_base + y1*width + x1]; + DType diff_x = (U11*dist_y + U10*(1 - dist_y) - U01*dist_y - U00*(1 - dist_y)) + *trans_std*diff_val; + diff_x *= roi_width; + DType diff_y = (U11*dist_x + U01*(1 - dist_x) - U10*dist_x - U00*(1 - dist_x)) + *trans_std*diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1)*part_size + part_h)*part_size + part_w, diff_y); + } + } + } + } + + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const DType *top_diff = out_grad.dptr_; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *bottom_data_diff = in_grad.dptr_; + DType *bottom_trans_diff = no_trans ? 
NULL : trans_grad.dptr_; + const DType *top_count_data = top_count.dptr_; + const int count = out_grad.shape_.Size(); + const int num_rois = bbox.size(0); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans_grad.size(1) / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + DeformablePSROIPoolBackwardAccKernel << > >( + count, top_diff, top_count_data, num_rois, spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } + + } // namespace cuda + + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + cuda::DeformablePSROIPoolForward(out, data, bbox, trans, top_count, no_trans, spatial_scale, + output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); + } + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + cuda::DeformablePSROIPoolBackwardAcc(in_grad, trans_grad, out_grad, data, bbox, trans, top_count, no_trans, + spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); + } + +} // namespace mshadow + + +namespace mxnet { + namespace op { + + template<> + Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } + + } // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h new file mode 100644 index 000000000000..3116d8c15623 --- /dev/null +++ b/src/operator/contrib/multi_proposal-inl.h @@ -0,0 +1,301 @@ +/*! + * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file multi_proposal-inl.h + * \brief MultiProposal Operator + * \author Piotr Teterwak, Bing Xu, Jian Guo, Xizhou Zhu +*/ +#ifndef MXNET_OPERATOR_CONTRIB_PROPOSAL_INL_H_ +#define MXNET_OPERATOR_CONTRIB_PROPOSAL_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" + +// extend NumericalParam +namespace mxnet { +namespace op { + +/*! 
+* \brief structure for numerical tuple input +* \tparam VType data type of param +*/ +template +struct NumericalParam { + NumericalParam() {} + explicit NumericalParam(VType *begin, VType *end) { + int32_t size = static_cast(end - begin); + info.resize(size); + for (int i = 0; i < size; ++i) { + info[i] = *(begin + i); + } + } + inline size_t ndim() const { + return info.size(); + } + std::vector info; +}; + +template +inline std::istream &operator>>(std::istream &is, NumericalParam ¶m) { + while (true) { + char ch = is.get(); + if (ch == '(') break; + if (!isspace(ch)) { + is.setstate(std::ios::failbit); + return is; + } + } + VType idx; + std::vector tmp; + // deal with empty case + size_t pos = is.tellg(); + char ch = is.get(); + if (ch == ')') { + param.info = tmp; + return is; + } + is.seekg(pos); + // finish deal + while (is >> idx) { + tmp.push_back(idx); + char ch; + do { + ch = is.get(); + } while (isspace(ch)); + if (ch == ',') { + while (true) { + ch = is.peek(); + if (isspace(ch)) { + is.get(); continue; + } + if (ch == ')') { + is.get(); break; + } + break; + } + if (ch == ')') break; + } else if (ch == ')') { + break; + } else { + is.setstate(std::ios::failbit); + return is; + } + } + param.info = tmp; + return is; +} + +template +inline std::ostream &operator<<(std::ostream &os, const NumericalParam ¶m) { + os << '('; + for (index_t i = 0; i < param.info.size(); ++i) { + if (i != 0) os << ','; + os << param.info[i]; + } + // python style tuple + if (param.info.size() == 1) os << ','; + os << ')'; + return os; +} + +} // namespace op +} // namespace mxnet + +namespace mxnet { +namespace op { + +namespace proposal { +enum MultiProposalOpInputs {kClsProb, kBBoxPred, kImInfo}; +enum MultiProposalOpOutputs {kOut, kScore}; +enum MultiProposalForwardResource {kTempResource}; +} // proposal + +struct MultiProposalParam : public dmlc::Parameter { + int rpn_pre_nms_top_n; + int rpn_post_nms_top_n; + float threshold; + int rpn_min_size; + NumericalParam scales; + NumericalParam ratios; + int feature_stride; + bool output_score; + bool iou_loss; + DMLC_DECLARE_PARAMETER(MultiProposalParam) { + float tmp[] = {0, 0, 0, 0}; + DMLC_DECLARE_FIELD(rpn_pre_nms_top_n).set_default(6000) + .describe("Number of top scoring boxes to keep after applying NMS to RPN proposals"); + DMLC_DECLARE_FIELD(rpn_post_nms_top_n).set_default(300) + .describe("Overlap threshold used for non-maximum" + "suppresion(suppress boxes with IoU >= this threshold"); + DMLC_DECLARE_FIELD(threshold).set_default(0.7) + .describe("NMS value, below which to suppress."); + DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) + .describe("Minimum height or width in proposal"); + tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; + DMLC_DECLARE_FIELD(scales).set_default(NumericalParam(tmp, tmp + 4)) + .describe("Used to generate anchor windows by enumerating scales"); + tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; + DMLC_DECLARE_FIELD(ratios).set_default(NumericalParam(tmp, tmp + 3)) + .describe("Used to generate anchor windows by enumerating ratios"); + DMLC_DECLARE_FIELD(feature_stride).set_default(16) + .describe("The size of the receptive field each unit in the convolution layer of the rpn," + "for example the product of all stride's prior to this layer."); + DMLC_DECLARE_FIELD(output_score).set_default(false) + .describe("Add score to outputs"); + DMLC_DECLARE_FIELD(iou_loss).set_default(false) + .describe("Usage of IoU Loss"); + } +}; + +template +Operator *CreateOp(MultiProposalParam param); + +#if DMLC_USE_CXX11 
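+// Editorial sketch of the shape flow implied by InferShape below (an
+// illustrative assumption, not additional operator code). With A anchors per
+// feature-map location, where A = |scales| * |ratios| (12 for the defaults
+// scales=(4,8,16,32), ratios=(0.5,1,2)):
+//   cls_prob:  (b, 2*A, H, W)
+//   bbox_pred: (b, 4*A, H, W)
+//   im_info:   (b, 3)                          // height, width, scale
+//   output:    (b * rpn_post_nms_top_n, 5)     // [batch_index, x1, y1, x2, y2]
+//   score:     (b * rpn_post_nms_top_n, 1)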
+class MultiProposalProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3) << "Input:[cls_prob, bbox_pred, im_info]"; + const TShape &dshape = in_shape->at(proposal::kClsProb); + if (dshape.ndim() == 0) return false; + Shape<4> bbox_pred_shape; + bbox_pred_shape = Shape4(dshape[0], dshape[1] * 2, dshape[2], dshape[3]); + SHAPE_ASSIGN_CHECK(*in_shape, proposal::kBBoxPred, + bbox_pred_shape); + Shape<2> im_info_shape; + im_info_shape = Shape2(dshape[0], 3); + SHAPE_ASSIGN_CHECK(*in_shape, proposal::kImInfo, im_info_shape); + out_shape->clear(); + // output + out_shape->push_back(Shape2(dshape[0] * param_.rpn_post_nms_top_n, 5)); + // score + out_shape->push_back(Shape2(dshape[0] * param_.rpn_post_nms_top_n, 1)); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new MultiProposalProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "_contrib_MultiProposal"; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {}; + } + + int NumVisibleOutputs() const override { + if (param_.output_score) { + return 2; + } else { + return 1; + } + } + + int NumOutputs() const override { + return 2; + } + + std::vector ListArguments() const override { + return {"cls_prob", "bbox_pred", "im_info"}; + } + + std::vector ListOutputs() const override { + return {"output", "score"}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + MultiProposalParam param_; +}; // class MultiProposalProp + +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +//======================== +// Anchor Generation Utils +//======================== +namespace mxnet { +namespace op { +namespace utils { + +inline void _MakeAnchor(float w, + float h, + float x_ctr, + float y_ctr, + std::vector *out_anchors) { + out_anchors->push_back(x_ctr - 0.5f * (w - 1.0f)); + out_anchors->push_back(y_ctr - 0.5f * (h - 1.0f)); + out_anchors->push_back(x_ctr + 0.5f * (w - 1.0f)); + out_anchors->push_back(y_ctr + 0.5f * (h - 1.0f)); + out_anchors->push_back(0.0f); +} + +inline void _Transform(float scale, + float ratio, + const std::vector& base_anchor, + std::vector *out_anchors) { + float w = base_anchor[2] - base_anchor[1] + 1.0f; + float h = base_anchor[3] - base_anchor[1] + 1.0f; + float x_ctr = base_anchor[0] + 0.5 * (w - 1.0f); + float y_ctr = base_anchor[1] + 0.5 * (h - 1.0f); + float size = w * h; + float size_ratios = std::floor(size / ratio); + float new_w = std::floor(std::sqrt(size_ratios) + 0.5f) * scale; + float new_h = std::floor((new_w / scale * ratio) + 0.5f) * scale; + + _MakeAnchor(new_w, new_h, x_ctr, + y_ctr, out_anchors); +} + +// out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() +inline void GenerateAnchors(const std::vector& base_anchor, + const std::vector& ratios, + const std::vector& scales, + std::vector *out_anchors) { + for (size_t j = 0; j < ratios.size(); ++j) { + for (size_t k = 0; k < scales.size(); ++k) { + _Transform(scales[k], ratios[j], 
base_anchor, out_anchors); + } + } +} + +} // namespace utils +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_PROPOSAL_INL_H_ diff --git a/src/operator/contrib/multi_proposal.cc b/src/operator/contrib/multi_proposal.cc new file mode 100644 index 000000000000..c8f75eaec547 --- /dev/null +++ b/src/operator/contrib/multi_proposal.cc @@ -0,0 +1,63 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file multi_proposal.cc + * \brief + * \author Xizhou Zhu +*/ + +#include "./multi_proposal-inl.h" + + +namespace mxnet { +namespace op { + +template +class MultiProposalOp : public Operator{ + public: + explicit MultiProposalOp(MultiProposalParam param) { + this->param_ = param; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + LOG(FATAL) << "not implemented"; + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + LOG(FATAL) << "not implemented"; + } + + private: + MultiProposalParam param_; +}; // class MultiProposalOp + +template<> +Operator *CreateOp(MultiProposalParam param) { + return new MultiProposalOp(param); +} + +Operator* MultiProposalProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(MultiProposalParam); + +MXNET_REGISTER_OP_PROPERTY(_contrib_MultiProposal, MultiProposalProp) +.describe("Generate region proposals via RPN") +.add_argument("cls_score", "NDArray-or-Symbol", "Score of how likely proposal is object.") +.add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals") +.add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.") +.add_arguments(MultiProposalParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu new file mode 100644 index 000000000000..28e1c6a4cebd --- /dev/null +++ b/src/operator/contrib/multi_proposal.cu @@ -0,0 +1,591 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file multi_proposal.cu + * \brief MultiProposal Operator + * \author Shaoqing Ren, Xizhou Zhu, Jian Guo +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../operator_common.h" +#include "../mshadow_op.h" +#include "./multi_proposal-inl.h" + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#define FRCNN_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ +} while (0) + +namespace mshadow { +namespace cuda { +namespace multi_proposal { + +// scores are (b, 2 * anchor, h, w) +// workspace_proposals are (b, h * w * anchor, 5) +// w defines "x" and h defines "y" +// count should be total anchors numbers, h * w * anchors +template +__global__ void ProposalGridKernel(const int count, + const int num_anchors, + const int height, + const int width, + const int feature_stride, + const Dtype* scores, + Dtype* workspace_proposals) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % width; + int h = (index / num_anchors / width) % height; + int b = index / num_anchors / width / height; + + workspace_proposals[index * 5 + 0] = workspace_proposals[a * 5 + 0] + w * feature_stride; + workspace_proposals[index * 5 + 1] = workspace_proposals[a * 5 + 1] + h * feature_stride; + workspace_proposals[index * 5 + 2] = workspace_proposals[a * 5 + 2] + w * feature_stride; + workspace_proposals[index * 5 + 3] = workspace_proposals[a * 5 + 3] + h * feature_stride; + workspace_proposals[index * 5 + 4] = + scores[((b * (2 * num_anchors) + a + num_anchors) * height + h) * width + w]; + //workspace_proposals[index * 5 + 4] = scores[(a * height + h) * width + w]; + } +} + +// boxes are (b, h * w * anchor, 5) +// deltas are (b, 4 * anchor, h, w) +// out_pred_boxes are (b, h * w * anchor, 5) +// count should be total anchors numbers, b * h * w * anchors +// in-place write: boxes and out_pred_boxes are the same location +template +__global__ void BBoxPredKernel(const int count, + const int num_anchors, + const int feat_height, + const int feat_width, + const int feature_stride, + const Dtype* im_infos, + const Dtype* boxes, + const Dtype* deltas, + Dtype* out_pred_boxes) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % feat_width; + int h = (index / num_anchors / feat_width) % feat_height; + int b = index / num_anchors / feat_width / feat_height; + + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); + + float width = boxes[index * 5 + 2] - boxes[index * 5 + 0] + 1.0f; + float height = boxes[index * 5 + 3] - boxes[index * 5 + 1] + 1.0f; + float ctr_x = boxes[index * 5 + 0] + 0.5f * (width - 1.0f); + float ctr_y = boxes[index * 5 + 1] + 0.5f * (height - 1.0f); + + int ba = (b * num_anchors + a); + float dx = deltas[((ba * 4) * feat_height + h) * feat_width + w]; + float dy = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; + 
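+    // Editorial worked example of the decode performed below (values assumed,
+    // not taken from the patch): for a 16x16 anchor with ctr_x = 7.5 and
+    // deltas (dx, dw) = (0.1, 0.2), pred_ctr_x = 0.1*16 + 7.5 = 9.1 and
+    // pred_w = exp(0.2)*16 ~= 19.54, so pred_x1 ~= 9.1 - 0.5*(19.54 - 1)
+    // ~= -0.17, which the max/min clipping below clamps into [0, im_width - 1].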
float dw = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; + float dh = deltas[((ba * 4 + 3) * feat_height + h) * feat_width + w]; + + float pred_ctr_x = dx * width + ctr_x; + float pred_ctr_y = dy * height + ctr_y; + float pred_w = exp(dw) * width; + float pred_h = exp(dh) * height; + + float pred_x1 = pred_ctr_x - 0.5f * (pred_w - 1.0f); + float pred_y1 = pred_ctr_y - 0.5f * (pred_h - 1.0f); + float pred_x2 = pred_ctr_x + 0.5f * (pred_w - 1.0f); + float pred_y2 = pred_ctr_y + 0.5f * (pred_h - 1.0f); + + pred_x1 = max(min(pred_x1, im_width - 1.0f), 0.0f); + pred_y1 = max(min(pred_y1, im_height - 1.0f), 0.0f); + pred_x2 = max(min(pred_x2, im_width - 1.0f), 0.0f); + pred_y2 = max(min(pred_y2, im_height - 1.0f), 0.0f); + + out_pred_boxes[index * 5 + 0] = pred_x1; + out_pred_boxes[index * 5 + 1] = pred_y1; + out_pred_boxes[index * 5 + 2] = pred_x2; + out_pred_boxes[index * 5 + 3] = pred_y2; + + if (h >= real_height || w >= real_width) { + out_pred_boxes[index * 5 + 4] = -1.0f; + } + } +} + +// boxes are (b, h * w * anchor, 5) +// deltas are (b, 4 * anchor, h, w) +// out_pred_boxes are (b, h * w * anchor, 5) +// count should be total anchors numbers, b * h * w * anchors +// in-place write: boxes and out_pred_boxes are the same location +template +__global__ void IoUPredKernel(const int count, + const int num_anchors, + const int feat_height, + const int feat_width, + const int feature_stride, + const Dtype* im_infos, + const Dtype* boxes, + const Dtype* deltas, + Dtype* out_pred_boxes) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % feat_width; + int h = (index / num_anchors / feat_width) % feat_height; + int b = index / num_anchors / feat_width / feat_height; + + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); + + float x1 = boxes[index * 5 + 0]; + float y1 = boxes[index * 5 + 1]; + float x2 = boxes[index * 5 + 2]; + float y2 = boxes[index * 5 + 3]; + + int ba = (b * num_anchors + a); + float dx1 = deltas[((ba * 4) * feat_height + h) * feat_width + w]; + float dy1 = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; + float dx2 = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; + float dy2 = deltas[((ba * 4 + 3) * feat_height + h) * feat_width + w]; + + float pred_x1 = max(min(x1 + dx1, im_width - 1.0f), 0.0f); + float pred_y1 = max(min(y1 + dy1, im_height - 1.0f), 0.0f); + float pred_x2 = max(min(x2 + dx2, im_width - 1.0f), 0.0f); + float pred_y2 = max(min(y2 + dy2, im_height - 1.0f), 0.0f); + + out_pred_boxes[index * 5 + 0] = pred_x1; + out_pred_boxes[index * 5 + 1] = pred_y1; + out_pred_boxes[index * 5 + 2] = pred_x2; + out_pred_boxes[index * 5 + 3] = pred_y2; + + if (h >= real_height || w >= real_width) { + out_pred_boxes[index * 5 + 4] = -1.0f; + } + } +} + +// filter box with stride less than rpn_min_size +// filter: set score to zero +// dets (b, n, 5) +template +__global__ void FilterBoxKernel(const int count, + const int count_anchors, + const float original_min_size, + const Dtype* im_infos, + Dtype* dets) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int b = index / count_anchors; + float iw = dets[index * 5 + 2] - dets[index * 5 + 0] + 1.0f; + float ih = dets[index * 5 + 3] - dets[index * 5 + 1] + 1.0f; + float 
min_size = original_min_size * im_infos[b * 3 + 2]; + if (iw < min_size || ih < min_size) { + dets[index * 5 + 0] -= min_size / 2; + dets[index * 5 + 1] -= min_size / 2; + dets[index * 5 + 2] += min_size / 2; + dets[index * 5 + 3] += min_size / 2; + dets[index * 5 + 4] = -1.0f; + } + } +} + +// copy score and init order +// dets (n, 5); score (n, ); order (n, ) +// count should be n (total anchors or proposals) +template +__global__ void CopyScoreKernel(const int count, + const Dtype* dets, + Dtype* score, + int* order) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + score[index] = dets[index * 5 + 4]; + order[index] = index; + } +} + +// reorder proposals according to order and keep the top_n proposals +// prev_dets (n, 5); order (n, ); dets (n, 5) +// count should be output anchor numbers (top_n) +template +__global__ void ReorderProposalsKernel(const int count, + const Dtype* prev_dets, + const int* order, + Dtype* dets) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + const int order_i = order[index]; + for (int j = 0; j < 5; j ++) { + dets[index * 5 + j] = prev_dets[order_i * 5 + j]; + } + } +} + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int threadsPerBlock = sizeof(uint64_t) * 8; + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +void _nms(const mshadow::Tensor& boxes, + const float nms_overlap_thresh, + int *keep, + int *num_out) { + const int threadsPerBlock = sizeof(uint64_t) * 8; + const int boxes_num = boxes.size(0); + const int boxes_dim = 
boxes.size(1); + + float* boxes_dev = boxes.dptr_; + uint64_t* mask_dev = NULL; + + const int col_blocks = DIVUP(boxes_num, threadsPerBlock); + FRCNN_CUDA_CHECK(cudaMalloc(&mask_dev, + boxes_num * col_blocks * sizeof(uint64_t))); + + dim3 blocks(DIVUP(boxes_num, threadsPerBlock), + DIVUP(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + std::vector mask_host(boxes_num * col_blocks); + FRCNN_CUDA_CHECK(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(uint64_t) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep[num_to_keep++] = i; + uint64_t *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + *num_out = num_to_keep; + + FRCNN_CUDA_CHECK(cudaFree(mask_dev)); +} + +// copy proposals to output +// dets (top_n, 5); keep (top_n, ); out (top_n, ) +// count should be top_n (total anchors or proposals) +template +__global__ void PrepareOutput(const int count, + const Dtype* dets, + const int* keep, + const int out_size, + const int image_index, + Dtype* out, + Dtype* score) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + out[index * 5] = image_index; + if (index < out_size) { + int keep_i = keep[index]; + for (int j = 0; j < 4; ++j) { + out[index * 5 + j + 1] = dets[keep_i * 5 + j]; + } + score[index] = dets[keep_i * 5 + 4]; + } else { + int keep_i = keep[index % out_size]; + for (int j = 0; j < 4; ++j) { + out[index * 5 + j + 1] = dets[keep_i * 5 + j]; + } + score[index] = dets[keep_i * 5 + 4]; + } + } +} +} // namespace multi_proposal +} // namespace cuda +} // namespace mshadow + +namespace mxnet { +namespace op { + +template +class MultiProposalGPUOp : public Operator{ + public: + explicit MultiProposalGPUOp(MultiProposalParam param) { + this->param_ = param; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow::cuda; + using namespace mshadow::cuda::multi_proposal; + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 2); + CHECK_GT(req.size(), 1); + CHECK_EQ(req[proposal::kOut], kWriteTo); + /*CHECK_EQ(in_data[proposal::kClsProb].shape_[0], 1) + << "Sorry, multiple images each device is not implemented.";*/ + + Stream *s = ctx.get_stream(); + + Tensor scores = in_data[proposal::kClsProb].get(s); + Tensor bbox_deltas = in_data[proposal::kBBoxPred].get(s); + Tensor im_info = in_data[proposal::kImInfo].get(s); + + Tensor out = out_data[proposal::kOut].get(s); + Tensor out_score = out_data[proposal::kScore].get(s); + + int num_images = scores.size(0); + int num_anchors = scores.size(1) / 2; + int height = scores.size(2); + int width = scores.size(3); + int count_anchors = num_anchors * height * width; // count of total anchors + int count = num_images * count_anchors; + // set to -1 for max + int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? 
param_.rpn_pre_nms_top_n : count_anchors; + rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors); + int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n); + + // Generate first anchors based on base anchor + std::vector base_anchor(4); + base_anchor[0] = 0.0; + base_anchor[1] = 0.0; + base_anchor[2] = param_.feature_stride - 1.0; + base_anchor[3] = param_.feature_stride - 1.0; + CHECK_EQ(num_anchors, param_.ratios.info.size() * param_.scales.info.size()); + std::vector anchors; + utils::GenerateAnchors(base_anchor, + param_.ratios.info, + param_.scales.info, + &anchors); + + // Copy generated anchors to GPU + float* workspace_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, sizeof(float) * num_images * count_anchors * 5)); + Tensor workspace_proposals(workspace_proposals_ptr, Shape3(num_images, count_anchors, 5)); + FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_, + &anchors[0], sizeof(float) * anchors.size(), + cudaMemcpyHostToDevice)); + + // Copy proposals to a mesh grid + dim3 dimGrid((count + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); + dim3 dimBlock(kMaxThreadsPerBlock); + CheckLaunchParam(dimGrid, dimBlock, "ProposalGrid"); + ProposalGridKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, + scores.dptr_, workspace_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // Transform anchors and bbox_deltas into bboxes + CheckLaunchParam(dimGrid, dimBlock, "BBoxPred"); + if (param_.iou_loss) { + IoUPredKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); + } else { + BBoxPredKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); + } + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // filter boxes with less than rpn_min_size + CheckLaunchParam(dimGrid, dimBlock, "FilterBox"); + FilterBoxKernel<<>>( + count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + + + dimGrid = dim3((count_anchors + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); + dimBlock = dim3(kMaxThreadsPerBlock); + // Copy score to a continuous memory + float* score_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&score_ptr, sizeof(float) * count_anchors)); + Tensor score(score_ptr, Shape1(count_anchors)); + int* order_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&order_ptr, sizeof(int) * count_anchors)); + Tensor order(order_ptr, Shape1(count_anchors)); + + float* workspace_ordered_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_ordered_proposals_ptr, + sizeof(float) * rpn_pre_nms_top_n * 5)); + Tensor workspace_ordered_proposals(workspace_ordered_proposals_ptr, + Shape2(rpn_pre_nms_top_n, 5)); + + int* keep; + FRCNN_CUDA_CHECK(cudaMalloc(&keep, sizeof(int) * rpn_pre_nms_top_n)); + + for (int b = 0; b < num_images; b++) { + + CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); + CopyScoreKernel << > >( + count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, score.dptr_, order.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // argsort score, save order + thrust::stable_sort_by_key(thrust::device, + score.dptr_, + score.dptr_ + score.size(0), + order.dptr_, + thrust::greater()); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // Reorder proposals according to order + + dimGrid.x = (rpn_pre_nms_top_n + 
kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals"); + ReorderProposalsKernel << > >( + rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, order.dptr_, workspace_ordered_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // perform nms + std::vector _keep(workspace_ordered_proposals.size(0)); + int out_size = 0; + _nms(workspace_ordered_proposals, + param_.threshold, + &_keep[0], + &out_size); + + // copy nms result to gpu + FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(), + cudaMemcpyHostToDevice)); + + // copy results after nms + dimGrid.x = (rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput"); + PrepareOutput << > >( + rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b, + out.dptr_ + b * rpn_post_nms_top_n * 5, out_score.dptr_ + b * rpn_post_nms_top_n); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + } + // free temporary memory + FRCNN_CUDA_CHECK(cudaFree(keep)); + FRCNN_CUDA_CHECK(cudaFree(workspace_ordered_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(workspace_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(score_ptr)); + FRCNN_CUDA_CHECK(cudaFree(order_ptr)); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), 3); + + Stream *s = ctx.get_stream(); + Tensor gscores = in_grad[proposal::kClsProb].get(s); + Tensor gbbox = in_grad[proposal::kBBoxPred].get(s); + Tensor ginfo = in_grad[proposal::kImInfo].get(s); + + // can not assume the grad would be zero + Assign(gscores, req[proposal::kClsProb], 0); + Assign(gbbox, req[proposal::kBBoxPred], 0); + Assign(ginfo, req[proposal::kImInfo], 0); + } + + private: + MultiProposalParam param_; +}; // class MultiProposalGPUOp + +template<> +Operator* CreateOp(MultiProposalParam param) { + return new MultiProposalGPUOp(param); +} +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/nn/deformable_im2col.cuh b/src/operator/contrib/nn/deformable_im2col.cuh new file mode 100644 index 000000000000..d9e7b970ca84 --- /dev/null +++ b/src/operator/contrib/nn/deformable_im2col.cuh @@ -0,0 +1,525 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
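+ *
+ * Editorial shape sketch (an illustration added for clarity, assumed rather
+ * than stated in this header): for an input of C channels and a k_h x k_w
+ * kernel, every output position gathers C * k_h * k_w samples, so the column
+ * buffer holds C * k_h * k_w rows by H_out * W_out columns (a 3x3 kernel over
+ * a 64-channel map yields 576 rows). In the deformable variant each of the
+ * k_h * k_w taps is displaced by a learned (offset_h, offset_w) pair from the
+ * offset map and is read with bilinear interpolation instead of a fixed grid.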
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai + */ + +#ifndef MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ +#define MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ + +#include +#include +#include +#include +#include +#include "../../mxnet_op.h" +#include "../../../common/cuda_utils.h" + + + +namespace mxnet { +namespace op { + +template +__device__ DType deformable_im2col_bilinear(const DType* bottom_data, const int data_width, + const int height, const int width, DType h, DType w) { + + int h_low = floor(h); + int w_low = floor(w); + int h_high; + int w_high; + if (h_low >= height - 1) { + h_high = h_low = height - 1; + h = (DType)h_low; + } + else { + h_high = h_low + 1; + } + + if (w_low >= width - 1) { + w_high = w_low = width - 1; + w = (DType)w_low; + } + else { + w_high = w_low + 1; + } + + DType lh = h - h_low; + DType lw = w - w_low; + DType hh = 1 - lh, hw = 1 - lw; + + DType v1 = bottom_data[h_low * data_width + w_low]; + DType v2 = bottom_data[h_low * data_width + w_high]; + DType v3 = bottom_data[h_high * data_width + w_low]; + DType v4 = bottom_data[h_high * data_width + w_high]; + DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ DType get_gradient_weight(DType argmax_h, DType argmax_w, + const int h, const int w, const int height, const int width) { + + if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) { + //empty + return 0; + } + + argmax_h = max(argmax_h, (DType)0.0f); + argmax_w = max(argmax_w, (DType)0.0f); + + int argmax_h_low = (int)argmax_h; + int argmax_w_low = (int)argmax_w; + int argmax_h_high; + int argmax_w_high; + if (argmax_h_low >= height - 1) { + argmax_h_high = argmax_h_low = height - 1; + argmax_h = (DType)argmax_h_low; + } else { + argmax_h_high = argmax_h_low + 1; + } + if (argmax_w_low >= width - 1) + { + argmax_w_high = argmax_w_low = width - 1; + argmax_w = (DType)argmax_w_low; + } else { + argmax_w_high = argmax_w_low + 1; + } + DType weight = 0; + if (h == argmax_h_low) { + if (w == argmax_w_low) { + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + } else if (w == argmax_w_high) { + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + } + } else if (h == argmax_h_high) { + if (w == argmax_w_low) { + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + } else if (w == argmax_w_high) { + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + } + } + return weight; +} + + +template +__device__ DType get_coordinate_weight(DType argmax_h, DType argmax_w, + const int height, const int width, const DType* im_data, + const int data_width, const int bp_dir) { + + if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) + { + //empty + return 0; + } + + if (argmax_h < 0) argmax_h = 0; + if (argmax_w < 0) argmax_w = 0; + + int argmax_h_low = (int)argmax_h; + int argmax_w_low = (int)argmax_w; + int argmax_h_high; + int argmax_w_high; + if (argmax_h_low >= height - 1) { + argmax_h_high = argmax_h_low = height - 1; + argmax_h = (DType)argmax_h_low; + } else { + argmax_h_high = argmax_h_low + 1; + } + if (argmax_w_low >= width - 1) { + argmax_w_high = argmax_w_low = width - 1; + argmax_w = (DType)argmax_w_low; + } else { + argmax_w_high = argmax_w_low + 1; + } + DType weight = 0; + + if (bp_dir == 0) { + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low 
* data_width + argmax_w_high]; + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + + +/*! + * \brief deformable_im2col gpu kernel. + * DO NOT call this directly. Use wrapper function im2col() instead; + */ +template +__global__ void deformable_im2col_gpu_kernel(const int n, const DType* data_im, const DType* data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* data_col) { + CUDA_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int c_im = (index / width_col) / height_col; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + DType* data_col_ptr = data_col + (c_col * height_col + h_col) * width_col + w_col; + const DType* data_im_ptr = data_im + (c_im * height + h_in) * width + w_in; + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + DType val = static_cast(0); + const DType h_im = h_in + i * dilation_h + offset_h; + const DType w_im = w_in + j * dilation_w + offset_w; + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + const DType map_h = i * dilation_h + offset_h; + const DType map_w = j * dilation_w + offset_w; + const int cur_height = height - h_in; + const int cur_width = width - w_in; + val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + } + *data_col_ptr = val; + data_col_ptr += height_col * width_col; + } + } + } +} + + + + + + +/*!\brief + * cpu function of deformable_im2col algorithm + * \param s device stream + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) 
+ * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param data_col column buffer pointer + */ +template +inline void deformable_im2col(mshadow::Stream* s, + const DType* data_im, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, const TShape& dilation, + const uint32_t deformable_group, DType* data_col) { + // num_axes should be smaller than block size + index_t num_spatial_axes = kernel_shape.ndim(); + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + index_t channel_per_deformable_group = im_shape[1] / deformable_group; + index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim()); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + deformable_im2col_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) + <<::GetStream(s)>>>( + num_kernels, data_im, data_offset, im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], + pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], channel_per_deformable_group, + col_shape[1], col_shape[2], data_col); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_im2col_gpu_kernel); + break; + default: + LOG(FATAL) << "im2col_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +/*! +* \brief deformable_col2im gpu kernel. +* \brief DO NOT call this directly. Use wrapper function deformable_col2im() instead; +*/ +template +__global__ void deformable_col2im_gpu_kernel(const int n, const DType* data_col, const DType* data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* grad_im, OpReqType req) { + CUDA_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col) % kernel_w; + const int i = (index / width_col / height_col / kernel_w) % kernel_h; + const int c = index / width_col / height_col / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; + const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const DType cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1 
+        ) {
+          int cur_bottom_grad_pos = (c * height + cur_h + dy) * width + cur_w + dx;
+          DType weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+
+/*!\brief
+ * gpu function of deformable_col2im algorithm
+ * \param s device stream
+ * \param data_col start pointer of the column buffer to be filled
+ * \param data_offset pointer of offset (C, H, W, ...) in the offset batch
+ * \param im_shape input image shape in dimensions (N, C, H, W,)
+ * \param col_shape column buffer shape
+ * \param kernel_shape kernel filter shape
+ * \param pad pad shape
+ * \param stride stride shape
+ * \param dilation dilation shape
+ * \param deformable_group #offset group that deformable convolution use
+ * \param grad_im pointer of an image (C, H, W,...) in the image batch
+ */
+template <typename DType>
+inline void deformable_col2im(mshadow::Stream<gpu>* s,
+  const DType* data_col, const DType* data_offset,
+  const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape,
+  const TShape& pad, const TShape& stride,
+  const TShape& dilation, const uint32_t deformable_group,
+  DType* grad_im, OpReqType req) {
+  index_t num_spatial_axes = kernel_shape.ndim();
+  index_t im_size = im_shape.ProdShape(1, im_shape.ndim());
+  index_t channel_per_deformable_group = im_shape[1] / deformable_group;
+  index_t num_kernels = col_shape.ProdShape(0, col_shape.ndim());
+  // num_axes should be smaller than block size
+  CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum);
+  using namespace mxnet_op;
+  switch (num_spatial_axes) {
+  case 2:
+    // Each thread handles one element of the column buffer and scatters its
+    // contribution into grad_im with atomicAdd.
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    deformable_col2im_gpu_kernel<DType><<<cuda_get_num_blocks(num_kernels), mshadow::cuda::kBaseThreadNum,
+                                          0, mshadow::Stream<gpu>::GetStream(s)>>>(
+        num_kernels, data_col, data_offset, im_shape[1], im_shape[2], im_shape[3],
+        kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1],
+        dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_im, req);
+    MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel);
+    break;
+  default:
+    LOG(FATAL) << "col2im_nd_gpu does not support computation with "
+      << num_spatial_axes << " spatial axes";
+  }
+}
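For orientation, here is a minimal, hypothetical sketch (not part of this header) of how the three wrappers are typically composed in a deformable convolution operator: deformable_im2col fills the column buffer for the forward GEMM, while deformable_col2im and deformable_col2im_coord (defined next) scatter the column-buffer gradient back onto the input image and onto the sampling offsets. The function name backward_input_sketch, the variable col_buffer_grad, and the write requests are illustrative; the real wiring, including the GEMM steps and per-sample pointer arithmetic, lives in deformable_convolution-inl.h elsewhere in this patch.

    template <typename DType>
    void backward_input_sketch(mshadow::Stream<gpu>* s,
                               const DType* data_im, const DType* data_offset,
                               const DType* col_buffer_grad,  // dL/d(column buffer), produced by the weight GEMM
                               const TShape& im_shape, const TShape& col_shape,
                               const TShape& kernel_shape, const TShape& pad,
                               const TShape& stride, const TShape& dilation,
                               uint32_t deformable_group,
                               DType* grad_offset, DType* grad_im) {
      // Gradient w.r.t. the sampling offsets (needs the original image and offsets).
      deformable_col2im_coord(s, col_buffer_grad, data_im, data_offset, im_shape, col_shape,
                              kernel_shape, pad, stride, dilation, deformable_group,
                              grad_offset, mxnet::kWriteTo);
      // Gradient w.r.t. the input image itself.
      deformable_col2im(s, col_buffer_grad, data_offset, im_shape, col_shape,
                        kernel_shape, pad, stride, dilation, deformable_group,
                        grad_im, mxnet::kWriteTo);
    }

deformable_col2im_coord takes data_im as well because the derivative of the bilinear sampling weight with respect to an offset depends on the underlying image values (see get_coordinate_weight above).

+
+
+/*!
+ * \brief deformable_col2im_coord gpu kernel.
+ * \brief DO NOT call this directly. 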
Use wrapper function deformable_col2im_coord() instead; + */ +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const DType* data_col, + const DType* data_im, const DType* data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* grad_offset, OpReqType req) { + CUDA_KERNEL_LOOP(index, n) { + DType val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = index / width_col / height_col; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const DType* data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * width_col * height_col; + const DType* data_im_ptr = data_im + deformable_group_index * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { + const int col_pos = ((col_c * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col) % kernel_w; + int i = (col_pos / width_col / height_col / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + DType inv_h = h_in + i * dilation_h + offset_h; + DType inv_w = w_in + j * dilation_w + offset_w; + if (inv_h < 0 || inv_w < 0 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -1; + } + const DType weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +/*!\brief + * gpu function of deformable_col2im_coord algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_offset pointer of the offset (C, H, W,...) 
in the offset batch + */ +template +inline void deformable_col2im_coord(mshadow::Stream* s, + const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape, + const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) { + index_t num_spatial_axes = kernel_shape.ndim(); + index_t num_kernels = col_shape[1] * col_shape[2] * 2 * kernel_shape[0] * kernel_shape[1] * deformable_group; + index_t channel_per_deformable_group = col_shape[0] / deformable_group; + // num_axes should be smaller than block size + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + + deformable_col2im_coord_gpu_kernel << ::GetStream(s) >> >( + num_kernels, data_col, data_im, data_offset, im_shape[1], im_shape[2], im_shape[3], + kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_offset, req); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ diff --git a/src/operator/contrib/nn/deformable_im2col.h b/src/operator/contrib/nn/deformable_im2col.h new file mode 100644 index 000000000000..93a555164d7a --- /dev/null +++ b/src/operator/contrib/nn/deformable_im2col.h @@ -0,0 +1,157 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai + */ + +#ifndef MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ +#define MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ + +#include +#include +#include +#include +#include "../../mxnet_op.h" + +namespace mxnet { +namespace op { + +/*!\brief + * cpu function of deformable_im2col algorithm + * \param s device stream + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param data_col column buffer pointer + */ +template +inline void deformable_im2col(mshadow::Stream* s, + const DType* data_im, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, const TShape& dilation, + const uint32_t deformable_group, DType* data_col) { + if (2 == kernel_shape.ndim()) { + LOG(FATAL) << "not implemented"; + } else { + LOG(FATAL) << "not implemented"; + } +} + + +/*!\brief + * cpu function of deformable_col2im algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_im pointer of a image (C, H, W,...) 
in the image batch
+ */
+template <typename DType>
+inline void deformable_col2im(mshadow::Stream<cpu>* s,
+  const DType* data_col, const DType* data_offset,
+  const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape,
+  const TShape& pad, const TShape& stride,
+  const TShape& dilation, const uint32_t deformable_group,
+  DType* grad_im, OpReqType req) {
+  index_t num_spatial_axes = kernel_shape.ndim();
+  LOG(FATAL) << "not implemented";
+}
+
+
+/*!\brief
+ * cpu function of deformable_col2im_coord algorithm
+ * \param s device stream
+ * \param data_col start pointer of the column buffer to be filled
+ * \param data_im pointer of an image (C, H, W, ...) in the image batch
+ * \param data_offset pointer of offset (C, H, W, ...) in the offset batch
+ * \param im_shape input image shape in dimensions (N, C, H, W,)
+ * \param col_shape column buffer shape
+ * \param kernel_shape kernel filter shape
+ * \param pad pad shape
+ * \param stride stride shape
+ * \param dilation dilation shape
+ * \param deformable_group #offset group that deformable convolution use
+ * \param grad_offset pointer of the offset (C, H, W,...) in the offset batch
+ */
+
+template <typename DType>
+inline void deformable_col2im_coord(mshadow::Stream<cpu>* s,
+  const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape,
+  const TShape& col_shape, const TShape& kernel_shape,
+  const TShape& pad, const TShape& stride,
+  const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) {
+  LOG(FATAL) << "not implemented";
+}
+
+}  // namespace op
+}  // namespace mxnet
+#ifdef __CUDACC__
+#include "./deformable_im2col.cuh"
+#endif
+#endif  // MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_
diff --git a/src/operator/contrib/psroi_pooling-inl.h b/src/operator/contrib/psroi_pooling-inl.h
new file mode 100644
index 000000000000..f41d3237d5ac
--- /dev/null
+++ b/src/operator/contrib/psroi_pooling-inl.h
@@ -0,0 +1,222 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file psroi_pooling-inl.h
+ * \brief psroi pooling operator and symbol
+ * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai
+*/
+#ifndef MXNET_OPERATOR_PSROI_POOLING_INL_H_
+#define MXNET_OPERATOR_PSROI_POOLING_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../mshadow_op.h"
+#include "../operator_common.h"
+
+
+namespace mxnet {
+namespace op {
+
+// Declare enumeration of input order to make code more intuitive.
+// These enums are only visible within this header
+namespace psroipool {
+enum PSROIPoolingOpInputs {kData, kBox};
+enum PSROIPoolingOpOutputs {kOut};
+}  // psroipool
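As a worked example of how the parameters below interact (illustrative only, not part of the header): in an R-FCN-style configuration the input score maps carry one channel per class and spatial bin, so the channel count of data, output_dim, and group_size/pooled_size are tied together as follows. The concrete numbers (21 classes, 7x7 bins, stride-16 backbone) are assumptions for the example.

    // Hypothetical R-FCN-style configuration.
    constexpr int output_dim = 21;            // one group of score maps per class (incl. background)
    constexpr int pooled_size = 7;            // each ROI is pooled to a 7x7 grid
    constexpr int group_size = pooled_size;   // default applied when group_size is left at 0
    constexpr float spatial_scale = 1.0f / 16.0f;  // reciprocal of the backbone's total stride
    // data must then carry output_dim * group_size * group_size channels,
    // and rois of shape [num_rois, 5] pools to [num_rois, output_dim, pooled_size, pooled_size].
    static_assert(output_dim * group_size * group_size == 1029,
                  "channel count expected on the position-sensitive score-map input");

+
+struct PSROIPoolingParam : public dmlc::Parameter<PSROIPoolingParam> {
+  // TShape pooled_size;
+  float spatial_scale;
+  int output_dim;
+  int pooled_size;
+  int group_size;
+  DMLC_DECLARE_PARAMETER(PSROIPoolingParam) {
+    DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0)
+    .describe("Ratio of input feature map height (or w) to raw image height (or w). 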
" + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(group_size).set_default(0).describe("fix group size"); + } +}; + +template +class PSROIPoolingOp : public Operator { + public: + explicit PSROIPoolingOp(PSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_data[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[psroipool::kData].get(s); + Tensor bbox = in_data[psroipool::kBox].get(s); + Tensor out = out_data[psroipool::kOut].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + out = -FLT_MAX; + PSROIPoolForward(out, data, bbox, param_.spatial_scale, param_.output_dim, param_.group_size); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_grad[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); + CHECK_NE(req[psroipool::kData], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[psroipool::kBox], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[psroipool::kOut].get(s); + Tensor bbox = in_data[psroipool::kBox].get(s); + Tensor grad_in = in_grad[psroipool::kData].get(s); + Tensor grad_roi = in_grad[psroipool::kBox].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + if (kAddTo == req[psroipool::kData] || kWriteTo == req[psroipool::kData]) { + if (kWriteTo == req[psroipool::kData]) { + grad_in = 0.0f; + } + PSROIPoolBackwardAcc(grad_in, grad_out, bbox, param_.spatial_scale, param_.output_dim, param_.group_size); + } + if (kWriteTo == req[psroipool::kBox]) { + grad_roi = 0.0f; + } + + } + + private: + PSROIPoolingParam param_; +}; // class PSROIPoolingOp + +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(PSROIPoolingParam param, int dtype); + +#if DMLC_USE_CXX11 +class PSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + return {"data", "rois"}; + } + + std::vector ListOutputs() const override { + return {"output"}; + } + + int NumOutputs() const override { + return 1; + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.group_size == 0) { + param_.group_size = param_.pooled_size; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + + // data: [batch_size, c, h, w] + TShape dshape = 
in_shape->at(psroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(psroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + PSROIPoolingProp* psroi_pooling_sym = new PSROIPoolingProp(); + psroi_pooling_sym->param_ = this->param_; + return psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_PSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[psroipool::kOut], in_data[psroipool::kBox]}; + } + + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + PSROIPoolingParam param_; +}; // class PSROIPoolingProp +#endif +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_PSROI_POOLING_INL_H_ diff --git a/src/operator/contrib/psroi_pooling.cc b/src/operator/contrib/psroi_pooling.cc new file mode 100644 index 000000000000..678b0b0a67c7 --- /dev/null +++ b/src/operator/contrib/psroi_pooling.cc @@ -0,0 +1,80 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling.cc + * \brief psroi pooling operator + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#include "./psroi_pooling-inl.h" +#include +#include +#include +#include +#include + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace mshadow { +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale_, + const int output_dim_, + const int group_size_) { + // NOT_IMPLEMENTED; + return; +} + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale_, + const int output_dim_, + const int group_size_) { + // NOT_IMPLEMENTED; + return; +} +} // namespace mshadow + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(PSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new PSROIPoolingOp(param); + }); + return op; +} + +Operator *PSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +} + +DMLC_REGISTER_PARAMETER(PSROIPoolingParam); + +MXNET_REGISTER_OP_PROPERTY(_contrib_PSROIPooling, PSROIPoolingProp) +.describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " +"spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " +"by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " +"the number of region bounding boxes after PSROIPooling") +.add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") +.add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " +"[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " +"of designated region of interest. batch_index indicates the index of corresponding image " +"in the input data") +.add_arguments(PSROIPoolingParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu new file mode 100644 index 000000000000..3abd10bbdd29 --- /dev/null +++ b/src/operator/contrib/psroi_pooling.cu @@ -0,0 +1,260 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling.cu + * \brief psroi pooling operator + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#include "./psroi_pooling-inl.h" +#include +#include +#include +#include +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" + +#define PSROIPOOLING_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) +#define CUDA_KERNEL_LOOP(i, n) \ +for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +namespace mshadow { +namespace cuda { + +template +__global__ void PSROIPoolForwardKernel( + const int count, + const DType* bottom_data, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const DType* bottom_rois, + const int output_dim, + const int group_size, + DType* top_data) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph) * bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0),width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + int c = (ctop*group_size + gh)*group_size + gw; + + const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; + DType out_sum = 0; + for (int h = hstart; h < hend; ++h){ + for (int w = wstart; w < wend; ++w){ + int bottom_index = h*width + w; + out_sum += offset_bottom_data[bottom_index]; + } + } + + DType bin_area = (hend - hstart)*(wend - wstart); + top_data[index] = is_empty? (DType)0. 
: out_sum/bin_area; + } +} + +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + DType *top_data = out.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int pooled_height = out.size(2); + const int pooled_width = out.size(3); + cudaStream_t stream = Stream::GetStream(out.stream_); + PSROIPoolForwardKernel << > >( + count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, output_dim_, group_size_, top_data); + PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); +} + + +template +__global__ void PSROIPoolBackwardAccKernel( + const int count, + const DType* top_diff, + const int num_rois, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int group_size, + const int output_dim, + DType* bottom_diff, + const DType* bottom_rois) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph)* bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Compute c at bottom + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + int c = (ctop*group_size + gh)*group_size + gw; + DType* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; + DType bin_area = (hend - hstart)*(wend - wstart); + DType diff_val = is_empty ? (DType)0. 
: top_diff[index] / bin_area; + for (int h = hstart; h < hend; ++h){ + for (int w = wstart; w < wend; ++w){ + int bottom_index = h*width + w; + atomicAdd(offset_bottom_diff + bottom_index, diff_val); + } + } + } +} + + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + // LOG(INFO) << "PSROIPoolBackward"; + const DType *top_diff = out_grad.dptr_; + const DType *bottom_rois = bbox.dptr_; + DType *bottom_diff = in_grad.dptr_; + const int count = out_grad.shape_.Size(); + const int num_rois = bbox.size(0); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int pooled_height = out_grad.size(2); + const int pooled_width = out_grad.size(3); + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + PSROIPoolBackwardAccKernel << > >( + count, top_diff, num_rois, spatial_scale, channels, height, width, + pooled_height, pooled_width, group_size_, output_dim_, bottom_diff, bottom_rois); + PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); +} + +} // namespace cuda + +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + cuda::PSROIPoolForward(out, data, bbox, spatial_scale, output_dim_, group_size_); +} + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + cuda::PSROIPoolBackwardAcc(in_grad, out_grad, bbox, spatial_scale, output_dim_, group_size_); +} + +} // namespace mshadow + + +namespace mxnet { +namespace op { + +template<> +Operator* CreateOp(PSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new PSROIPoolingOp(param); + }); + return op; +} + +} // namespace op +} // namespace mxnet From db07e7c6d91f759b1a7d81f82345155e9edfe447 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Wed, 17 May 2017 16:45:41 +0800 Subject: [PATCH 02/18] fix programming rule to meet pr rule --- src/operator/contrib/channel_operator-inl.h | 696 ++++++++------- src/operator/contrib/channel_operator.cc | 54 +- src/operator/contrib/channel_operator.cu | 412 +++++---- .../contrib/deformable_convolution-inl.h | 828 +++++++++--------- .../contrib/deformable_convolution.cu | 26 +- .../contrib/deformable_psroi_pooling-inl.h | 467 +++++----- .../contrib/deformable_psroi_pooling.cc | 63 +- .../contrib/deformable_psroi_pooling.cu | 630 ++++++------- src/operator/contrib/multi_proposal-inl.h | 6 +- src/operator/contrib/multi_proposal.cu | 205 +++-- src/operator/contrib/nn/deformable_im2col.h | 8 +- src/operator/contrib/psroi_pooling-inl.h | 7 +- src/operator/contrib/psroi_pooling.cc | 4 +- src/operator/contrib/psroi_pooling.cu | 14 +- 14 files changed, 1701 insertions(+), 1719 deletions(-) diff --git a/src/operator/contrib/channel_operator-inl.h b/src/operator/contrib/channel_operator-inl.h index a5cd9a9172e4..7d8bcc499763 100644 --- a/src/operator/contrib/channel_operator-inl.h +++ b/src/operator/contrib/channel_operator-inl.h @@ -19,399 +19,393 @@ #include "../operator_common.h" namespace mxnet { - namespace op { - // Declare enumeration of input order to make code more intuitive. 
- // // These enums are only visible within this header - namespace channel_operator { - enum ChannelOperatorOpInputs { kData, kPick }; - enum ChannelOperatorOpOutputs { kOut, kTemp }; - enum ChannelOperatorOpType { kGroupMax, kGroupPick, kGroupSoftmax }; - enum ChannelOperatorOpPickType { kLabel, kScore }; - } // activation - - struct ChannelOperatorParam : public dmlc::Parameter { - // use int for enumeration - int op_type; - int group; - int pick_type; - DMLC_DECLARE_PARAMETER(ChannelOperatorParam) { - DMLC_DECLARE_FIELD(op_type) - .add_enum("Group_Max", channel_operator::kGroupMax) - .add_enum("Group_Pick", channel_operator::kGroupPick) - .add_enum("Group_Softmax", channel_operator::kGroupSoftmax) - .describe("Channel operator to be applied."); - DMLC_DECLARE_FIELD(group).describe("group size"); - DMLC_DECLARE_FIELD(pick_type) - .add_enum("Label_Pick", channel_operator::kLabel) - .add_enum("Score_Pick", channel_operator::kScore) - .set_default(channel_operator::kLabel) - .describe("pick type"); +namespace op { + // Declare enumeration of input order to make code more intuitive. + // // These enums are only visible within this header +namespace channel_operator { + enum ChannelOperatorOpInputs { kData, kPick }; + enum ChannelOperatorOpOutputs { kOut, kTemp }; + enum ChannelOperatorOpType { kGroupMax, kGroupPick, kGroupSoftmax }; + enum ChannelOperatorOpPickType { kLabel, kScore }; +} // activation + + struct ChannelOperatorParam : public dmlc::Parameter { + // use int for enumeration + int op_type; + int group; + int pick_type; + DMLC_DECLARE_PARAMETER(ChannelOperatorParam) { + DMLC_DECLARE_FIELD(op_type) + .add_enum("Group_Max", channel_operator::kGroupMax) + .add_enum("Group_Pick", channel_operator::kGroupPick) + .add_enum("Group_Softmax", channel_operator::kGroupSoftmax) + .describe("Channel operator to be applied."); + DMLC_DECLARE_FIELD(group).describe("group size"); + DMLC_DECLARE_FIELD(pick_type) + .add_enum("Label_Pick", channel_operator::kLabel) + .add_enum("Score_Pick", channel_operator::kScore) + .set_default(channel_operator::kLabel) + .describe("pick type"); + } + }; + + /** + * \brief This is the implementation of channel operator. + * \tparam xpu The device that the op will be executed on. + */ + template + class ChannelOperatorOp : public Operator { + public: + explicit ChannelOperatorOp(ChannelOperatorParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + size_t in_expected; + size_t out_expected; + if (param_.op_type == channel_operator::kGroupMax) { + in_expected = 1; + out_expected = 2; + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + in_expected = 1; + out_expected = 1; + } else if (param_.op_type == channel_operator::kGroupPick) { + in_expected = 2; + if (param_.pick_type == channel_operator::kLabel) + out_expected = 1; + else + out_expected = 2; + } else { + LOG(FATAL) << "No that operation type."; } - }; - - /** - * \brief This is the implementation of channel operator. - * \tparam xpu The device that the op will be executed on. 
- */ - template - class ChannelOperatorOp : public Operator { - public: - explicit ChannelOperatorOp(ChannelOperatorParam p) { - this->param_ = p; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + Stream *s = ctx.get_stream(); + if (param_.op_type == channel_operator::kGroupSoftmax) { + int total_size = in_data[channel_operator::kData].Size(); + int batch_size = in_data[channel_operator::kData].shape_[0]; + int channel_num = in_data[channel_operator::kData].shape_[1]; + int rest_size = total_size / (batch_size * channel_num); + const Shape<3> data_shape = + Shape3(batch_size*param_.group, channel_num / param_.group, rest_size); + + Tensor data = + in_data[channel_operator::kData].get_with_shape(data_shape, s); + Tensor out = + out_data[channel_operator::kOut].get_with_shape(data_shape, s); + Softmax(out, data); } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - size_t in_expected; - size_t out_expected; - if (param_.op_type == channel_operator::kGroupMax) { - in_expected = 1; - out_expected = 2; - } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - in_expected = 1; - out_expected = 1; - } - else if (param_.op_type == channel_operator::kGroupPick) { - in_expected = 2; - if (param_.pick_type == channel_operator::kLabel) - out_expected = 1; - else - out_expected = 2; - } - else { - LOG(FATAL) << "No that operation type."; - } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - if (param_.op_type == channel_operator::kGroupSoftmax) { - int total_size = in_data[channel_operator::kData].Size(); - int batch_size = in_data[channel_operator::kData].shape_[0]; - int channel_num = in_data[channel_operator::kData].shape_[1]; - int rest_size = total_size / (batch_size * channel_num); - const Shape<3> data_shape = Shape3(batch_size*param_.group, channel_num / param_.group, rest_size); - - Tensor data = in_data[channel_operator::kData].get_with_shape(data_shape, s); - Tensor out = out_data[channel_operator::kOut].get_with_shape(data_shape, s); - Softmax(out, data); - } - else if (param_.op_type == channel_operator::kGroupMax) { - Tensor data = in_data[channel_operator::kData].get(s); - Tensor out = out_data[channel_operator::kOut].get(s); - Tensor max_idx = out_data[channel_operator::kTemp].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(max_idx.CheckContiguous(), true); - - GroupMaxForward(out, data, max_idx, param_.group); - } - else if (param_.op_type == channel_operator::kGroupPick) { - Tensor data = in_data[channel_operator::kData].get(s); - Tensor out = out_data[channel_operator::kOut].get(s); - Tensor pick_idx = in_data[channel_operator::kPick].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(pick_idx.CheckContiguous(), true); - - if (param_.pick_type == channel_operator::kScore) { - Tensor argmax_data = out_data[channel_operator::kTemp].get(s); - GetMaxIdx(pick_idx, argmax_data, param_.group); - GroupPickForward(out, data, argmax_data, param_.group); - } - else { - GroupPickForward(out, data, pick_idx, param_.group); - } + else if (param_.op_type == channel_operator::kGroupMax) { + Tensor data = in_data[channel_operator::kData].get(s); + Tensor out = out_data[channel_operator::kOut].get(s); + Tensor 
max_idx = out_data[channel_operator::kTemp].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(max_idx.CheckContiguous(), true); + + GroupMaxForward(out, data, max_idx, param_.group); + } + else if (param_.op_type == channel_operator::kGroupPick) { + Tensor data = in_data[channel_operator::kData].get(s); + Tensor out = out_data[channel_operator::kOut].get(s); + Tensor pick_idx = in_data[channel_operator::kPick].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(pick_idx.CheckContiguous(), true); + + if (param_.pick_type == channel_operator::kScore) { + Tensor argmax_data = + out_data[channel_operator::kTemp].get(s); + GetMaxIdx(pick_idx, argmax_data, param_.group); + GroupPickForward(out, data, argmax_data, param_.group); } else { - LOG(FATAL) << "No that operation type."; + GroupPickForward(out, data, pick_idx, param_.group); } - + } + else { + LOG(FATAL) << "No that operation type."; } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - - size_t in_expected; - size_t out_expected; - if (param_.op_type == channel_operator::kGroupMax) { - in_expected = 1; - out_expected = 2; - } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - in_expected = 1; + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + + size_t in_expected; + size_t out_expected; + if (param_.op_type == channel_operator::kGroupMax) { + in_expected = 1; + out_expected = 2; + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + in_expected = 1; + out_expected = 1; + } else if (param_.op_type == channel_operator::kGroupPick) { + in_expected = 2; + if (param_.pick_type == channel_operator::kLabel) out_expected = 1; - } - else if (param_.op_type == channel_operator::kGroupPick) { - in_expected = 2; - if (param_.pick_type == channel_operator::kLabel) - out_expected = 1; - else - out_expected = 2; - } - else { - LOG(FATAL) << "No that operation type."; - } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - - if (param_.op_type == channel_operator::kGroupMax) { - Tensor grad_out = out_grad[channel_operator::kOut].get(s); - Tensor max_idx = out_data[channel_operator::kTemp].get(s); - Tensor grad_in = in_grad[channel_operator::kData].get(s); - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(max_idx.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[channel_operator::kData], 0); - GroupMaxBackwardAcc(grad_in, grad_out, max_idx, param_.group); - } - else if (param_.op_type == channel_operator::kGroupSoftmax) { + else + out_expected = 2; + } else { + LOG(FATAL) << "No that operation type."; + } + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + Stream *s = ctx.get_stream(); + + if (param_.op_type == channel_operator::kGroupMax) { + Tensor grad_out = out_grad[channel_operator::kOut].get(s); + Tensor max_idx = out_data[channel_operator::kTemp].get(s); + Tensor grad_in = 
in_grad[channel_operator::kData].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(max_idx.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[channel_operator::kData], 0); + GroupMaxBackwardAcc(grad_in, grad_out, max_idx, param_.group); + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + LOG(FATAL) << "Not Implemented."; + } else if (param_.op_type == channel_operator::kGroupPick) { + Tensor grad_out = out_grad[channel_operator::kOut].get(s); + Tensor pick_idx = in_data[channel_operator::kPick].get(s); + Tensor grad_in = in_grad[channel_operator::kData].get(s); + Tensor pick_diff = in_grad[channel_operator::kPick].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(pick_idx.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[channel_operator::kData], 0); + Assign(pick_diff, req[channel_operator::kPick], 0); + if (param_.pick_type == channel_operator::kScore) { LOG(FATAL) << "Not Implemented."; - } - else if (param_.op_type == channel_operator::kGroupPick) { - Tensor grad_out = out_grad[channel_operator::kOut].get(s); - Tensor pick_idx = in_data[channel_operator::kPick].get(s); - Tensor grad_in = in_grad[channel_operator::kData].get(s); - Tensor pick_diff = in_grad[channel_operator::kPick].get(s); - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(pick_idx.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[channel_operator::kData], 0); - Assign(pick_diff, req[channel_operator::kPick], 0); - if (param_.pick_type == channel_operator::kScore) { - LOG(FATAL) << "Not Implemented."; - } - else { - GroupPickBackwardAcc(grad_in, grad_out, pick_idx, param_.group); - } - - } - else { - LOG(FATAL) << "No that operation type."; + } else { + GroupPickBackwardAcc(grad_in, grad_out, pick_idx, param_.group); } + } else { + LOG(FATAL) << "No that operation type."; } - private: - ChannelOperatorParam param_; - }; // class ChannelOperatorOp - // Decalre Factory function, used for dispatch specialization - template - Operator* CreateOp(ChannelOperatorParam type, int dtype); + } + private: + ChannelOperatorParam param_; + }; // class ChannelOperatorOp + + // Decalre Factory function, used for dispatch specialization + template + Operator* CreateOp(ChannelOperatorParam type, int dtype); #if DMLC_USE_CXX11 - class ChannelOperatorProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.op_type == channel_operator::kGroupMax || - param_.op_type == channel_operator::kGroupSoftmax) { - return{ "data" }; - } - else if (param_.op_type == channel_operator::kGroupPick) { - return{ "data", "pick_idx" }; - } - else { - LOG(FATAL) << "No that operation type."; - return{}; - } + class ChannelOperatorProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.op_type == channel_operator::kGroupMax || + param_.op_type == channel_operator::kGroupSoftmax) { + return{ "data" }; + } + else if (param_.op_type == channel_operator::kGroupPick) { + return{ "data", "pick_idx" }; + } + else { + LOG(FATAL) << "No that operation type."; + return{}; } + } - std::vector ListOutputs() const override { - if (param_.op_type == channel_operator::kGroupSoftmax) { + std::vector ListOutputs() const override { + if (param_.op_type == channel_operator::kGroupSoftmax) { + return{ "output" }; + } + else if (param_.op_type == channel_operator::kGroupPick) { + if (param_.pick_type 
== channel_operator::kLabel) return{ "output" }; - } - else if (param_.op_type == channel_operator::kGroupPick) { - if (param_.pick_type == channel_operator::kLabel) - return{ "output" }; - else - return{ "output", "argmax_data" }; - } - else if (param_.op_type == channel_operator::kGroupMax) { - return{ "output", "max_idx" }; - } - else { - LOG(FATAL) << "No that operation type."; - return{}; - } + else + return{ "output", "argmax_data" }; } + else if (param_.op_type == channel_operator::kGroupMax) { + return{ "output", "max_idx" }; + } + else { + LOG(FATAL) << "No that operation type."; + return{}; + } + } - int NumOutputs() const override { - if (param_.op_type == channel_operator::kGroupSoftmax) { + int NumOutputs() const override { + if (param_.op_type == channel_operator::kGroupSoftmax) { + return 1; + } + else if (param_.op_type == channel_operator::kGroupPick) { + if (param_.pick_type == channel_operator::kLabel) return 1; - } - else if (param_.op_type == channel_operator::kGroupPick) { - if (param_.pick_type == channel_operator::kLabel) - return 1; - else - return 2; - } - else if (param_.op_type == channel_operator::kGroupMax) { + else return 2; - } - else { - LOG(FATAL) << "No that operation type."; - return NULL; - } } - - int NumVisibleOutputs() const override { - return 1; + else if (param_.op_type == channel_operator::kGroupMax) { + return 2; } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); + else { + LOG(FATAL) << "No that operation type."; + return NULL; } - - std::map GetParams() const override { - return param_.__DICT__(); + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + + + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(channel_operator::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + if (param_.op_type == channel_operator::kGroupMax) { + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + // out: [num_rois, group, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], param_.group, dshape[2], dshape[3])); + out_shape->push_back( + Shape4(dshape[0], param_.group, dshape[2], dshape[3])); + return true; } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - - - // data: [batch_size, c, h, w] - TShape dshape = in_shape->at(channel_operator::kData); - CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; - - if (param_.op_type == channel_operator::kGroupMax) { - CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - // out: [num_rois, group, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], param_.group, dshape[2], dshape[3])); - out_shape->push_back( - Shape4(dshape[0], param_.group, dshape[2], dshape[3])); - return true; - } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - // out: [num_rois, c, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], dshape[1], dshape[2], dshape[3])); - return true; - } - else if (param_.op_type == channel_operator::kGroupPick) { - CHECK_EQ(in_shape->size(), 2) << "Input:[data, pick_idx]"; - // out: [num_rois, c/group, h, w] - out_shape->clear(); + else if 
(param_.op_type == channel_operator::kGroupSoftmax) { + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + // out: [num_rois, c, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], dshape[1], dshape[2], dshape[3])); + return true; + } + else if (param_.op_type == channel_operator::kGroupPick) { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, pick_idx]"; + // out: [num_rois, c/group, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], dshape[1] / param_.group, dshape[2], dshape[3])); + if (param_.pick_type == channel_operator::kScore) { out_shape->push_back( - Shape4(dshape[0], dshape[1] / param_.group, dshape[2], dshape[3])); - if (param_.pick_type == channel_operator::kScore) { - out_shape->push_back( - Shape4(dshape[0], 1, 1, 1)); - } - return true; + Shape4(dshape[0], 1, 1, 1)); } - else { - LOG(FATAL) << "No that operation type."; - return false; - } - + return true; + } + else { + LOG(FATAL) << "No that operation type."; + return false; } - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "Input must have specified type"; + } - if (param_.op_type == channel_operator::kGroupMax) { - CHECK_EQ(in_type->size(), 1); - out_type->clear(); - out_type->push_back(dtype); - out_type->push_back(dtype); - return true; - } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - CHECK_EQ(in_type->size(), 1); - out_type->clear(); - out_type->push_back(dtype); - return true; - } - else if (param_.op_type == channel_operator::kGroupPick) { - CHECK_EQ(in_type->size(), 2); - out_type->clear(); + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "Input must have specified type"; + + if (param_.op_type == channel_operator::kGroupMax) { + CHECK_EQ(in_type->size(), 1); + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + CHECK_EQ(in_type->size(), 1); + out_type->clear(); + out_type->push_back(dtype); + return true; + } + else if (param_.op_type == channel_operator::kGroupPick) { + CHECK_EQ(in_type->size(), 2); + out_type->clear(); + out_type->push_back(dtype); + if (param_.pick_type == channel_operator::kScore) { out_type->push_back(dtype); - if (param_.pick_type == channel_operator::kScore) { - out_type->push_back(dtype); - } - return true; } - else { - LOG(FATAL) << "No that operation type."; - return false; - } - + return true; } - - OperatorProperty* Copy() const override { - ChannelOperatorProp* channel_operator_sym = new ChannelOperatorProp(); - channel_operator_sym->param_ = this->param_; - return channel_operator_sym; + else { + LOG(FATAL) << "No that operation type."; + return false; } - std::string TypeString() const override { - return "_contrib_ChannelOperator"; + } + + OperatorProperty* Copy() const override { + ChannelOperatorProp* channel_operator_sym = new ChannelOperatorProp(); + channel_operator_sym->param_ = this->param_; + return channel_operator_sym; + } + + std::string TypeString() const override { + return "_contrib_ChannelOperator"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.op_type == channel_operator::kGroupMax) { + return{ 
out_grad[channel_operator::kOut], + out_data[channel_operator::kTemp] }; } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - if (param_.op_type == channel_operator::kGroupMax) { - return{ out_grad[channel_operator::kOut], - out_data[channel_operator::kTemp] }; - } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - return{ out_grad[channel_operator::kOut], - out_data[channel_operator::kOut] }; - } - else if (param_.op_type == channel_operator::kGroupPick) { - return{ out_grad[channel_operator::kOut], - in_data[channel_operator::kPick] }; - } - else { - LOG(FATAL) << "No that operation type."; - return{}; - } + else if (param_.op_type == channel_operator::kGroupSoftmax) { + return{ out_grad[channel_operator::kOut], + out_data[channel_operator::kOut] }; + } + else if (param_.op_type == channel_operator::kGroupPick) { + return{ out_grad[channel_operator::kOut], + in_data[channel_operator::kPick] }; + } + else { + LOG(FATAL) << "No that operation type."; + return{}; } + } - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; - private: - ChannelOperatorParam param_; - }; // class PSROIPoolingAlignProp + private: + ChannelOperatorParam param_; + }; // class PSROIPoolingAlignProp #endif - } // namespace op +} // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ \ No newline at end of file +#endif // MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ diff --git a/src/operator/contrib/channel_operator.cc b/src/operator/contrib/channel_operator.cc index 0c9c82fa554d..ae49244e0866 100644 --- a/src/operator/contrib/channel_operator.cc +++ b/src/operator/contrib/channel_operator.cc @@ -63,34 +63,34 @@ namespace mshadow { } // namespace mshadow namespace mxnet { - namespace op { +namespace op { - template<> - Operator *CreateOp(ChannelOperatorParam param, int dtype) { - Operator* op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ChannelOperatorOp(param); - }); - return op; - } + template<> + Operator *CreateOp(ChannelOperatorParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ChannelOperatorOp(param); + }); + return op; + } - Operator *ChannelOperatorProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - std::vector out_type, aux_type; - CHECK(InferType(in_type, &out_type, &aux_type)); - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); - } + Operator *ChannelOperatorProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); + } - DMLC_REGISTER_PARAMETER(ChannelOperatorParam); + DMLC_REGISTER_PARAMETER(ChannelOperatorParam); - 
MXNET_REGISTER_OP_PROPERTY(_contrib_ChannelOperator, ChannelOperatorProp) - .describe("Performs channel operation on inputs, including GroupMax, GroupSoftmax, GroupPick " - "and ChannelPick. This layer is designed for FCIS ") - .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") - .add_argument("pick_idx", "Symbol", "In GroupPick or ChannelPick mode, pick_idx is used to" - "pick specific group or channel") - .add_arguments(ChannelOperatorParam::__FIELDS__()); - } // namespace op -} // namespace mxnet \ No newline at end of file + MXNET_REGISTER_OP_PROPERTY(_contrib_ChannelOperator, ChannelOperatorProp) + .describe("Performs channel operation on inputs, including GroupMax, GroupSoftmax, GroupPick " + "and ChannelPick. This layer is designed for FCIS ") + .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") + .add_argument("pick_idx", "Symbol", "In GroupPick or ChannelPick mode, pick_idx is used to" + "pick specific group or channel") + .add_arguments(ChannelOperatorParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/channel_operator.cu b/src/operator/contrib/channel_operator.cu index bbf9ad2b43db..30f51e338754 100644 --- a/src/operator/contrib/channel_operator.cu +++ b/src/operator/contrib/channel_operator.cu @@ -25,226 +25,224 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i += blockDim.x * gridDim.x) namespace mshadow { - namespace cuda { - template - __global__ void GroupMaxForwardKernel( - const int count, - const DType* bottom_data, - const int channels, - const int group, - const int channels_in_group, - const int spatial_dim, - DType* top_data, - DType* max_idx_data) { - CUDA_KERNEL_LOOP(index, count) { +namespace cuda { + template + __global__ void GroupMaxForwardKernel( + const int count, + const DType* bottom_data, + const int channels, + const int group, + const int channels_in_group, + const int spatial_dim, + DType* top_data, + DType* max_idx_data) { + CUDA_KERNEL_LOOP(index, count) { + int s = index % spatial_dim; + int g = (index / spatial_dim) % group; + int n = index / spatial_dim / group; - int s = index % spatial_dim; - int g = (index / spatial_dim) % group; - int n = index / spatial_dim / group; - - DType max_val = -FLT_MAX; - int max_idx = -1; - for (int i = 0; i < channels_in_group; ++i) { - int c = g*channels_in_group + i; - int bottom_index = (n*channels + c)*spatial_dim + s; - if (bottom_data[bottom_index]>max_val) { - max_val = bottom_data[bottom_index]; - max_idx = c; - } + DType max_val = -FLT_MAX; + int max_idx = -1; + for (int i = 0; i < channels_in_group; ++i) { + int c = g*channels_in_group + i; + int bottom_index = (n*channels + c)*spatial_dim + s; + if (bottom_data[bottom_index] > max_val) { + max_val = bottom_data[bottom_index]; + max_idx = c; } - top_data[index] = max_val; - max_idx_data[index] = max_idx; } + top_data[index] = max_val; + max_idx_data[index] = max_idx; } + } - template - __global__ void GroupPickForwardKernel( - const int count, - const DType* bottom_data, - const int channels, - const int group, - const int channels_in_group, - const int spatial_dim, - DType* top_data, - const DType* pick_idx_data) { - CUDA_KERNEL_LOOP(index, count) { - - int s = index % spatial_dim; - int c = (index / spatial_dim) % channels_in_group; - int n = index / spatial_dim / channels_in_group; - int g = pick_idx_data[n]; - int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; + template + __global__ void 
GroupPickForwardKernel( + const int count, + const DType* bottom_data, + const int channels, + const int group, + const int channels_in_group, + const int spatial_dim, + DType* top_data, + const DType* pick_idx_data) { + CUDA_KERNEL_LOOP(index, count) { + int s = index % spatial_dim; + int c = (index / spatial_dim) % channels_in_group; + int n = index / spatial_dim / channels_in_group; + int g = pick_idx_data[n]; + int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; - top_data[index] = (g < group && g >= 0) ? bottom_data[bottom_index] : DType(0); - } + top_data[index] = (g < group && g >= 0) ? bottom_data[bottom_index] : DType(0); } + } - template - inline void GroupMaxForward(const Tensor &out, - const Tensor &data, - const Tensor &max_idx, - const int group) { - // LOG(INFO) << "GroupMaxForward"; - const DType *bottom_data = data.dptr_; - DType *top_data = out.dptr_; - DType *max_idx_data = max_idx.dptr_; - const int count = out.shape_.Size(); - const int channels = data.size(1); - const int height = data.size(2); - const int width = data.size(3); - const int spatial_dim = height * width; - const int channels_in_group = channels / group; - cudaStream_t stream = Stream::GetStream(out.stream_); - GroupMaxForwardKernel << > >( - count, bottom_data, channels, group, - channels_in_group, spatial_dim, top_data, max_idx_data); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } + template + inline void GroupMaxForward(const Tensor &out, + const Tensor &data, + const Tensor &max_idx, + const int group) { + // LOG(INFO) << "GroupMaxForward"; + const DType *bottom_data = data.dptr_; + DType *top_data = out.dptr_; + DType *max_idx_data = max_idx.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int spatial_dim = height * width; + const int channels_in_group = channels / group; + cudaStream_t stream = Stream::GetStream(out.stream_); + GroupMaxForwardKernel << > >( + count, bottom_data, channels, group, + channels_in_group, spatial_dim, top_data, max_idx_data); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } - template - inline void GroupPickForward(const Tensor &out, - const Tensor &data, - const Tensor &pick_idx, - const int group) { - // LOG(INFO) << "GroupPickForward"; - const DType *bottom_data = data.dptr_; - DType *top_data = out.dptr_; - const DType *pick_idx_data = pick_idx.dptr_; - const int count = out.shape_.Size(); - const int channels = data.size(1); - const int height = data.size(2); - const int width = data.size(3); - const int spatial_dim = height * width; - const int channels_in_group = channels / group; - cudaStream_t stream = Stream::GetStream(out.stream_); - GroupPickForwardKernel << > >( - count, bottom_data, channels, group, - channels_in_group, spatial_dim, top_data, pick_idx_data); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } + template + inline void GroupPickForward(const Tensor &out, + const Tensor &data, + const Tensor &pick_idx, + const int group) { + // LOG(INFO) << "GroupPickForward"; + const DType *bottom_data = data.dptr_; + DType *top_data = out.dptr_; + const DType *pick_idx_data = pick_idx.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int spatial_dim = height * width; + const int channels_in_group = channels / group; + cudaStream_t stream = Stream::GetStream(out.stream_); + GroupPickForwardKernel 
<< > >( + count, bottom_data, channels, group, + channels_in_group, spatial_dim, top_data, pick_idx_data); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } - template - __global__ void GroupMaxBackwardAccKernel( - const int count, - const DType* top_diff, - const DType* max_idx_data, - const int channels, - const int group, - const int spatial_dim, - DType* bottom_diff) { - CUDA_KERNEL_LOOP(index, count) { - int s = index % spatial_dim; - int n = index / spatial_dim / group; + template + __global__ void GroupMaxBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* max_idx_data, + const int channels, + const int group, + const int spatial_dim, + DType* bottom_diff) { + CUDA_KERNEL_LOOP(index, count) { + int s = index % spatial_dim; + int n = index / spatial_dim / group; - int c = max_idx_data[index]; - int bottom_index = (n*channels + c)*spatial_dim + s; - bottom_diff[bottom_index] = top_diff[index]; - } + int c = max_idx_data[index]; + int bottom_index = (n*channels + c)*spatial_dim + s; + bottom_diff[bottom_index] = top_diff[index]; } + } - template - __global__ void GroupPickBackwardAccKernel( - const int count, - const DType* top_diff, - const DType* pick_idx_data, - const int channels, - const int group, - const int channels_in_group, - const int spatial_dim, - DType* bottom_diff) { - CUDA_KERNEL_LOOP(index, count) { - int s = index % spatial_dim; - int c = (index / spatial_dim) % channels_in_group; - int n = index / spatial_dim / channels_in_group; - int g = pick_idx_data[n]; + template + __global__ void GroupPickBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* pick_idx_data, + const int channels, + const int group, + const int channels_in_group, + const int spatial_dim, + DType* bottom_diff) { + CUDA_KERNEL_LOOP(index, count) { + int s = index % spatial_dim; + int c = (index / spatial_dim) % channels_in_group; + int n = index / spatial_dim / channels_in_group; + int g = pick_idx_data[n]; - int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; - bottom_diff[bottom_index] = (g < group && g >= 0) ? top_diff[index] : DType(0); - } + int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; + bottom_diff[bottom_index] = (g < group && g >= 0) ? 
top_diff[index] : DType(0); } + } - template - inline void GroupMaxBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &max_idx, - const int group) { - // LOG(INFO) << "GroupMaxBackward"; - const DType *top_diff = out_grad.dptr_; - DType *bottom_diff = in_grad.dptr_; - const DType *max_idx_data = max_idx.dptr_; - const int count = out_grad.shape_.Size(); - const int channels = in_grad.size(1); - const int height = in_grad.size(2); - const int width = in_grad.size(3); - const int spatial_dim = height * width; - cudaStream_t stream = Stream::GetStream(in_grad.stream_); - GroupMaxBackwardAccKernel << > >( - count, top_diff, max_idx_data, channels, group, spatial_dim, bottom_diff); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } + template + inline void GroupMaxBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &max_idx, + const int group) { + // LOG(INFO) << "GroupMaxBackward"; + const DType *top_diff = out_grad.dptr_; + DType *bottom_diff = in_grad.dptr_; + const DType *max_idx_data = max_idx.dptr_; + const int count = out_grad.shape_.Size(); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int spatial_dim = height * width; + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + GroupMaxBackwardAccKernel << > >( + count, top_diff, max_idx_data, channels, group, spatial_dim, bottom_diff); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } - template - inline void GroupPickBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &pick_idx, - const int group) { - // LOG(INFO) << "GroupPickBackward"; - const DType *top_diff = out_grad.dptr_; - DType *bottom_diff = in_grad.dptr_; - const DType *pick_idx_data = pick_idx.dptr_; - const int count = out_grad.shape_.Size(); - const int channels = in_grad.size(1); - const int height = in_grad.size(2); - const int width = in_grad.size(3); - const int spatial_dim = height * width; - const int channels_in_group = channels / group; - cudaStream_t stream = Stream::GetStream(in_grad.stream_); - GroupPickBackwardAccKernel << > >( - count, top_diff, pick_idx_data, channels, group, - channels_in_group, spatial_dim, bottom_diff); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } - // GetMaxIdx - template - __global__ void GetMaxIdxKernel( - const int count, - const DType* pick_score_data, - DType* argmax_data, - const int group) { - CUDA_KERNEL_LOOP(index, count) { - const DType* offset_pick_score_data = pick_score_data + index*group; - int max_idx = -1; - DType max_val = -FLT_MAX; - for (int i = 1; i < group; ++i) { - max_idx = offset_pick_score_data[i] > max_val ? i : max_idx; - max_val = offset_pick_score_data[i] > max_val ? 
offset_pick_score_data[i] : max_val; - } - argmax_data[index] = static_cast(max_idx); + template + inline void GroupPickBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &pick_idx, + const int group) { + // LOG(INFO) << "GroupPickBackward"; + const DType *top_diff = out_grad.dptr_; + DType *bottom_diff = in_grad.dptr_; + const DType *pick_idx_data = pick_idx.dptr_; + const int count = out_grad.shape_.Size(); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int spatial_dim = height * width; + const int channels_in_group = channels / group; + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + GroupPickBackwardAccKernel << > >( + count, top_diff, pick_idx_data, channels, group, + channels_in_group, spatial_dim, bottom_diff); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } + // GetMaxIdx + template + __global__ void GetMaxIdxKernel( + const int count, + const DType* pick_score_data, + DType* argmax_data, + const int group) { + CUDA_KERNEL_LOOP(index, count) { + const DType* offset_pick_score_data = pick_score_data + index*group; + int max_idx = -1; + DType max_val = -FLT_MAX; + for (int i = 1; i < group; ++i) { + max_idx = offset_pick_score_data[i] > max_val ? i : max_idx; + max_val = offset_pick_score_data[i] > max_val ? offset_pick_score_data[i] : max_val; } + argmax_data[index] = static_cast(max_idx); } + } - template - inline void GetMaxIdx(const Tensor &pick_score, - const Tensor &argmax, - const int group) { - // LOG(INFO) << "GroupPickBackward"; - const DType *pick_score_data = pick_score.dptr_; - DType *argmax_data = argmax.dptr_; - const int count = argmax.shape_.Size(); + template + inline void GetMaxIdx(const Tensor &pick_score, + const Tensor &argmax, + const int group) { + // LOG(INFO) << "GroupPickBackward"; + const DType *pick_score_data = pick_score.dptr_; + DType *argmax_data = argmax.dptr_; + const int count = argmax.shape_.Size(); - cudaStream_t stream = Stream::GetStream(argmax.stream_); - GetMaxIdxKernel << > >( - count, pick_score_data, argmax_data, group); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } - } // namespace cuda + cudaStream_t stream = Stream::GetStream(argmax.stream_); + GetMaxIdxKernel << > >( + count, pick_score_data, argmax_data, group); + ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); + } +} // namespace cuda template inline void GroupMaxForward(const Tensor &out, @@ -288,16 +286,16 @@ namespace mshadow { namespace mxnet { - namespace op { +namespace op { - template<> - Operator* CreateOp(ChannelOperatorParam param, int dtype) { - Operator* op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ChannelOperatorOp(param); - }); - return op; - } + template<> + Operator* CreateOp(ChannelOperatorParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ChannelOperatorOp(param); + }); + return op; + } - } // namespace op +} // namespace op } // namespace mxnet diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index ccc6bb36e72b..bac840774cd6 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -7,8 +7,8 @@ * \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai */ -#ifndef MXNET_OPERATOR_DEFORMABLE_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_DEFORMABLE_CONVOLUTION_INL_H_ +#ifndef 
MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ #include #include @@ -28,460 +28,454 @@ namespace mxnet { - namespace op { - - namespace conv { - enum DeformableConvolutionOpInputs { kData, kOffset, kWeight, kBias }; - enum DeformableConvolutionOpOutputs { kOut }; - enum DeformableConvolutionOpResource { kTempSpace }; +namespace op { + +namespace conv { + enum DeformableConvolutionOpInputs { kData, kOffset, kWeight, kBias }; + enum DeformableConvolutionOpOutputs { kOut }; + enum DeformableConvolutionOpResource { kTempSpace }; +} + + struct DeformableConvolutionParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape dilate; + TShape pad; + uint32_t num_filter; + uint32_t num_group; + uint32_t num_deformable_group; + uint64_t workspace; + bool no_bias; + dmlc::optional layout; + DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { + DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .describe("convolution stride: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(dilate).set_default(TShape()) + .describe("convolution dilate: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for convolution: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) + .describe("convolution filter(channel) number"); + DMLC_DECLARE_FIELD(num_group).set_default(1) + .describe("Number of group partitions."); + DMLC_DECLARE_FIELD(num_deformable_group).set_default(1) + .describe("Number of deformable group partitions."); + DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) + .describe("Maximum temperal workspace allowed for convolution (MB)."); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(layout) + .add_enum("NCW", mshadow::kNCW) + .add_enum("NCHW", mshadow::kNCHW) + .add_enum("NCDHW", mshadow::kNCDHW) + .set_default(dmlc::optional()) + .describe("Set layout for input, output and weight. Empty for\n " + "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); + } + }; + + template + class DeformableConvolutionOp : public Operator { + public: + explicit DeformableConvolutionOp(DeformableConvolutionParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. 
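// A minimal worked example of the MB-to-elements conversion on the next line
// (an illustrative sketch, not part of the patch; assumes the default
// workspace of 1024 MB and DType = float):
//   uint64_t bytes = 1024ULL << 20;           // 1,073,741,824 bytes
//   uint64_t elems = bytes / sizeof(float);   // 268,435,456 elements
// i.e. after this constructor runs, param_.workspace counts DType elements,
// not megabytes.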
+ param_.workspace = (param_.workspace << 20) / sizeof(DType); + CHECK(param_.layout.value() == mshadow::kNCW || + param_.layout.value() == mshadow::kNCHW || + param_.layout.value() == mshadow::kNCDHW) + << "Only support NCW, NCHW and NCDHW layout"; } - struct DeformableConvolutionParam : public dmlc::Parameter { - TShape kernel; - TShape stride; - TShape dilate; - TShape pad; - uint32_t num_filter; - uint32_t num_group; - uint32_t num_deformable_group; - uint64_t workspace; - bool no_bias; - dmlc::optional layout; - DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { - DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(stride).set_default(TShape()) - .describe("convolution stride: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(dilate).set_default(TShape()) - .describe("convolution dilate: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(pad).set_default(TShape()) - .describe("pad for convolution: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) - .describe("convolution filter(channel) number"); - DMLC_DECLARE_FIELD(num_group).set_default(1) - .describe("Number of group partitions."); - DMLC_DECLARE_FIELD(num_deformable_group).set_default(1) - .describe("Number of deformable group partitions."); - DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) - .describe("Maximum temperal workspace allowed for convolution (MB)."); - DMLC_DECLARE_FIELD(no_bias).set_default(false) - .describe("Whether to disable bias parameter."); - DMLC_DECLARE_FIELD(layout) - .add_enum("NCW", mshadow::kNCW) - .add_enum("NCHW", mshadow::kNCHW) - .add_enum("NCDHW", mshadow::kNCDHW) - .set_default(dmlc::optional()) - .describe("Set layout for input, output and weight. Empty for\n " - "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[conv::kOut], kWriteTo); + size_t expected = param_.no_bias ? 3 : 4; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + LayerSetUp(in_data[conv::kData].shape_, in_data[conv::kOffset].shape_, out_data[conv::kOut].shape_); + Stream* s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_data[0].shape_[i + 1]; } - }; - - template - class DeformableConvolutionOp : public Operator { - public: - explicit DeformableConvolutionOp(DeformableConvolutionParam p) { - this->param_ = p; - // convert MBytes first to Bytes and then to elements. - param_.workspace = (param_.workspace << 20) / sizeof(DType); - CHECK(param_.layout.value() == mshadow::kNCW || - param_.layout.value() == mshadow::kNCHW || - param_.layout.value() == mshadow::kNCDHW) - << "Only support NCW, NCHW and NCDHW layout"; - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(req[conv::kOut], kWriteTo); - size_t expected = param_.no_bias ? 
3 : 4; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(out_data.size(), 1U); - LayerSetUp(in_data[conv::kData].shape_, in_data[conv::kOffset].shape_, out_data[conv::kOut].shape_); - Stream* s = ctx.get_stream(); - // allocate workspace for col_buffer - Tensor workspace = ctx.requested[conv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_), s); - // calculate the shape of col_buffer - TShape col_buffer_shape(num_spatial_axes_ + 1); - col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = out_data[0].shape_[i + 1]; - } - // create a column buffer using workspace and col_buffer_shape - TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); - - // initialize weight and col_buffer 3D tensors for using gemm - index_t M = conv_out_channels_ / group_; - index_t N = conv_out_spatial_dim_; - index_t K = kernel_dim_; - Tensor weight_3d = in_data[conv::kWeight].get_with_shape( - Shape3(group_, M, K), s); - Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, K, N), s); - Tensor output_4d = out_data[conv::kOut].get_with_shape( - Shape4(num_, group_, M, N), s); - for (index_t n = 0; n < num_; ++n) { - // transform image to col_buffer in order to use gemm - deformable_im2col(s, in_data[conv::kData].dptr() + n*input_dim_, - in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_data[conv::kData].shape_, - col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, - col_buffer.dptr()); - Tensor output_3d = output_4d[n]; - for (index_t g = 0; g < group_; ++g) { - ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); - } - } - if (bias_term_) { - Tensor bias = in_data[conv::kBias].get(s); - Tensor output_3d = out_data[conv::kOut].get_with_shape( - Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); - // has bias term, broadcast it to the same shape of output_3d in channel dim - output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_); + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + index_t M = conv_out_channels_ / group_; + index_t N = conv_out_spatial_dim_; + index_t K = kernel_dim_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, M, K), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, K, N), s); + Tensor output_4d = out_data[conv::kOut].get_with_shape( + Shape4(num_, group_, M, N), s); + for (index_t n = 0; n < num_; ++n) { + // transform image to col_buffer in order to use gemm + deformable_im2col(s, in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + col_buffer.dptr()); + Tensor output_3d = output_4d[n]; + for (index_t g = 0; g < group_; ++g) { + ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); } } + if (bias_term_) { + Tensor bias = in_data[conv::kBias].get(s); + Tensor output_3d = out_data[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + // has bias term, broadcast it to the same shape of output_3d in channel dim + output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_); + } + } 
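// A rough sketch of the data flow in Forward() above (illustrative only, not
// part of the patch; C_in, C_out, kH, kW, outH, outW are shorthand for the
// 2-D NCHW case, not identifiers from the code):
//   col_buffer : [C_in * kH * kW, outH * outW], filled per sample by
//                deformable_im2col from data[n] and offset[n]
//   weight_3d  : [group, M, K] with M = C_out / group, K = (C_in / group) * kH * kW
//   output[n]  : for each group g, output_3d[g] (M x N) = weight_3d[g] * col_buffer_3d[g],
//                with N = outH * outW, followed by an optional broadcast bias add
// where the output spatial size follows the usual dilated-convolution formula,
// matching InferShape() below:
//   outH = (H + 2 * pad_h - (dilate_h * (kH - 1) + 1)) / stride_h + 1;
//   outW = (W + 2 * pad_w - (dilate_w * (kW - 1) + 1)) / stride_w + 1;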
- virtual void Backward(const OpContext &ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& out_data, - const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - size_t expected = param_.no_bias == 0 ? 4 : 3; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); - LayerSetUp(in_grad[conv::kData].shape_, in_grad[conv::kOffset].shape_, out_grad[conv::kOut].shape_); - Stream *s = ctx.get_stream(); - // allocate workspace for col_buffer - Tensor workspace = ctx.requested[conv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_), s); - // calculate the shape of col_buffer - TShape col_buffer_shape(num_spatial_axes_ + 1); - col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; + virtual void Backward(const OpContext &ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + size_t expected = param_.no_bias == 0 ? 4 : 3; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); + LayerSetUp(in_grad[conv::kData].shape_, in_grad[conv::kOffset].shape_, out_grad[conv::kOut].shape_); + Stream *s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + // For computing dLoss/d(in_data[kData]) + index_t M = kernel_dim_; + index_t N = conv_out_spatial_dim_; + index_t K = conv_out_channels_ / group_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + Tensor out_grad_4d = out_grad[conv::kOut].get_with_shape( + Shape4(num_, group_, K, N), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, M, N), s); + // For computing dLoss/dWeight + Tensor dweight_3d = in_grad[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + + Tensor data_grad = in_grad[conv::kData].FlatTo1D(s); + data_grad = 0; + + + for (index_t n = 0; n < num_; ++n) { + Tensor out_grad_3d = out_grad_4d[n]; + for (index_t g = 0; g < group_; ++g) { + col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); } - // create a column buffer using workspace and col_buffer_shape - TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); - - // initialize weight and col_buffer 3D tensors for using gemm - // For computing dLoss/d(in_data[kData]) - index_t M = kernel_dim_; - index_t N = conv_out_spatial_dim_; - index_t K = conv_out_channels_ / group_; - Tensor 
weight_3d = in_data[conv::kWeight].get_with_shape( - Shape3(group_, K, M), s); - Tensor out_grad_4d = out_grad[conv::kOut].get_with_shape( - Shape4(num_, group_, K, N), s); - Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, M, N), s); - // For computing dLoss/dWeight - Tensor dweight_3d = in_grad[conv::kWeight].get_with_shape( - Shape3(group_, K, M), s); - - Tensor data_grad = in_grad[conv::kData].FlatTo1D(s); - data_grad = 0; - - - for (index_t n = 0; n < num_; ++n) { - Tensor out_grad_3d = out_grad_4d[n]; - for (index_t g = 0; g < group_; ++g) { - col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); - } - // gradient w.r.t. input coordinate data - deformable_col2im_coord(s, col_buffer.dptr(), - in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kOffset].dptr() + n*input_offset_dim_, - in_grad[conv::kData].shape_, col_buffer.shape_, - param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, - in_grad[conv::kOffset].dptr() + n*input_offset_dim_, - req[conv::kData]); - - // gradient w.r.t. input data - deformable_col2im(s, col_buffer.dptr(), - in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_grad[conv::kData].shape_, col_buffer.shape_, - param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, - in_grad[conv::kData].dptr() + n*input_dim_, - req[conv::kData]); - - // gradient w.r.t. weight, dWeight should accumulate across the batch and group - im2col(s, in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kData].shape_, - col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, - col_buffer.dptr()); - for (index_t g = 0; g < group_; ++g) { - if (0 == n) { - ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], - dot(out_grad_3d[g], col_buffer_3d[g].T())); - } - else { - dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); - } + // gradient w.r.t. input coordinate data + deformable_col2im_coord(s, col_buffer.dptr(), + in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kOffset].dptr() + n*input_offset_dim_, + req[conv::kData]); + + // gradient w.r.t. input data + deformable_col2im(s, col_buffer.dptr(), + in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kData].dptr() + n*input_dim_, + req[conv::kData]); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + im2col(s, in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + col_buffer.dptr()); + for (index_t g = 0; g < group_; ++g) { + if (0 == n) { + ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], + dot(out_grad_3d[g], col_buffer_3d[g].T())); + } else { + dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); } } + } - // gradient w.r.t bias - if (bias_term_) { - Tensor dbias = in_grad[conv::kBias].get(s); - Tensor dout = out_grad[conv::kOut].get_with_shape( - Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); - ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); - } - + // gradient w.r.t bias + if (bias_term_) { + Tensor dbias = in_grad[conv::kBias].get(s); + Tensor dout = out_grad[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); } - private: - void LayerSetUp(const TShape& ishape, const TShape& offset_shape, const TShape& oshape) { - channel_axis_ = 1; // hard code channel axis - const index_t first_spatial_axis = channel_axis_ + 1; - const index_t num_axes = param_.kernel.ndim() + 2; - num_spatial_axes_ = num_axes - first_spatial_axis; - is_1x1_ = true; - for (index_t i = 0; i < param_.kernel.ndim(); ++i) { - is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; - if (!is_1x1_) break; - } + } - // batch size - num_ = ishape[0]; - // number of input channels - channels_ = ishape[1]; - group_ = param_.num_group; - conv_out_channels_ = param_.num_filter; - conv_in_channels_ = channels_; - bias_term_ = !param_.no_bias; - kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); - weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; - conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); - col_offset_ = kernel_dim_ * conv_out_spatial_dim_; - output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; - // size of the column buffer used for storing im2col-ed pixels - col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; - // input/output image size (#channels * height * width) - input_dim_ = ishape.ProdShape(1, ishape.ndim()); - input_offset_dim_ = ishape.ProdShape(1, offset_shape.ndim()); - output_dim_ = oshape.ProdShape(1, oshape.ndim()); - num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; - num_kernels_col2im_ = input_dim_; + private: + void LayerSetUp(const TShape& ishape, const TShape& offset_shape, const TShape& oshape) { + channel_axis_ = 1; // hard code channel axis + const index_t first_spatial_axis = channel_axis_ + 1; + const index_t num_axes = param_.kernel.ndim() + 2; + num_spatial_axes_ = num_axes - first_spatial_axis; + is_1x1_ = true; + for (index_t i = 0; i < param_.kernel.ndim(); ++i) { + is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; + if (!is_1x1_) break; } - private: - DeformableConvolutionParam param_; - index_t channel_axis_; // channel axis of the input - index_t channels_; // number of channels of input image - index_t num_spatial_axes_; // number of spatial axes - index_t num_; // batch size - index_t group_; // number of groups - index_t conv_out_channels_; // number of output channels (num_filter) - index_t conv_out_spatial_dim_; // number of pixels of output images per channel - index_t conv_in_channels_; // number of input channels - index_t 
kernel_dim_; // number of input channels per group * kernel size - index_t weight_offset_; // number of output channels per group * kernel_dim_ - index_t col_offset_; - index_t output_offset_; - index_t col_buffer_size_; - index_t input_dim_; - index_t input_offset_dim_; - index_t output_dim_; - index_t num_kernels_im2col_; - index_t num_kernels_col2im_; - bool bias_term_; // has bias term? - bool is_1x1_; - }; // class ConvolutionOp - - template - Operator* CreateOp(DeformableConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); + // batch size + num_ = ishape[0]; + // number of input channels + channels_ = ishape[1]; + group_ = param_.num_group; + conv_out_channels_ = param_.num_filter; + conv_in_channels_ = channels_; + bias_term_ = !param_.no_bias; + kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; + conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); + col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // size of the column buffer used for storing im2col-ed pixels + col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; + // input/output image size (#channels * height * width) + input_dim_ = ishape.ProdShape(1, ishape.ndim()); + input_offset_dim_ = ishape.ProdShape(1, offset_shape.ndim()); + output_dim_ = oshape.ProdShape(1, oshape.ndim()); + num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; + num_kernels_col2im_ = input_dim_; + } + + private: + DeformableConvolutionParam param_; + index_t channel_axis_; // channel axis of the input + index_t channels_; // number of channels of input image + index_t num_spatial_axes_; // number of spatial axes + index_t num_; // batch size + index_t group_; // number of groups + index_t conv_out_channels_; // number of output channels (num_filter) + index_t conv_out_spatial_dim_; // number of pixels of output images per channel + index_t conv_in_channels_; // number of input channels + index_t kernel_dim_; // number of input channels per group * kernel size + index_t weight_offset_; // number of output channels per group * kernel_dim_ + index_t col_offset_; + index_t output_offset_; + index_t col_buffer_size_; + index_t input_dim_; + index_t input_offset_dim_; + index_t output_dim_; + index_t num_kernels_im2col_; + index_t num_kernels_col2im_; + bool bias_term_; // has bias term? + bool is_1x1_; + }; // class ConvolutionOp + + template + Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); #if DMLC_USE_CXX11 - class DeformableConvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return{ "data", "offset", "weight", "bias" }; - } - else { - return{ "data", "offset", "weight" }; - } + class DeformableConvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return{ "data", "offset", "weight", "bias" }; + } else { + return{ "data", "offset", "weight" }; } + } - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? 
param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - } - else { - LOG(FATAL) << "not implemented"; - } + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + LOG(FATAL) << "not implemented"; } + } - std::map GetParams() const override { - return param_.__DICT__(); - } + std::map GetParams() const override { + return param_.__DICT__(); + } - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 4U) << "Input:[data, offset, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, offset, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + const TShape &oshp = (*in_shape)[conv::kOffset]; + if (dshp.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + CHECK_EQ(oshp.ndim(), 4U) \ + << "Input offset should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 4U) << "Input:[data, offset, weight, bias]"; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); } - else { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, offset, weight]"; - } - out_shape->resize(1, TShape()); - const TShape &dshp = (*in_shape)[conv::kData]; - const TShape &oshp = (*in_shape)[conv::kOffset]; - if (dshp.ndim() == 0) return false; - if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshp.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - CHECK_EQ(oshp.ndim(), 4U) \ - << "Input offset should be 4D in batch-num_filter-y-x"; - Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - const index_t ksize_y = static_cast(param_.kernel[0]); - const index_t ksize_x = static_cast(param_.kernel[1]); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter 
must divide group size"; - CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \ - << "input num_filter must divide deformable group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<4> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = (dshape[2] + 2 * param_.pad[0] - - (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; - oshape[3] = (dshape[3] + 2 * param_.pad[1] - - (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - CHECK_EQ(oshape[1] % param_.num_deformable_group, 0U) \ - << "output num_filter must divide deformable group size"; - CHECK_EQ(oshape[2], offsetshape[2]) \ - << "output height must equal to offset map height"; - CHECK_EQ(oshape[3], offsetshape[3]) \ - << "output width must equal to offset map width"; - CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ - << "offset filter must divide deformable group size"; - CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), param_.num_deformable_group) \ - << "offset filter must divide deformable group size"; - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. - oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); - dshape[0] = oshape[0]; - if (param_.stride[0] == 1) { - dshape[2] = oshape[2] + param_.dilate[0] * (ksize_y - 1) - 2 * param_.pad[0]; - } - if (param_.stride[1] == 1) { - dshape[3] = oshape[3] + param_.dilate[1] * (ksize_x - 1) - 2 * param_.pad[1]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(ksize_y, dshape[2] + 2 * param_.pad[0]) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(ksize_x, dshape[3] + 2 * param_.pad[1]) << "kernel size exceed input"; - } - return true; + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \ + << "input num_filter must divide deformable group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = (dshape[2] + 2 * param_.pad[0] - + (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; + oshape[3] = (dshape[3] + 2 * param_.pad[1] - + (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + CHECK_EQ(oshape[1] % 
param_.num_deformable_group, 0U) \ + << "output num_filter must divide deformable group size"; + CHECK_EQ(oshape[2], offsetshape[2]) \ + << "output height must equal to offset map height"; + CHECK_EQ(oshape[3], offsetshape[3]) \ + << "output width must equal to offset map width"; + CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ + << "offset filter must divide deformable group size"; + CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), param_.num_deformable_group) \ + << "offset filter must divide deformable group size"; + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (param_.stride[0] == 1) { + dshape[2] = oshape[2] + param_.dilate[0] * (ksize_y - 1) - 2 * param_.pad[0]; } - else { - LOG(FATAL) << "not implemented"; - return false; + if (param_.stride[1] == 1) { + dshape[3] = oshape[3] + param_.dilate[1] * (ksize_x - 1) - 2 * param_.pad[1]; } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } - else { - CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " - << "Expected " << dtype << " v.s. given " - << (*in_type)[i] << " at " << ListArguments()[i]; - } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(ksize_y, dshape[2] + 2 * param_.pad[0]) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(ksize_x, dshape[3] + 2 * param_.pad[1]) << "kernel size exceed input"; } - out_type->clear(); - out_type->push_back(dtype); return true; + } else { + LOG(FATAL) << "not implemented"; + return false; } + } - OperatorProperty* Copy() const override { - auto ptr = new DeformableConvolutionProp(); - ptr->param_ = param_; - return ptr; + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } } + out_type->clear(); + out_type->push_back(dtype); + return true; + } - std::string TypeString() const override { - return "_contrib_DeformableConvolution"; - } + OperatorProperty* Copy() const override { + auto ptr = new DeformableConvolutionProp(); + ptr->param_ = param_; + return ptr; + } - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return{ out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kOffset], in_data[conv::kWeight] }; - } + std::string TypeString() const override { + return "_contrib_DeformableConvolution"; + } - std::vector ForwardResource( - const std::vector &in_shape) const override { - return{ ResourceRequest::kTempSpace }; - } + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return{ out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kOffset], in_data[conv::kWeight] }; + } - std::vector BackwardResource( - const std::vector &in_shape) const override { - return{ ResourceRequest::kTempSpace }; - } + std::vector ForwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } + std::vector BackwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; - private: - DeformableConvolutionParam param_; - }; // class ConvolutionProp + private: + DeformableConvolutionParam param_; + }; // class ConvolutionProp #endif // DMLC_USE_CXX11 - } // namespace op +} // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONVOLUTION_INL_H_ +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ diff --git a/src/operator/contrib/deformable_convolution.cu b/src/operator/contrib/deformable_convolution.cu index 59948fd7765b..f690cc1ce24c 100644 --- a/src/operator/contrib/deformable_convolution.cu +++ b/src/operator/contrib/deformable_convolution.cu @@ -10,20 +10,20 @@ #include namespace mxnet { - namespace op { +namespace op { - template<> - Operator* CreateOp(DeformableConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeformableConvolutionOp(param); - }) - return op; - } + template<> + Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformableConvolutionOp(param); + }) + return op; + } - } // namespace op +} // namespace op } // namespace mxnet diff --git a/src/operator/contrib/deformable_psroi_pooling-inl.h b/src/operator/contrib/deformable_psroi_pooling-inl.h index 7226299997af..0f191dc673af 100644 --- a/src/operator/contrib/deformable_psroi_pooling-inl.h +++ b/src/operator/contrib/deformable_psroi_pooling-inl.h @@ -5,8 +5,8 @@ * \brief deformable psroi pooling operator and symbol * 
\author Yi Li, Guodong Zhang, Jifeng Dai */ -#ifndef MXNET_OPERATOR_DEFORMABLE_PSROI_POOLING_INL_H_ -#define MXNET_OPERATOR_DEFORMABLE_PSROI_POOLING_INL_H_ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ #include #include @@ -20,261 +20,258 @@ namespace mxnet { - namespace op { - - // Declare enumeration of input order to make code more intuitive. - // These enums are only visible within this header - namespace deformablepsroipool { - enum DeformablePSROIPoolingOpInputs { kData, kBox, kTrans }; - enum DeformablePSROIPoolingOpOutputs { kOut, kTopCount }; - } // deformablepsroipool - - struct DeformablePSROIPoolingParam : public dmlc::Parameter { - // TShape pooled_size; - float spatial_scale; - int output_dim; - int group_size; - int pooled_size; - int part_size; - int sample_per_part; - float trans_std; - bool no_trans; - DMLC_DECLARE_PARAMETER(DeformablePSROIPoolingParam) { - DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) - .describe("Ratio of input feature map height (or w) to raw image height (or w). " - "Equals the reciprocal of total stride in convolutional layers"); - DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); - DMLC_DECLARE_FIELD(group_size).describe("fix group size"); - DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); - DMLC_DECLARE_FIELD(part_size).set_default(0).describe("fix part size"); - DMLC_DECLARE_FIELD(sample_per_part).set_default(1).describe("fix samples per part"); - DMLC_DECLARE_FIELD(trans_std).set_default(0.0).set_range(0.0, 1.0).describe("fix transition std"); - DMLC_DECLARE_FIELD(no_trans).set_default(false) - .describe("Whether to disable trans parameter."); +namespace op { + + // Declare enumeration of input order to make code more intuitive. + // These enums are only visible within this header +namespace deformablepsroipool { + enum DeformablePSROIPoolingOpInputs { kData, kBox, kTrans }; + enum DeformablePSROIPoolingOpOutputs { kOut, kTopCount }; +} // deformablepsroipool + + struct DeformablePSROIPoolingParam : public dmlc::Parameter { + // TShape pooled_size; + float spatial_scale; + int output_dim; + int group_size; + int pooled_size; + int part_size; + int sample_per_part; + float trans_std; + bool no_trans; + DMLC_DECLARE_PARAMETER(DeformablePSROIPoolingParam) { + DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) + .describe("Ratio of input feature map height (or w) to raw image height (or w). " + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(group_size).describe("fix group size"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(part_size).set_default(0).describe("fix part size"); + DMLC_DECLARE_FIELD(sample_per_part).set_default(1).describe("fix samples per part"); + DMLC_DECLARE_FIELD(trans_std).set_default(0.0).set_range(0.0, 1.0).describe("fix transition std"); + DMLC_DECLARE_FIELD(no_trans).set_default(false) + .describe("Whether to disable trans parameter."); + } + }; + + template + class DeformablePSROIPoolingOp : public Operator { + public: + explicit DeformablePSROIPoolingOp(DeformablePSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans? 
2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor out = out_data[deformablepsroipool::kOut].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + out = -FLT_MAX; + top_count = 0.0f; + + Tensor trans; + if (!param_.no_trans) { + trans = in_data[deformablepsroipool::kTrans].get(s); } - }; - - template - class DeformablePSROIPoolingOp : public Operator { - public: - explicit DeformablePSROIPoolingOp(DeformablePSROIPoolingParam p) { - this->param_ = p; + DeformablePSROIPoolForward(out, data, bbox, trans, top_count, param_.no_trans, param_.spatial_scale, + param_.output_dim, param_.group_size, param_.pooled_size, param_.part_size, param_.sample_per_part, param_.trans_std); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans ? 2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_grad[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_NE(req[deformablepsroipool::kData], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[deformablepsroipool::kBox], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + // CHECK_NE(req[deformablepsroipool::kTrans], kWriteInplace) << + // "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); + Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); + Tensor grad_trans; + Tensor trans; + if (!param_.no_trans) { + CHECK_EQ(in_grad.size(), 3); + trans = in_data[deformablepsroipool::kTrans].get(s); + grad_trans = in_grad[deformablepsroipool::kTrans].get(s); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - size_t in_expected = param_.no_trans? 
2 : 3; - size_t out_expected = 2; - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_data[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - Stream *s = ctx.get_stream(); - - Tensor data = in_data[deformablepsroipool::kData].get(s); - Tensor bbox = in_data[deformablepsroipool::kBox].get(s); - Tensor out = out_data[deformablepsroipool::kOut].get(s); - Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(bbox.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(top_count.CheckContiguous(), true); - out = -FLT_MAX; - top_count = 0.0f; - - Tensor trans; - if (!param_.no_trans) { - trans = in_data[deformablepsroipool::kTrans].get(s); - } - DeformablePSROIPoolForward(out, data, bbox, trans, top_count, param_.no_trans, param_.spatial_scale, - param_.output_dim, param_.group_size, param_.pooled_size, param_.part_size, param_.sample_per_part, param_.trans_std); - } + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - size_t in_expected = param_.no_trans ? 2 : 3; - size_t out_expected = 2; - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_grad[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - CHECK_NE(req[deformablepsroipool::kData], kWriteInplace) << - "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; - CHECK_NE(req[deformablepsroipool::kBox], kWriteInplace) << - "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; - // CHECK_NE(req[deformablepsroipool::kTrans], kWriteInplace) << - // "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; - Stream *s = ctx.get_stream(); - - Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); - Tensor data = in_data[deformablepsroipool::kData].get(s); - Tensor bbox = in_data[deformablepsroipool::kBox].get(s); - Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); - Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); - Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); - Tensor grad_trans; - Tensor trans; - if (!param_.no_trans) { - CHECK_EQ(in_grad.size(), 3); - trans = in_data[deformablepsroipool::kTrans].get(s); - grad_trans = in_grad[deformablepsroipool::kTrans].get(s); - } - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(bbox.CheckContiguous(), true); - CHECK_EQ(top_count.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[deformablepsroipool::kData], 0); - if (!param_.no_trans) { - Assign(grad_trans, req[deformablepsroipool::kTrans], 0); - } - DeformablePSROIPoolBackwardAcc(grad_in, grad_trans, grad_out, data, bbox, trans, top_count, param_.no_trans, - param_.spatial_scale, param_.output_dim, 
param_.group_size, param_.pooled_size, param_.part_size, - param_.sample_per_part, param_.trans_std); - Assign(grad_roi, req[deformablepsroipool::kBox], 0); + Assign(grad_in, req[deformablepsroipool::kData], 0); + if (!param_.no_trans) { + Assign(grad_trans, req[deformablepsroipool::kTrans], 0); } + DeformablePSROIPoolBackwardAcc(grad_in, grad_trans, grad_out, data, bbox, trans, top_count, param_.no_trans, + param_.spatial_scale, param_.output_dim, param_.group_size, param_.pooled_size, param_.part_size, + param_.sample_per_part, param_.trans_std); + Assign(grad_roi, req[deformablepsroipool::kBox], 0); + } - private: - DeformablePSROIPoolingParam param_; - }; // class DeformablePSROIPoolingOp + private: + DeformablePSROIPoolingParam param_; + }; // class DeformablePSROIPoolingOp - // Decalre Factory function, used for dispatch specialization - template - Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype); + // Decalre Factory function, used for dispatch specialization + template + Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype); #if DMLC_USE_CXX11 - class DeformablePSROIPoolingProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.no_trans) { - return{ "data", "rois" }; - } - else { - return{ "data", "rois", "trans" }; - } + class DeformablePSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.no_trans) { + return{ "data", "rois" }; + } else { + return{ "data", "rois", "trans" }; } + } - std::vector ListOutputs() const override { - return{ "output", "top_count" }; - } + std::vector ListOutputs() const override { + return{ "output", "top_count" }; + } - int NumOutputs() const override { - return 2; - } + int NumOutputs() const override { + return 2; + } - int NumVisibleOutputs() const override { - return 1; - } + int NumVisibleOutputs() const override { + return 1; + } - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - if (param_.part_size == 0) { - param_.part_size = param_.pooled_size; - } + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.part_size == 0) { + param_.part_size = param_.pooled_size; } - - std::map GetParams() const override { - return param_.__DICT__(); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.no_trans) { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, rois, trans]"; + // trans: [num_rois, 2, pooled_h, pooled_w] + TShape tshape = in_shape->at(deformablepsroipool::kTrans); + CHECK_EQ(tshape.ndim(), 4) << "trans should be a 4D tensor of shape"; } - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - if (param_.no_trans) { - CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; - } - else { - CHECK_EQ(in_shape->size(), 3) << "Input:[data, rois, trans]"; - // trans: [num_rois, 2, pooled_h, pooled_w] - TShape tshape = in_shape->at(deformablepsroipool::kTrans); - CHECK_EQ(tshape.ndim(), 4) << "trans should be a 4D tensor of shape"; - } - - // data: [batch_size, c, h, w] - TShape dshape = in_shape->at(deformablepsroipool::kData); - CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; - - // bbox: [num_rois, 5] - TShape bshape = 
in_shape->at(deformablepsroipool::kBox); - CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; - CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; - - // out: [num_rois, c, pooled_h, pooled_w] - // top_count: [num_rois, c, pooled_h, pooled_w] - out_shape->clear(); - out_shape->push_back( - Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); - out_shape->push_back( - Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); - return true; + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(deformablepsroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(deformablepsroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + // top_count: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + DeformablePSROIPoolingProp* deformable_psroi_pooling_sym = new DeformablePSROIPoolingProp(); + deformable_psroi_pooling_sym->param_ = this->param_; + return deformable_psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_DeformablePSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.no_trans) { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], + out_data[deformablepsroipool::kTopCount] }; + } else { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], + in_data[deformablepsroipool::kTrans], out_data[deformablepsroipool::kTopCount] }; } + } - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 2); - int dtype = (*in_type)[0]; - CHECK_EQ(dtype, (*in_type)[1]); - CHECK_NE(dtype, -1) << "Input must have specified type"; - - out_type->clear(); - out_type->push_back(dtype); - out_type->push_back(dtype); - return true; - } - OperatorProperty* Copy() const override { - DeformablePSROIPoolingProp* deformable_psroi_pooling_sym = new DeformablePSROIPoolingProp(); - deformable_psroi_pooling_sym->param_ = this->param_; - return deformable_psroi_pooling_sym; - } - - std::string TypeString() const override { - return "_contrib_DeformablePSROIPooling"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - if (param_.no_trans) { - return{ 
out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], - out_data[deformablepsroipool::kTopCount] }; - } - else { - return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], - in_data[deformablepsroipool::kTrans], out_data[deformablepsroipool::kTopCount] }; - } - } - - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; - private: - DeformablePSROIPoolingParam param_; - }; // class DeformablePSROIPoolingProp + private: + DeformablePSROIPoolingParam param_; + }; // class DeformablePSROIPoolingProp #endif - } // namespace op +} // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_DEFORMABLE_PSROI_POOLING_INL_H_ \ No newline at end of file +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ diff --git a/src/operator/contrib/deformable_psroi_pooling.cc b/src/operator/contrib/deformable_psroi_pooling.cc index 4a21a7933f44..7dce9be9b866 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cc +++ b/src/operator/contrib/deformable_psroi_pooling.cc @@ -58,39 +58,40 @@ namespace mshadow { } // namespace mshadow namespace mxnet { - namespace op { +namespace op { - template<> - Operator *CreateOp(DeformablePSROIPoolingParam param, int dtype) { - Operator* op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeformablePSROIPoolingOp(param); - }); - return op; - } + template<> + Operator *CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } - Operator *DeformablePSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - std::vector out_type, aux_type; - CHECK(InferType(in_type, &out_type, &aux_type)); - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); - } + Operator *DeformablePSROIPoolingProp::CreateOperatorEx( + Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); + } - DMLC_REGISTER_PARAMETER(DeformablePSROIPoolingParam); + DMLC_REGISTER_PARAMETER(DeformablePSROIPoolingParam); - MXNET_REGISTER_OP_PROPERTY(_contrib_DeformablePSROIPooling, DeformablePSROIPoolingProp) - .describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " - "spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " - "by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " - "the number of region bounding boxes after DeformablePSROIPooling") - .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") - .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " - "[[batch_index, x1, y1, x2, y2]]. 
(x1, y1) and (x2, y2) are top left and down right corners " - "of designated region of interest. batch_index indicates the index of corresponding image " - "in the input data") - .add_argument("trans", "Symbol", "transition parameter") - .add_arguments(DeformablePSROIPoolingParam::__FIELDS__()); - } // namespace op + MXNET_REGISTER_OP_PROPERTY(_contrib_DeformablePSROIPooling, DeformablePSROIPoolingProp) + .describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " + "spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " + "by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " + "the number of region bounding boxes after DeformablePSROIPooling") + .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") + .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " + "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " + "of designated region of interest. batch_index indicates the index of corresponding image " + "in the input data") + .add_argument("trans", "Symbol", "transition parameter") + .add_arguments(DeformablePSROIPoolingParam::__FIELDS__()); +} // namespace op } // namespace mxnet diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu index 5b8f36138e86..94cd1f985507 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cu +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -25,325 +25,325 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ i += blockDim.x * gridDim.x) namespace mshadow { - namespace cuda { - template - __device__ DType bilinear_interp( - const DType* data, - const DType x, - const DType y, - const int width, - const int height) { - int x1 = floor(x); - int x2 = ceil(x); - int y1 = floor(y); - int y2 = ceil(y); - DType dist_x = static_cast(x - x1); - DType dist_y = static_cast(y - y1); - DType value11 = data[y1*width + x1]; - DType value12 = data[y2*width + x1]; - DType value21 = data[y1*width + x2]; - DType value22 = data[y2*width + x2]; - DType value = (1 - dist_x)*(1 - dist_y)*value11 + (1 - dist_x)*dist_y*value12 - + dist_x*(1 - dist_y)*value21 + dist_x*dist_y*value22; - return value; - } +namespace cuda { + template + __device__ DType bilinear_interp( + const DType* data, + const DType x, + const DType y, + const int width, + const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + DType dist_x = static_cast(x - x1); + DType dist_y = static_cast(y - y1); + DType value11 = data[y1*width + x1]; + DType value12 = data[y2*width + x1]; + DType value21 = data[y1*width + x2]; + DType value22 = data[y2*width + x2]; + DType value = (1 - dist_x)*(1 - dist_y)*value11 + (1 - dist_x)*dist_y*value12 + + dist_x*(1 - dist_y)*value21 + dist_x*dist_y*value22; + return value; + } - template - __global__ void DeformablePSROIPoolForwardKernel( - const int count, - const DType* bottom_data, - const DType spatial_scale, - const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const DType* bottom_rois, const DType* bottom_trans, - const bool no_trans, - const DType trans_std, - const int sample_per_part, - const int output_dim, - const int group_size, - const int part_size, - const int num_classes, - const int channels_each_class, - DType* top_data, - DType* top_count) { - CUDA_KERNEL_LOOP(index, count) { - // The 
output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int ctop = (index / pooled_width / pooled_height) % output_dim; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const DType* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; - DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; - DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; - DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; - - // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 - DType roi_height = max(roi_end_h - roi_start_h, 0.1); - - // Compute w and h at bottom - DType bin_size_h = roi_height / static_cast(pooled_height); - DType bin_size_w = roi_width / static_cast(pooled_width); - - DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); - DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); - - int part_h = floor(static_cast(ph) / pooled_height*part_size); - int part_w = floor(static_cast(pw) / pooled_width*part_size); - int class_id = ctop / channels_each_class; - DType trans_x = no_trans ? static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; - DType trans_y = no_trans ? static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; - - DType wstart = static_cast(pw)* bin_size_w - + roi_start_w; - wstart += trans_x * roi_width; - DType hstart = static_cast(ph) * bin_size_h - + roi_start_h; - hstart += trans_y * roi_height; - - DType sum = 0; - int count = 0; - int gw = floor(static_cast(pw) * group_size / pooled_width); - int gh = floor(static_cast(ph)* group_size / pooled_height); - gw = min(max(gw, 0), group_size - 1); - gh = min(max(gh, 0), group_size - 1); - - const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; - for (int ih = 0; ih < sample_per_part; ih++) { - for (int iw = 0; iw < sample_per_part; iw++) { - DType w = wstart + iw*sub_bin_size_w; - DType h = hstart + ih*sub_bin_size_h; - // bilinear interpolation - if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { - continue; - } - w = min(max(w, 0.), width - 1.); - h = min(max(h, 0.), height - 1.); - int c = (ctop*group_size + gh)*group_size + gw; - DType val = bilinear_interp(offset_bottom_data + c*height*width, w, h, width, height); - sum += val; - count++; + template + __global__ void DeformablePSROIPoolForwardKernel( + const int count, + const DType* bottom_data, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const DType* bottom_rois, const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + DType* top_data, + DType* top_count) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width 
/ pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; + DType trans_y = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + DType sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + DType val = bilinear_interp(offset_bottom_data + c*height*width, w, h, width, height); + sum += val; + count++; } - top_data[index] = count == 0 ? static_cast(0) : sum / count; - top_count[index] = count; } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; } + } - template - inline void DeformablePSROIPoolForward(const Tensor &out, - const Tensor &data, - const Tensor &bbox, - const Tensor &trans, - const Tensor &top_count, - const bool no_trans, - const float spatial_scale, - const int output_dim, - const int group_size, - const int pooled_size, - const int part_size, - const int sample_per_part, - const float trans_std) { - // LOG(INFO) << "DeformablePSROIPoolForward"; - const DType *bottom_data = data.dptr_; - const DType *bottom_rois = bbox.dptr_; - const DType *bottom_trans = no_trans ? 
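// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: a host-side reference for how
// DeformablePSROIPoolForwardKernel fills one output bin.  The helper names
// (RefBilinear, RefDeformableBin) and plain float types are assumptions made
// only for this sketch; the CUDA kernel above is the authoritative code.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cmath>

// Bilinear lookup into a single H x W channel, matching bilinear_interp above.
static float RefBilinear(const float* chan, float x, float y, int width, int height) {
  int x1 = static_cast<int>(std::floor(x)), x2 = static_cast<int>(std::ceil(x));
  int y1 = static_cast<int>(std::floor(y)), y2 = static_cast<int>(std::ceil(y));
  float dx = x - x1, dy = y - y1;
  return (1 - dx) * (1 - dy) * chan[y1 * width + x1] +
         (1 - dx) * dy       * chan[y2 * width + x1] +
         dx       * (1 - dy) * chan[y1 * width + x2] +
         dx       * dy       * chan[y2 * width + x2];
}

// Average sample_per_part^2 bilinear samples from one (ph, pw) bin whose start
// is shifted by the learned offset (trans_x, trans_y).  Here trans_x/trans_y
// are assumed to be the offsets already multiplied by trans_std, as read from
// bottom_trans in the kernel; they are then scaled by the ROI width/height.
static float RefDeformableBin(const float* chan, int width, int height,
                              float roi_start_w, float roi_start_h,
                              float bin_size_w, float bin_size_h,
                              int pw, int ph, float trans_x, float trans_y,
                              float roi_width, float roi_height,
                              int sample_per_part) {
  float wstart = pw * bin_size_w + roi_start_w + trans_x * roi_width;
  float hstart = ph * bin_size_h + roi_start_h + trans_y * roi_height;
  float sub_w = bin_size_w / sample_per_part, sub_h = bin_size_h / sample_per_part;
  float sum = 0.f;
  int cnt = 0;
  for (int ih = 0; ih < sample_per_part; ++ih) {
    for (int iw = 0; iw < sample_per_part; ++iw) {
      float w = wstart + iw * sub_w, h = hstart + ih * sub_h;
      if (w < -0.5f || w > width - 0.5f || h < -0.5f || h > height - 0.5f) continue;
      w = std::min(std::max(w, 0.f), width - 1.f);
      h = std::min(std::max(h, 0.f), height - 1.f);
      sum += RefBilinear(chan, w, h, width, height);
      ++cnt;
    }
  }
  return cnt == 0 ? 0.f : sum / cnt;  // empty bins produce 0, as in the kernel
}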
NULL : trans.dptr_; - DType *top_data = out.dptr_; - DType *top_count_data = top_count.dptr_; - const int count = out.shape_.Size(); - const int channels = data.size(1); - const int height = data.size(2); - const int width = data.size(3); - const int pooled_height = pooled_size; - const int pooled_width = pooled_size; - const int num_classes = no_trans ? 1 : trans.size(1) / 2; - const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; - - cudaStream_t stream = Stream::GetStream(out.stream_); - DeformablePSROIPoolForwardKernel << > >( - count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, - bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, output_dim, - group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); - DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); - } + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolForward"; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *top_data = out.dptr_; + DType *top_count_data = top_count.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans.size(1) / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(out.stream_); + DeformablePSROIPoolForwardKernel << > >( + count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } - template - __global__ void DeformablePSROIPoolBackwardAccKernel( - const int count, - const DType* top_diff, - const DType* top_count, - const int num_rois, - const DType spatial_scale, - const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const int output_dim, - DType* bottom_data_diff, DType* bottom_trans_diff, - const DType* bottom_data, - const DType* bottom_rois, - const DType* bottom_trans, - const bool no_trans, - const DType trans_std, - const int sample_per_part, - const int group_size, - const int part_size, - const int num_classes, - const int channels_each_class) { - CUDA_KERNEL_LOOP(index, count) { - // The output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int ctop = (index / pooled_width / pooled_height) % output_dim; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const DType* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; - DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; - DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; - DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; - - // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 - DType roi_height = max(roi_end_h - roi_start_h, 0.1); - - // Compute w and h at bottom - DType bin_size_h = roi_height / static_cast(pooled_height); - DType bin_size_w = roi_width / static_cast(pooled_width); - - DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); - DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); - - int part_h = floor(static_cast(ph) / pooled_height*part_size); - int part_w = floor(static_cast(pw) / pooled_width*part_size); - int class_id = ctop / channels_each_class; - DType trans_x = no_trans ? static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; - DType trans_y = no_trans ? 
static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; - - DType wstart = static_cast(pw)* bin_size_w - + roi_start_w; - wstart += trans_x * roi_width; - DType hstart = static_cast(ph) * bin_size_h - + roi_start_h; - hstart += trans_y * roi_height; - - if (top_count[index] <= 0) { - continue; - } - DType diff_val = top_diff[index] / top_count[index]; - const DType* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; - DType* offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; - int gw = floor(static_cast(pw)* group_size / pooled_width); - int gh = floor(static_cast(ph)* group_size / pooled_height); - gw = min(max(gw, 0), group_size - 1); - gh = min(max(gh, 0), group_size - 1); - - for (int ih = 0; ih < sample_per_part; ih++) { - for (int iw = 0; iw < sample_per_part; iw++) { - DType w = wstart + iw*sub_bin_size_w; - DType h = hstart + ih*sub_bin_size_h; - // bilinear interpolation - if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { - continue; - } - w = min(max(w, 0.), width - 1.); - h = min(max(h, 0.), height - 1.); - int c = (ctop*group_size + gh)*group_size + gw; - // backward on feature - int x0 = floor(w); - int x1 = ceil(w); - int y0 = floor(h); - int y1 = ceil(h); - DType dist_x = w - x0, dist_y = h - y0; - DType q00 = (1 - dist_x)*(1 - dist_y); - DType q01 = (1 - dist_x)*dist_y; - DType q10 = dist_x*(1 - dist_y); - DType q11 = dist_x*dist_y; - int bottom_index_base = c * height *width; - atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x0, q00*diff_val); - atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x0, q01*diff_val); - atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x1, q10*diff_val); - atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x1, q11*diff_val); - - if (no_trans) { - continue; - } - DType U00 = offset_bottom_data[bottom_index_base + y0*width + x0]; - DType U01 = offset_bottom_data[bottom_index_base + y1*width + x0]; - DType U10 = offset_bottom_data[bottom_index_base + y0*width + x1]; - DType U11 = offset_bottom_data[bottom_index_base + y1*width + x1]; - DType diff_x = (U11*dist_y + U10*(1 - dist_y) - U01*dist_y - U00*(1 - dist_y)) - *trans_std*diff_val; - diff_x *= roi_width; - DType diff_y = (U11*dist_x + U01*(1 - dist_x) - U10*dist_x - U00*(1 - dist_x)) - *trans_std*diff_val; - diff_y *= roi_height; - - atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w, diff_x); - atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1)*part_size + part_h)*part_size + part_w, diff_y); + template + __global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* top_count, + const int num_rois, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + DType* bottom_data_diff, DType* bottom_trans_diff, + const DType* bottom_data, + const DType* bottom_rois, + const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / 
pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; + DType trans_y = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + DType diff_val = top_diff[index] / top_count[index]; + const DType* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + DType* offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + DType dist_x = w - x0, dist_y = h - y0; + DType q00 = (1 - dist_x)*(1 - dist_y); + DType q01 = (1 - dist_x)*dist_y; + DType q10 = dist_x*(1 - dist_y); + DType q11 = dist_x*dist_y; + int bottom_index_base = c * height *width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x0, q00*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x0, q01*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x1, q10*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x1, q11*diff_val); + + if (no_trans) { + continue; + } + DType U00 = offset_bottom_data[bottom_index_base + y0*width + x0]; + DType U01 = offset_bottom_data[bottom_index_base + y1*width + x0]; + DType U10 = offset_bottom_data[bottom_index_base 
+ y0*width + x1]; + DType U11 = offset_bottom_data[bottom_index_base + y1*width + x1]; + DType diff_x = (U11*dist_y + U10*(1 - dist_y) - U01*dist_y - U00*(1 - dist_y)) + *trans_std*diff_val; + diff_x *= roi_width; + DType diff_y = (U11*dist_x + U01*(1 - dist_x) - U10*dist_x - U00*(1 - dist_x)) + *trans_std*diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1)*part_size + part_h)*part_size + part_w, diff_y); } } } + } - template - inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, - const Tensor &trans_grad, - const Tensor &out_grad, - const Tensor &data, - const Tensor &bbox, - const Tensor &trans, - const Tensor &top_count, - const bool no_trans, - const float spatial_scale, - const int output_dim, - const int group_size, - const int pooled_size, - const int part_size, - const int sample_per_part, - const float trans_std) { - // LOG(INFO) << "DeformablePSROIPoolBackward"; - const DType *top_diff = out_grad.dptr_; - const DType *bottom_data = data.dptr_; - const DType *bottom_rois = bbox.dptr_; - const DType *bottom_trans = no_trans ? NULL : trans.dptr_; - DType *bottom_data_diff = in_grad.dptr_; - DType *bottom_trans_diff = no_trans ? NULL : trans_grad.dptr_; - const DType *top_count_data = top_count.dptr_; - const int count = out_grad.shape_.Size(); - const int num_rois = bbox.size(0); - const int channels = in_grad.size(1); - const int height = in_grad.size(2); - const int width = in_grad.size(3); - const int pooled_height = pooled_size; - const int pooled_width = pooled_size; - const int num_classes = no_trans ? 1 : trans_grad.size(1) / 2; - const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; - - cudaStream_t stream = Stream::GetStream(in_grad.stream_); - DeformablePSROIPoolBackwardAccKernel << > >( - count, top_diff, top_count_data, num_rois, spatial_scale, channels, height, width, - pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, - bottom_data, bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, - group_size, part_size, num_classes, channels_each_class); - DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); - } + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const DType *top_diff = out_grad.dptr_; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *bottom_data_diff = in_grad.dptr_; + DType *bottom_trans_diff = no_trans ? NULL : trans_grad.dptr_; + const DType *top_count_data = top_count.dptr_; + const int count = out_grad.shape_.Size(); + const int num_rois = bbox.size(0); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans_grad.size(1) / 2; + const int channels_each_class = no_trans ? 
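// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the gradients that
// DeformablePSROIPoolBackwardAccKernel accumulates for a single bilinear
// sample.  RefBilinearGrad / BilinearGradients are hypothetical helpers
// written only to make the formulas explicit.
// ---------------------------------------------------------------------------
#include <cmath>

struct RefBilinearGrad {
  float q00, q01, q10, q11;    // d(value)/d(corner values U00, U01, U10, U11)
  float dvalue_dx, dvalue_dy;  // d(value)/d(sample position), before ROI scaling
};

// For a sample at (w, h) with integer corners (x0, y0)..(x1, y1) and corner
// values U00 = U(x0, y0), U01 = U(x0, y1), U10 = U(x1, y0), U11 = U(x1, y1):
static RefBilinearGrad BilinearGradients(float w, float h,
                                         float U00, float U01, float U10, float U11) {
  float x0 = std::floor(w), y0 = std::floor(h);
  float dist_x = w - x0, dist_y = h - y0;
  RefBilinearGrad g;
  g.q00 = (1 - dist_x) * (1 - dist_y);
  g.q01 = (1 - dist_x) * dist_y;
  g.q10 = dist_x * (1 - dist_y);
  g.q11 = dist_x * dist_y;
  // Differentiating value = q00*U00 + q01*U01 + q10*U10 + q11*U11 w.r.t. w, h:
  g.dvalue_dx = (U11 * dist_y + U10 * (1 - dist_y)) - (U01 * dist_y + U00 * (1 - dist_y));
  g.dvalue_dy = (U11 * dist_x + U01 * (1 - dist_x)) - (U10 * dist_x + U00 * (1 - dist_x));
  // The kernel multiplies the q-weights by diff_val when accumulating into the
  // data gradient, and multiplies dvalue_dx (resp. dvalue_dy) by
  // trans_std * diff_val * roi_width (resp. roi_height) for the offset gradient.
  return g;
}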
output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + DeformablePSROIPoolBackwardAccKernel << > >( + count, top_diff, top_count_data, num_rois, spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } - } // namespace cuda +} // namespace cuda template inline void DeformablePSROIPoolForward(const Tensor &out, @@ -387,16 +387,16 @@ namespace mshadow { namespace mxnet { - namespace op { - - template<> - Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype) { - Operator* op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeformablePSROIPoolingOp(param); - }); - return op; - } +namespace op { + + template<> + Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } - } // namespace op +} // namespace op } // namespace mxnet diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h index 3116d8c15623..48f3535b5eab 100644 --- a/src/operator/contrib/multi_proposal-inl.h +++ b/src/operator/contrib/multi_proposal-inl.h @@ -6,8 +6,8 @@ * \brief MultiProposal Operator * \author Piotr Teterwak, Bing Xu, Jian Guo, Xizhou Zhu */ -#ifndef MXNET_OPERATOR_CONTRIB_PROPOSAL_INL_H_ -#define MXNET_OPERATOR_CONTRIB_PROPOSAL_INL_H_ +#ifndef MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_ +#define MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_ #include #include @@ -298,4 +298,4 @@ inline void GenerateAnchors(const std::vector& base_anchor, } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_PROPOSAL_INL_H_ +#endif // MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_ diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu index 28e1c6a4cebd..e5f877cc937f 100644 --- a/src/operator/contrib/multi_proposal.cu +++ b/src/operator/contrib/multi_proposal.cu @@ -57,15 +57,15 @@ __global__ void ProposalGridKernel(const int count, int a = index % num_anchors; int w = (index / num_anchors) % width; int h = (index / num_anchors / width) % height; - int b = index / num_anchors / width / height; + int b = index / num_anchors / width / height; workspace_proposals[index * 5 + 0] = workspace_proposals[a * 5 + 0] + w * feature_stride; workspace_proposals[index * 5 + 1] = workspace_proposals[a * 5 + 1] + h * feature_stride; workspace_proposals[index * 5 + 2] = workspace_proposals[a * 5 + 2] + w * feature_stride; workspace_proposals[index * 5 + 3] = workspace_proposals[a * 5 + 3] + h * feature_stride; - workspace_proposals[index * 5 + 4] = - scores[((b * (2 * num_anchors) + a + num_anchors) * height + h) * width + w]; - //workspace_proposals[index * 5 + 4] = scores[(a * height + h) * width + w]; + workspace_proposals[index * 5 + 4] = + scores[((b * (2 * num_anchors) + a + num_anchors) * height + h) * width + w]; + //workspace_proposals[index * 5 + 4] = scores[(a * height + h) * width + w]; } } @@ -79,8 +79,8 @@ __global__ void BBoxPredKernel(const int count, const int num_anchors, const int feat_height, const int feat_width, - const int feature_stride, - const Dtype* im_infos, + const int feature_stride, + const Dtype* im_infos, const Dtype* boxes, const Dtype* deltas, 
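// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the anchor tiling performed by
// ProposalGridKernel, written as a host loop for a single image.
// RefTileAnchors and its flat output layout are assumptions for this sketch.
// ---------------------------------------------------------------------------
#include <vector>

// base_anchors: A boxes of 4 floats (x1, y1, x2, y2) defined at the origin.
// Returns H*W*A proposals of 5 floats each; every base anchor is shifted by
// (w * feature_stride, h * feature_stride) across the feature map.
static std::vector<float> RefTileAnchors(const std::vector<float>& base_anchors,
                                         int height, int width, int feature_stride) {
  const int A = static_cast<int>(base_anchors.size() / 4);
  std::vector<float> proposals(static_cast<size_t>(height) * width * A * 5, 0.f);
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      for (int a = 0; a < A; ++a) {
        size_t out = ((static_cast<size_t>(h) * width + w) * A + a) * 5;
        proposals[out + 0] = base_anchors[a * 4 + 0] + w * feature_stride;
        proposals[out + 1] = base_anchors[a * 4 + 1] + h * feature_stride;
        proposals[out + 2] = base_anchors[a * 4 + 2] + w * feature_stride;
        proposals[out + 3] = base_anchors[a * 4 + 3] + h * feature_stride;
        // The CUDA kernel additionally copies the foreground score for
        // (b, a, h, w) from the second half of the (2*A)-channel score blob
        // into slot 4 of each proposal.
      }
    }
  }
  return proposals;
}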
Dtype* out_pred_boxes) { @@ -89,20 +89,20 @@ __global__ void BBoxPredKernel(const int count, index += blockDim.x * gridDim.x) { int a = index % num_anchors; int w = (index / num_anchors) % feat_width; - int h = (index / num_anchors / feat_width) % feat_height; - int b = index / num_anchors / feat_width / feat_height; + int h = (index / num_anchors / feat_width) % feat_height; + int b = index / num_anchors / feat_width / feat_height; - float im_height = im_infos[b * 3]; - float im_width = im_infos[b * 3 + 1]; - int real_height = static_cast(im_height / feature_stride); - int real_width = static_cast(im_width / feature_stride); + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); float width = boxes[index * 5 + 2] - boxes[index * 5 + 0] + 1.0f; float height = boxes[index * 5 + 3] - boxes[index * 5 + 1] + 1.0f; float ctr_x = boxes[index * 5 + 0] + 0.5f * (width - 1.0f); float ctr_y = boxes[index * 5 + 1] + 0.5f * (height - 1.0f); - int ba = (b * num_anchors + a); + int ba = (b * num_anchors + a); float dx = deltas[((ba * 4) * feat_height + h) * feat_width + w]; float dy = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; float dw = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; @@ -145,7 +145,7 @@ __global__ void IoUPredKernel(const int count, const int feat_height, const int feat_width, const int feature_stride, - const Dtype* im_infos, + const Dtype* im_infos, const Dtype* boxes, const Dtype* deltas, Dtype* out_pred_boxes) { @@ -155,19 +155,19 @@ __global__ void IoUPredKernel(const int count, int a = index % num_anchors; int w = (index / num_anchors) % feat_width; int h = (index / num_anchors / feat_width) % feat_height; - int b = index / num_anchors / feat_width / feat_height; + int b = index / num_anchors / feat_width / feat_height; - float im_height = im_infos[b * 3]; - float im_width = im_infos[b * 3 + 1]; - int real_height = static_cast(im_height / feature_stride); - int real_width = static_cast(im_width / feature_stride); + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); float x1 = boxes[index * 5 + 0]; float y1 = boxes[index * 5 + 1]; float x2 = boxes[index * 5 + 2]; float y2 = boxes[index * 5 + 3]; - int ba = (b * num_anchors + a); + int ba = (b * num_anchors + a); float dx1 = deltas[((ba * 4) * feat_height + h) * feat_width + w]; float dy1 = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; float dx2 = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; @@ -183,7 +183,7 @@ __global__ void IoUPredKernel(const int count, out_pred_boxes[index * 5 + 2] = pred_x2; out_pred_boxes[index * 5 + 3] = pred_y2; - if (h >= real_height || w >= real_width) { + if (h >= real_height || w >= real_width) { out_pred_boxes[index * 5 + 4] = -1.0f; } } @@ -194,17 +194,17 @@ __global__ void IoUPredKernel(const int count, // dets (b, n, 5) template __global__ void FilterBoxKernel(const int count, - const int count_anchors, + const int count_anchors, const float original_min_size, - const Dtype* im_infos, + const Dtype* im_infos, Dtype* dets) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < count; index += blockDim.x * gridDim.x) { - int b = index / count_anchors; + int b = index / count_anchors; float iw = dets[index * 5 + 2] - dets[index * 5 + 0] + 1.0f; float ih = 
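// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the standard Faster R-CNN delta
// decoding that BBoxPredKernel applies to each anchor.  Most of the kernel
// body lies outside the hunk shown above, so treat this as a reference
// formula rather than a verbatim copy; RefBox/RefDecodeDelta are assumed names.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cmath>

struct RefBox { float x1, y1, x2, y2; };

static RefBox RefDecodeDelta(const RefBox& anchor, float dx, float dy, float dw, float dh,
                             float im_width, float im_height) {
  float width  = anchor.x2 - anchor.x1 + 1.0f;
  float height = anchor.y2 - anchor.y1 + 1.0f;
  float ctr_x = anchor.x1 + 0.5f * (width - 1.0f);
  float ctr_y = anchor.y1 + 0.5f * (height - 1.0f);
  float pred_ctr_x = dx * width + ctr_x;
  float pred_ctr_y = dy * height + ctr_y;
  float pred_w = std::exp(dw) * width;
  float pred_h = std::exp(dh) * height;
  RefBox out;
  out.x1 = pred_ctr_x - 0.5f * (pred_w - 1.0f);
  out.y1 = pred_ctr_y - 0.5f * (pred_h - 1.0f);
  out.x2 = pred_ctr_x + 0.5f * (pred_w - 1.0f);
  out.y2 = pred_ctr_y + 0.5f * (pred_h - 1.0f);
  // Clip to the image before the min-size filter and NMS.
  out.x1 = std::min(std::max(out.x1, 0.0f), im_width - 1.0f);
  out.y1 = std::min(std::max(out.y1, 0.0f), im_height - 1.0f);
  out.x2 = std::min(std::max(out.x2, 0.0f), im_width - 1.0f);
  out.y2 = std::min(std::max(out.y2, 0.0f), im_height - 1.0f);
  return out;
}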
dets[index * 5 + 3] - dets[index * 5 + 1] + 1.0f; - float min_size = original_min_size * im_infos[b * 3 + 2]; + float min_size = original_min_size * im_infos[b * 3 + 2]; if (iw < min_size || ih < min_size) { dets[index * 5 + 0] -= min_size / 2; dets[index * 5 + 1] -= min_size / 2; @@ -364,7 +364,7 @@ __global__ void PrepareOutput(const int count, const Dtype* dets, const int* keep, const int out_size, - const int image_index, + const int image_index, Dtype* out, Dtype* score) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -407,7 +407,7 @@ class MultiProposalGPUOp : public Operator{ const std::vector &aux_states) { using namespace mshadow; using namespace mshadow::expr; - using namespace mshadow::cuda; + using namespace mshadow::cuda; using namespace mshadow::cuda::multi_proposal; CHECK_EQ(in_data.size(), 3); CHECK_EQ(out_data.size(), 2); @@ -424,13 +424,13 @@ class MultiProposalGPUOp : public Operator{ Tensor out = out_data[proposal::kOut].get(s); Tensor out_score = out_data[proposal::kScore].get(s); - - int num_images = scores.size(0); - int num_anchors = scores.size(1) / 2; + + int num_images = scores.size(0); + int num_anchors = scores.size(1) / 2; int height = scores.size(2); int width = scores.size(3); int count_anchors = num_anchors * height * width; // count of total anchors - int count = num_images * count_anchors; + int count = num_images * count_anchors; // set to -1 for max int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count_anchors; rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors); @@ -462,7 +462,7 @@ class MultiProposalGPUOp : public Operator{ dim3 dimBlock(kMaxThreadsPerBlock); CheckLaunchParam(dimGrid, dimBlock, "ProposalGrid"); ProposalGridKernel<<>>( - count, num_anchors, height, width, param_.feature_stride, + count, num_anchors, height, width, param_.feature_stride, scores.dptr_, workspace_proposals.dptr_); FRCNN_CUDA_CHECK(cudaPeekAtLastError()); @@ -470,11 +470,11 @@ class MultiProposalGPUOp : public Operator{ CheckLaunchParam(dimGrid, dimBlock, "BBoxPred"); if (param_.iou_loss) { IoUPredKernel<<>>( - count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); } else { BBoxPredKernel<<>>( - count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); } FRCNN_CUDA_CHECK(cudaPeekAtLastError()); @@ -482,79 +482,78 @@ class MultiProposalGPUOp : public Operator{ // filter boxes with less than rpn_min_size CheckLaunchParam(dimGrid, dimBlock, "FilterBox"); FilterBoxKernel<<>>( - count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_); + count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_); FRCNN_CUDA_CHECK(cudaPeekAtLastError()); - dimGrid = dim3((count_anchors + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); - dimBlock = dim3(kMaxThreadsPerBlock); - // Copy score to a continuous memory - float* score_ptr = NULL; - FRCNN_CUDA_CHECK(cudaMalloc(&score_ptr, sizeof(float) * count_anchors)); - Tensor score(score_ptr, Shape1(count_anchors)); - int* order_ptr = NULL; - FRCNN_CUDA_CHECK(cudaMalloc(&order_ptr, sizeof(int) * count_anchors)); - Tensor order(order_ptr, Shape1(count_anchors)); - - float* workspace_ordered_proposals_ptr = NULL; - 
FRCNN_CUDA_CHECK(cudaMalloc(&workspace_ordered_proposals_ptr, - sizeof(float) * rpn_pre_nms_top_n * 5)); - Tensor workspace_ordered_proposals(workspace_ordered_proposals_ptr, - Shape2(rpn_pre_nms_top_n, 5)); - - int* keep; - FRCNN_CUDA_CHECK(cudaMalloc(&keep, sizeof(int) * rpn_pre_nms_top_n)); - - for (int b = 0; b < num_images; b++) { - - CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); - CopyScoreKernel << > >( - count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, score.dptr_, order.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); - - // argsort score, save order - thrust::stable_sort_by_key(thrust::device, - score.dptr_, - score.dptr_ + score.size(0), - order.dptr_, - thrust::greater()); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); - - // Reorder proposals according to order - - dimGrid.x = (rpn_pre_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; - CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals"); - ReorderProposalsKernel << > >( - rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, order.dptr_, workspace_ordered_proposals.dptr_); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); - - // perform nms - std::vector _keep(workspace_ordered_proposals.size(0)); - int out_size = 0; - _nms(workspace_ordered_proposals, - param_.threshold, - &_keep[0], - &out_size); - - // copy nms result to gpu - FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(), - cudaMemcpyHostToDevice)); - - // copy results after nms - dimGrid.x = (rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; - CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput"); - PrepareOutput << > >( - rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b, - out.dptr_ + b * rpn_post_nms_top_n * 5, out_score.dptr_ + b * rpn_post_nms_top_n); - FRCNN_CUDA_CHECK(cudaPeekAtLastError()); - } - // free temporary memory - FRCNN_CUDA_CHECK(cudaFree(keep)); - FRCNN_CUDA_CHECK(cudaFree(workspace_ordered_proposals_ptr)); - FRCNN_CUDA_CHECK(cudaFree(workspace_proposals_ptr)); - FRCNN_CUDA_CHECK(cudaFree(score_ptr)); - FRCNN_CUDA_CHECK(cudaFree(order_ptr)); + dimGrid = dim3((count_anchors + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); + dimBlock = dim3(kMaxThreadsPerBlock); + // Copy score to a continuous memory + float* score_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&score_ptr, sizeof(float) * count_anchors)); + Tensor score(score_ptr, Shape1(count_anchors)); + int* order_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&order_ptr, sizeof(int) * count_anchors)); + Tensor order(order_ptr, Shape1(count_anchors)); + + float* workspace_ordered_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_ordered_proposals_ptr, + sizeof(float) * rpn_pre_nms_top_n * 5)); + Tensor workspace_ordered_proposals(workspace_ordered_proposals_ptr, + Shape2(rpn_pre_nms_top_n, 5)); + + int* keep; + FRCNN_CUDA_CHECK(cudaMalloc(&keep, sizeof(int) * rpn_pre_nms_top_n)); + + for (int b = 0; b < num_images; b++) { + CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); + CopyScoreKernel << > >( + count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, score.dptr_, order.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // argsort score, save order + thrust::stable_sort_by_key(thrust::device, + score.dptr_, + score.dptr_ + score.size(0), + order.dptr_, + thrust::greater()); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // Reorder proposals according to order + + dimGrid.x = (rpn_pre_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, 
dimBlock, "ReorderProposals"); + ReorderProposalsKernel << > >( + rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, order.dptr_, workspace_ordered_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // perform nms + std::vector _keep(workspace_ordered_proposals.size(0)); + int out_size = 0; + _nms(workspace_ordered_proposals, + param_.threshold, + &_keep[0], + &out_size); + + // copy nms result to gpu + FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(), + cudaMemcpyHostToDevice)); + + // copy results after nms + dimGrid.x = (rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput"); + PrepareOutput << > >( + rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b, + out.dptr_ + b * rpn_post_nms_top_n * 5, out_score.dptr_ + b * rpn_post_nms_top_n); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + } + // free temporary memory + FRCNN_CUDA_CHECK(cudaFree(keep)); + FRCNN_CUDA_CHECK(cudaFree(workspace_ordered_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(workspace_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(score_ptr)); + FRCNN_CUDA_CHECK(cudaFree(order_ptr)); } virtual void Backward(const OpContext &ctx, diff --git a/src/operator/contrib/nn/deformable_im2col.h b/src/operator/contrib/nn/deformable_im2col.h index 93a555164d7a..aeed47b8b1ba 100644 --- a/src/operator/contrib/nn/deformable_im2col.h +++ b/src/operator/contrib/nn/deformable_im2col.h @@ -86,14 +86,14 @@ namespace op { */ template inline void deformable_im2col(mshadow::Stream* s, - const DType* data_im, const DType* data_offset, + const DType* data_im, const DType* data_offset, const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, - const TShape& pad, const TShape& stride, const TShape& dilation, + const TShape& pad, const TShape& stride, const TShape& dilation, const uint32_t deformable_group, DType* data_col) { if (2 == kernel_shape.ndim()) { - LOG(FATAL) << "not implemented"; + LOG(FATAL) << "not implemented"; } else { - LOG(FATAL) << "not implemented"; + LOG(FATAL) << "not implemented"; } } diff --git a/src/operator/contrib/psroi_pooling-inl.h b/src/operator/contrib/psroi_pooling-inl.h index f41d3237d5ac..560b09c5a73e 100644 --- a/src/operator/contrib/psroi_pooling-inl.h +++ b/src/operator/contrib/psroi_pooling-inl.h @@ -6,8 +6,8 @@ * \brief psroi pooling operator and symbol * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai */ -#ifndef MXNET_OPERATOR_PSROI_POOLING_INL_H_ -#define MXNET_OPERATOR_PSROI_POOLING_INL_H_ +#ifndef MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ #include #include @@ -109,7 +109,6 @@ class PSROIPoolingOp : public Operator { if (kWriteTo == req[psroipool::kBox]) { grad_roi = 0.0f; } - } private: @@ -219,4 +218,4 @@ class PSROIPoolingProp : public OperatorProperty { #endif } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_PSROI_POOLING_INL_H_ +#endif // MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ diff --git a/src/operator/contrib/psroi_pooling.cc b/src/operator/contrib/psroi_pooling.cc index 678b0b0a67c7..ad25aec8eee8 100644 --- a/src/operator/contrib/psroi_pooling.cc +++ b/src/operator/contrib/psroi_pooling.cc @@ -24,7 +24,7 @@ inline void PSROIPoolForward(const Tensor &out, const Tensor &data, const Tensor &bbox, const float spatial_scale_, - const int output_dim_, + const int output_dim_, const int group_size_) { // NOT_IMPLEMENTED; return; @@ -35,7 +35,7 @@ inline void 
PSROIPoolBackwardAcc(const Tensor &in_grad, const Tensor &out_grad, const Tensor &bbox, const float spatial_scale_, - const int output_dim_, + const int output_dim_, const int group_size_) { // NOT_IMPLEMENTED; return; diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu index 3abd10bbdd29..67c2d55303fd 100644 --- a/src/operator/contrib/psroi_pooling.cu +++ b/src/operator/contrib/psroi_pooling.cu @@ -56,7 +56,7 @@ __global__ void PSROIPoolForwardKernel( DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 DType roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom @@ -74,7 +74,7 @@ __global__ void PSROIPoolForwardKernel( // Add roi offsets and clip to input boundaries hstart = min(max(hstart, 0), height); hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0),width); + wstart = min(max(wstart, 0), width); wend = min(max(wend, 0), width); bool is_empty = (hend <= hstart) || (wend <= wstart); @@ -86,8 +86,8 @@ __global__ void PSROIPoolForwardKernel( const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; DType out_sum = 0; - for (int h = hstart; h < hend; ++h){ - for (int w = wstart; w < wend; ++w){ + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { int bottom_index = h*width + w; out_sum += offset_bottom_data[bottom_index]; } @@ -152,7 +152,7 @@ __global__ void PSROIPoolBackwardAccKernel( DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 DType roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom @@ -198,7 +198,7 @@ inline void PSROIPoolBackwardAcc(const Tensor &in_grad, const Tensor &out_grad, const Tensor &bbox, const float spatial_scale, - const int output_dim_, + const int output_dim_, const int group_size_) { // LOG(INFO) << "PSROIPoolBackward"; const DType *top_diff = out_grad.dptr_; @@ -236,7 +236,7 @@ inline void PSROIPoolBackwardAcc(const Tensor &in_grad, const Tensor &out_grad, const Tensor &bbox, const float spatial_scale, - const int output_dim_, + const int output_dim_, const int group_size_) { cuda::PSROIPoolBackwardAcc(in_grad, out_grad, bbox, spatial_scale, output_dim_, group_size_); } From bf622023ce632ef6b53a31d671f04b011d6e68c8 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Wed, 17 May 2017 17:09:59 +0800 Subject: [PATCH 03/18] fix programming rule to meet pr rule --- src/operator/contrib/channel_operator-inl.h | 656 +++++++------- .../contrib/deformable_convolution-inl.h | 840 +++++++++--------- .../contrib/deformable_psroi_pooling-inl.h | 449 +++++----- .../contrib/deformable_psroi_pooling.cu | 33 +- src/operator/contrib/multi_proposal.cu | 23 +- src/operator/contrib/psroi_pooling.cu | 6 +- 6 files changed, 1010 insertions(+), 997 deletions(-) diff --git a/src/operator/contrib/channel_operator-inl.h b/src/operator/contrib/channel_operator-inl.h index 7d8bcc499763..38192d302651 100644 --- a/src/operator/contrib/channel_operator-inl.h +++ b/src/operator/contrib/channel_operator-inl.h @@ -2,7 +2,7 @@ * Copyright (c) 2017 Microsoft * Licensed under The Apache-2.0 License [see LICENSE for details] * \file channel_operator-inl.h - * 
\brief + * \brief * \author Haozhi Qi, Yi Li, Guodong Zhang, Jifeng Dai */ #ifndef MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ @@ -29,382 +29,364 @@ namespace channel_operator { enum ChannelOperatorOpPickType { kLabel, kScore }; } // activation - struct ChannelOperatorParam : public dmlc::Parameter { - // use int for enumeration - int op_type; - int group; - int pick_type; - DMLC_DECLARE_PARAMETER(ChannelOperatorParam) { - DMLC_DECLARE_FIELD(op_type) - .add_enum("Group_Max", channel_operator::kGroupMax) - .add_enum("Group_Pick", channel_operator::kGroupPick) - .add_enum("Group_Softmax", channel_operator::kGroupSoftmax) - .describe("Channel operator to be applied."); - DMLC_DECLARE_FIELD(group).describe("group size"); - DMLC_DECLARE_FIELD(pick_type) - .add_enum("Label_Pick", channel_operator::kLabel) - .add_enum("Score_Pick", channel_operator::kScore) - .set_default(channel_operator::kLabel) - .describe("pick type"); - } - }; - - /** - * \brief This is the implementation of channel operator. - * \tparam xpu The device that the op will be executed on. - */ - template - class ChannelOperatorOp : public Operator { - public: - explicit ChannelOperatorOp(ChannelOperatorParam p) { - this->param_ = p; - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - size_t in_expected; - size_t out_expected; - if (param_.op_type == channel_operator::kGroupMax) { - in_expected = 1; - out_expected = 2; - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - in_expected = 1; +struct ChannelOperatorParam : public dmlc::Parameter { + // use int for enumeration + int op_type; + int group; + int pick_type; + DMLC_DECLARE_PARAMETER(ChannelOperatorParam) { + DMLC_DECLARE_FIELD(op_type) + .add_enum("Group_Max", channel_operator::kGroupMax) + .add_enum("Group_Pick", channel_operator::kGroupPick) + .add_enum("Group_Softmax", channel_operator::kGroupSoftmax) + .describe("Channel operator to be applied."); + DMLC_DECLARE_FIELD(group).describe("group size"); + DMLC_DECLARE_FIELD(pick_type) + .add_enum("Label_Pick", channel_operator::kLabel) + .add_enum("Score_Pick", channel_operator::kScore) + .set_default(channel_operator::kLabel) + .describe("pick type"); + } +}; + +/** +* \brief This is the implementation of channel operator. +* \tparam xpu The device that the op will be executed on. 
+*/ +template +class ChannelOperatorOp : public Operator { + public: + explicit ChannelOperatorOp(ChannelOperatorParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + size_t in_expected; + size_t out_expected; + if (param_.op_type == channel_operator::kGroupMax) { + in_expected = 1; + out_expected = 2; + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + in_expected = 1; + out_expected = 1; + } else if (param_.op_type == channel_operator::kGroupPick) { + in_expected = 2; + if (param_.pick_type == channel_operator::kLabel) { out_expected = 1; - } else if (param_.op_type == channel_operator::kGroupPick) { - in_expected = 2; - if (param_.pick_type == channel_operator::kLabel) - out_expected = 1; - else - out_expected = 2; } else { - LOG(FATAL) << "No that operation type."; - } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - if (param_.op_type == channel_operator::kGroupSoftmax) { - int total_size = in_data[channel_operator::kData].Size(); - int batch_size = in_data[channel_operator::kData].shape_[0]; - int channel_num = in_data[channel_operator::kData].shape_[1]; - int rest_size = total_size / (batch_size * channel_num); - const Shape<3> data_shape = - Shape3(batch_size*param_.group, channel_num / param_.group, rest_size); - - Tensor data = - in_data[channel_operator::kData].get_with_shape(data_shape, s); - Tensor out = - out_data[channel_operator::kOut].get_with_shape(data_shape, s); - Softmax(out, data); - } - else if (param_.op_type == channel_operator::kGroupMax) { - Tensor data = in_data[channel_operator::kData].get(s); - Tensor out = out_data[channel_operator::kOut].get(s); - Tensor max_idx = out_data[channel_operator::kTemp].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(max_idx.CheckContiguous(), true); - - GroupMaxForward(out, data, max_idx, param_.group); - } - else if (param_.op_type == channel_operator::kGroupPick) { - Tensor data = in_data[channel_operator::kData].get(s); - Tensor out = out_data[channel_operator::kOut].get(s); - Tensor pick_idx = in_data[channel_operator::kPick].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(pick_idx.CheckContiguous(), true); - - if (param_.pick_type == channel_operator::kScore) { - Tensor argmax_data = - out_data[channel_operator::kTemp].get(s); - GetMaxIdx(pick_idx, argmax_data, param_.group); - GroupPickForward(out, data, argmax_data, param_.group); - } - else { - GroupPickForward(out, data, pick_idx, param_.group); - } + out_expected = 2; } - else { - LOG(FATAL) << "No that operation type."; + } else { + LOG(FATAL) << "No that operation type."; + } + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + Stream *s = ctx.get_stream(); + if (param_.op_type == channel_operator::kGroupSoftmax) { + int total_size = in_data[channel_operator::kData].Size(); + int batch_size = in_data[channel_operator::kData].shape_[0]; + int channel_num = in_data[channel_operator::kData].shape_[1]; + int rest_size = total_size / (batch_size * channel_num); + const Shape<3> data_shape = + Shape3(batch_size*param_.group, channel_num / param_.group, rest_size); + + Tensor data = + in_data[channel_operator::kData].get_with_shape(data_shape, s); 
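
The GroupSoftmax branch above views the (batch, channel, spatial) blob as (batch*group, channel/group, spatial) via Shape3 and lets Softmax act on the middle axis, i.e. independently within each channel group at every spatial position. A minimal CPU sketch of that semantics (a hypothetical standalone helper, not the operator itself, assuming a contiguous NCHW layout):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Softmax over channel/group entries within each group, for every
    // (batch, group, spatial) combination, matching the Shape3 view.
    void group_softmax(std::vector<float>* blob, int batch, int channel,
                       int spatial, int group) {
      const int sub = channel / group;                  // channels per group
      for (int bg = 0; bg < batch * group; ++bg) {
        for (int s = 0; s < spatial; ++s) {
          float maxv = -1e30f, sum = 0.f;
          for (int c = 0; c < sub; ++c)
            maxv = std::max(maxv, (*blob)[(bg * sub + c) * spatial + s]);
          for (int c = 0; c < sub; ++c) {
            float& v = (*blob)[(bg * sub + c) * spatial + s];
            v = std::exp(v - maxv);                     // shift by max for stability
            sum += v;
          }
          for (int c = 0; c < sub; ++c)
            (*blob)[(bg * sub + c) * spatial + s] /= sum;
        }
      }
    }

    int main() {
      // 1 image, 4 channels, 2 groups, a single spatial position (toy data).
      std::vector<float> x = {1.f, 2.f, 0.5f, 0.5f};
      group_softmax(&x, 1, 4, 1, 2);
      for (float v : x) printf("%.3f ", v);             // two independent 2-way softmaxes
      printf("\n");
      return 0;
    }
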
+ Tensor out = + out_data[channel_operator::kOut].get_with_shape(data_shape, s); + Softmax(out, data); + } else if (param_.op_type == channel_operator::kGroupMax) { + Tensor data = in_data[channel_operator::kData].get(s); + Tensor out = out_data[channel_operator::kOut].get(s); + Tensor max_idx = out_data[channel_operator::kTemp].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(max_idx.CheckContiguous(), true); + + GroupMaxForward(out, data, max_idx, param_.group); + } else if (param_.op_type == channel_operator::kGroupPick) { + Tensor data = in_data[channel_operator::kData].get(s); + Tensor out = out_data[channel_operator::kOut].get(s); + Tensor pick_idx = in_data[channel_operator::kPick].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(pick_idx.CheckContiguous(), true); + + if (param_.pick_type == channel_operator::kScore) { + Tensor argmax_data = + out_data[channel_operator::kTemp].get(s); + GetMaxIdx(pick_idx, argmax_data, param_.group); + GroupPickForward(out, data, argmax_data, param_.group); + } else { + GroupPickForward(out, data, pick_idx, param_.group); } - + } else { + LOG(FATAL) << "No that operation type."; } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - - size_t in_expected; - size_t out_expected; - if (param_.op_type == channel_operator::kGroupMax) { - in_expected = 1; - out_expected = 2; - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - in_expected = 1; + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + + size_t in_expected; + size_t out_expected; + if (param_.op_type == channel_operator::kGroupMax) { + in_expected = 1; + out_expected = 2; + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + in_expected = 1; + out_expected = 1; + } else if (param_.op_type == channel_operator::kGroupPick) { + in_expected = 2; + if (param_.pick_type == channel_operator::kLabel) { out_expected = 1; - } else if (param_.op_type == channel_operator::kGroupPick) { - in_expected = 2; - if (param_.pick_type == channel_operator::kLabel) - out_expected = 1; - else - out_expected = 2; } else { - LOG(FATAL) << "No that operation type."; + out_expected = 2; } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - - if (param_.op_type == channel_operator::kGroupMax) { - Tensor grad_out = out_grad[channel_operator::kOut].get(s); - Tensor max_idx = out_data[channel_operator::kTemp].get(s); - Tensor grad_in = in_grad[channel_operator::kData].get(s); - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(max_idx.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[channel_operator::kData], 0); - GroupMaxBackwardAcc(grad_in, grad_out, max_idx, param_.group); - } else if (param_.op_type == channel_operator::kGroupSoftmax) { + } else { + LOG(FATAL) << "No that operation type."; + } + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + Stream *s = 
ctx.get_stream(); + + if (param_.op_type == channel_operator::kGroupMax) { + Tensor grad_out = out_grad[channel_operator::kOut].get(s); + Tensor max_idx = out_data[channel_operator::kTemp].get(s); + Tensor grad_in = in_grad[channel_operator::kData].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(max_idx.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[channel_operator::kData], 0); + GroupMaxBackwardAcc(grad_in, grad_out, max_idx, param_.group); + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + LOG(FATAL) << "Not Implemented."; + } else if (param_.op_type == channel_operator::kGroupPick) { + Tensor grad_out = out_grad[channel_operator::kOut].get(s); + Tensor pick_idx = in_data[channel_operator::kPick].get(s); + Tensor grad_in = in_grad[channel_operator::kData].get(s); + Tensor pick_diff = in_grad[channel_operator::kPick].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(pick_idx.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[channel_operator::kData], 0); + Assign(pick_diff, req[channel_operator::kPick], 0); + if (param_.pick_type == channel_operator::kScore) { LOG(FATAL) << "Not Implemented."; - } else if (param_.op_type == channel_operator::kGroupPick) { - Tensor grad_out = out_grad[channel_operator::kOut].get(s); - Tensor pick_idx = in_data[channel_operator::kPick].get(s); - Tensor grad_in = in_grad[channel_operator::kData].get(s); - Tensor pick_diff = in_grad[channel_operator::kPick].get(s); - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(pick_idx.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[channel_operator::kData], 0); - Assign(pick_diff, req[channel_operator::kPick], 0); - if (param_.pick_type == channel_operator::kScore) { - LOG(FATAL) << "Not Implemented."; - } else { - GroupPickBackwardAcc(grad_in, grad_out, pick_idx, param_.group); - } - } else { - LOG(FATAL) << "No that operation type."; + GroupPickBackwardAcc(grad_in, grad_out, pick_idx, param_.group); } + } else { + LOG(FATAL) << "No that operation type."; } - private: - ChannelOperatorParam param_; - }; // class ChannelOperatorOp - // Decalre Factory function, used for dispatch specialization - template - Operator* CreateOp(ChannelOperatorParam type, int dtype); + } + private: + ChannelOperatorParam param_; +}; // class ChannelOperatorOp + + // Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(ChannelOperatorParam type, int dtype); #if DMLC_USE_CXX11 - class ChannelOperatorProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.op_type == channel_operator::kGroupMax || - param_.op_type == channel_operator::kGroupSoftmax) { - return{ "data" }; - } - else if (param_.op_type == channel_operator::kGroupPick) { - return{ "data", "pick_idx" }; - } - else { - LOG(FATAL) << "No that operation type."; - return{}; - } +class ChannelOperatorProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.op_type == channel_operator::kGroupMax || + param_.op_type == channel_operator::kGroupSoftmax) { + return{ "data" }; + } else if (param_.op_type == channel_operator::kGroupPick) { + return{ "data", "pick_idx" }; + } else { + LOG(FATAL) << "No that operation type."; + return{}; } + } - std::vector ListOutputs() const override { - if (param_.op_type == channel_operator::kGroupSoftmax) { + std::vector 
ListOutputs() const override { + if (param_.op_type == channel_operator::kGroupSoftmax) { + return{ "output" }; + } else if (param_.op_type == channel_operator::kGroupPick) { + if (param_.pick_type == channel_operator::kLabel) return{ "output" }; - } - else if (param_.op_type == channel_operator::kGroupPick) { - if (param_.pick_type == channel_operator::kLabel) - return{ "output" }; - else - return{ "output", "argmax_data" }; - } - else if (param_.op_type == channel_operator::kGroupMax) { - return{ "output", "max_idx" }; - } - else { - LOG(FATAL) << "No that operation type."; - return{}; - } + else + return{ "output", "argmax_data" }; + } else if (param_.op_type == channel_operator::kGroupMax) { + return{ "output", "max_idx" }; + } else { + LOG(FATAL) << "No that operation type."; + return{}; } + } - int NumOutputs() const override { - if (param_.op_type == channel_operator::kGroupSoftmax) { + int NumOutputs() const override { + if (param_.op_type == channel_operator::kGroupSoftmax) { + return 1; + } else if (param_.op_type == channel_operator::kGroupPick) { + if (param_.pick_type == channel_operator::kLabel) { return 1; - } - else if (param_.op_type == channel_operator::kGroupPick) { - if (param_.pick_type == channel_operator::kLabel) - return 1; - else - return 2; - } - else if (param_.op_type == channel_operator::kGroupMax) { + } else { return 2; } - else { - LOG(FATAL) << "No that operation type."; - return NULL; - } - } - - int NumVisibleOutputs() const override { - return 1; - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); + } else if (param_.op_type == channel_operator::kGroupMax) { + return 2; + } else { + LOG(FATAL) << "No that operation type."; + return NULL; } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - - - // data: [batch_size, c, h, w] - TShape dshape = in_shape->at(channel_operator::kData); - CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; - - if (param_.op_type == channel_operator::kGroupMax) { - CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - // out: [num_rois, group, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], param_.group, dshape[2], dshape[3])); + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + + + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(channel_operator::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + if (param_.op_type == channel_operator::kGroupMax) { + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + // out: [num_rois, group, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], param_.group, dshape[2], dshape[3])); + out_shape->push_back( + Shape4(dshape[0], param_.group, dshape[2], dshape[3])); + return true; + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; + // out: [num_rois, c, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], dshape[1], dshape[2], dshape[3])); + return true; + } else if (param_.op_type == channel_operator::kGroupPick) { + 
CHECK_EQ(in_shape->size(), 2) << "Input:[data, pick_idx]"; + // out: [num_rois, c/group, h, w] + out_shape->clear(); + out_shape->push_back( + Shape4(dshape[0], dshape[1] / param_.group, dshape[2], dshape[3])); + if (param_.pick_type == channel_operator::kScore) { out_shape->push_back( - Shape4(dshape[0], param_.group, dshape[2], dshape[3])); - return true; + Shape4(dshape[0], 1, 1, 1)); } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - // out: [num_rois, c, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], dshape[1], dshape[2], dshape[3])); - return true; - } - else if (param_.op_type == channel_operator::kGroupPick) { - CHECK_EQ(in_shape->size(), 2) << "Input:[data, pick_idx]"; - // out: [num_rois, c/group, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], dshape[1] / param_.group, dshape[2], dshape[3])); - if (param_.pick_type == channel_operator::kScore) { - out_shape->push_back( - Shape4(dshape[0], 1, 1, 1)); - } - return true; - } - else { - LOG(FATAL) << "No that operation type."; - return false; - } - + return true; + } else { + LOG(FATAL) << "No that operation type."; + return false; } - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "Input must have specified type"; - - if (param_.op_type == channel_operator::kGroupMax) { - CHECK_EQ(in_type->size(), 1); - out_type->clear(); - out_type->push_back(dtype); + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "Input must have specified type"; + + if (param_.op_type == channel_operator::kGroupMax) { + CHECK_EQ(in_type->size(), 1); + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + CHECK_EQ(in_type->size(), 1); + out_type->clear(); + out_type->push_back(dtype); + return true; + } else if (param_.op_type == channel_operator::kGroupPick) { + CHECK_EQ(in_type->size(), 2); + out_type->clear(); + out_type->push_back(dtype); + if (param_.pick_type == channel_operator::kScore) { out_type->push_back(dtype); - return true; } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - CHECK_EQ(in_type->size(), 1); - out_type->clear(); - out_type->push_back(dtype); - return true; - } - else if (param_.op_type == channel_operator::kGroupPick) { - CHECK_EQ(in_type->size(), 2); - out_type->clear(); - out_type->push_back(dtype); - if (param_.pick_type == channel_operator::kScore) { - out_type->push_back(dtype); - } - return true; - } - else { - LOG(FATAL) << "No that operation type."; - return false; - } - + return true; + } else { + LOG(FATAL) << "No that operation type."; + return false; } - OperatorProperty* Copy() const override { - ChannelOperatorProp* channel_operator_sym = new ChannelOperatorProp(); - channel_operator_sym->param_ = this->param_; - return channel_operator_sym; + } + + OperatorProperty* Copy() const override { + ChannelOperatorProp* channel_operator_sym = new ChannelOperatorProp(); + channel_operator_sym->param_ = this->param_; + return channel_operator_sym; + } + + std::string TypeString() const override { + return "_contrib_ChannelOperator"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + 
const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.op_type == channel_operator::kGroupMax) { + return{ out_grad[channel_operator::kOut], + out_data[channel_operator::kTemp] }; + } else if (param_.op_type == channel_operator::kGroupSoftmax) { + return{ out_grad[channel_operator::kOut], + out_data[channel_operator::kOut] }; + } else if (param_.op_type == channel_operator::kGroupPick) { + return{ out_grad[channel_operator::kOut], + in_data[channel_operator::kPick] }; + } else { + LOG(FATAL) << "No that operation type."; + return{}; } + } - std::string TypeString() const override { - return "_contrib_ChannelOperator"; - } - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - if (param_.op_type == channel_operator::kGroupMax) { - return{ out_grad[channel_operator::kOut], - out_data[channel_operator::kTemp] }; - } - else if (param_.op_type == channel_operator::kGroupSoftmax) { - return{ out_grad[channel_operator::kOut], - out_data[channel_operator::kOut] }; - } - else if (param_.op_type == channel_operator::kGroupPick) { - return{ out_grad[channel_operator::kOut], - in_data[channel_operator::kPick] }; - } - else { - LOG(FATAL) << "No that operation type."; - return{}; - } - } - - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; - private: - ChannelOperatorParam param_; - }; // class PSROIPoolingAlignProp + private: + ChannelOperatorParam param_; +}; // class PSROIPoolingAlignProp #endif } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index bac840774cd6..c2ca88ef532d 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -7,8 +7,8 @@ * \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai */ -#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ #include #include @@ -36,446 +36,454 @@ namespace conv { enum DeformableConvolutionOpResource { kTempSpace }; } - struct DeformableConvolutionParam : public dmlc::Parameter { - TShape kernel; - TShape stride; - TShape dilate; - TShape pad; - uint32_t num_filter; - uint32_t num_group; - uint32_t num_deformable_group; - uint64_t workspace; - bool no_bias; - dmlc::optional layout; - DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { - DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(stride).set_default(TShape()) - .describe("convolution stride: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(dilate).set_default(TShape()) - .describe("convolution dilate: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(pad).set_default(TShape()) - .describe("pad for convolution: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) - 
.describe("convolution filter(channel) number"); - DMLC_DECLARE_FIELD(num_group).set_default(1) - .describe("Number of group partitions."); - DMLC_DECLARE_FIELD(num_deformable_group).set_default(1) - .describe("Number of deformable group partitions."); - DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) - .describe("Maximum temperal workspace allowed for convolution (MB)."); - DMLC_DECLARE_FIELD(no_bias).set_default(false) - .describe("Whether to disable bias parameter."); - DMLC_DECLARE_FIELD(layout) - .add_enum("NCW", mshadow::kNCW) - .add_enum("NCHW", mshadow::kNCHW) - .add_enum("NCDHW", mshadow::kNCDHW) - .set_default(dmlc::optional()) - .describe("Set layout for input, output and weight. Empty for\n " - "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); +struct DeformableConvolutionParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape dilate; + TShape pad; + uint32_t num_filter; + uint32_t num_group; + uint32_t num_deformable_group; + uint64_t workspace; + bool no_bias; + dmlc::optional layout; + DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { + DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .describe("convolution stride: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(dilate).set_default(TShape()) + .describe("convolution dilate: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for convolution: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) + .describe("convolution filter(channel) number"); + DMLC_DECLARE_FIELD(num_group).set_default(1) + .describe("Number of group partitions."); + DMLC_DECLARE_FIELD(num_deformable_group).set_default(1) + .describe("Number of deformable group partitions."); + DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) + .describe("Maximum temperal workspace allowed for convolution (MB)."); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(layout) + .add_enum("NCW", mshadow::kNCW) + .add_enum("NCHW", mshadow::kNCHW) + .add_enum("NCDHW", mshadow::kNCDHW) + .set_default(dmlc::optional()) + .describe("Set layout for input, output and weight. Empty for\n " + "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); + } +}; + +template +class DeformableConvolutionOp : public Operator { + public: + explicit DeformableConvolutionOp(DeformableConvolutionParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. + param_.workspace = (param_.workspace << 20) / sizeof(DType); + CHECK(param_.layout.value() == mshadow::kNCW || + param_.layout.value() == mshadow::kNCHW || + param_.layout.value() == mshadow::kNCDHW) + << "Only support NCW, NCHW and NCDHW layout"; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[conv::kOut], kWriteTo); + size_t expected = param_.no_bias ? 
3 : 4; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + LayerSetUp(in_data[conv::kData].shape_, + in_data[conv::kOffset].shape_, + out_data[conv::kOut].shape_); + Stream* s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_data[0].shape_[i + 1]; } - }; - - template - class DeformableConvolutionOp : public Operator { - public: - explicit DeformableConvolutionOp(DeformableConvolutionParam p) { - this->param_ = p; - // convert MBytes first to Bytes and then to elements. - param_.workspace = (param_.workspace << 20) / sizeof(DType); - CHECK(param_.layout.value() == mshadow::kNCW || - param_.layout.value() == mshadow::kNCHW || - param_.layout.value() == mshadow::kNCDHW) - << "Only support NCW, NCHW and NCDHW layout"; - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(req[conv::kOut], kWriteTo); - size_t expected = param_.no_bias ? 3 : 4; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(out_data.size(), 1U); - LayerSetUp(in_data[conv::kData].shape_, in_data[conv::kOffset].shape_, out_data[conv::kOut].shape_); - Stream* s = ctx.get_stream(); - // allocate workspace for col_buffer - Tensor workspace = ctx.requested[conv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_), s); - // calculate the shape of col_buffer - TShape col_buffer_shape(num_spatial_axes_ + 1); - col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = out_data[0].shape_[i + 1]; - } - // create a column buffer using workspace and col_buffer_shape - TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); - - // initialize weight and col_buffer 3D tensors for using gemm - index_t M = conv_out_channels_ / group_; - index_t N = conv_out_spatial_dim_; - index_t K = kernel_dim_; - Tensor weight_3d = in_data[conv::kWeight].get_with_shape( - Shape3(group_, M, K), s); - Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, K, N), s); - Tensor output_4d = out_data[conv::kOut].get_with_shape( - Shape4(num_, group_, M, N), s); - for (index_t n = 0; n < num_; ++n) { - // transform image to col_buffer in order to use gemm - deformable_im2col(s, in_data[conv::kData].dptr() + n*input_dim_, - in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_data[conv::kData].shape_, - col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, - col_buffer.dptr()); - Tensor output_3d = output_4d[n]; - for (index_t g = 0; g < group_; ++g) { - ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); - } - } - if (bias_term_) { - Tensor bias = in_data[conv::kBias].get(s); - Tensor output_3d = out_data[conv::kOut].get_with_shape( - Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); - // has bias term, broadcast it to the same shape of output_3d in channel dim - output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_); + // create a column buffer using workspace and 
col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + index_t M = conv_out_channels_ / group_; + index_t N = conv_out_spatial_dim_; + index_t K = kernel_dim_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, M, K), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, K, N), s); + Tensor output_4d = out_data[conv::kOut].get_with_shape( + Shape4(num_, group_, M, N), s); + for (index_t n = 0; n < num_; ++n) { + // transform image to col_buffer in order to use gemm + deformable_im2col(s, in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + param_.num_deformable_group, col_buffer.dptr()); + Tensor output_3d = output_4d[n]; + for (index_t g = 0; g < group_; ++g) { + ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); } } - - virtual void Backward(const OpContext &ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& out_data, - const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - size_t expected = param_.no_bias == 0 ? 4 : 3; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); - LayerSetUp(in_grad[conv::kData].shape_, in_grad[conv::kOffset].shape_, out_grad[conv::kOut].shape_); - Stream *s = ctx.get_stream(); - // allocate workspace for col_buffer - Tensor workspace = ctx.requested[conv::kTempSpace] - .get_space_typed(Shape1(col_buffer_size_), s); - // calculate the shape of col_buffer - TShape col_buffer_shape(num_spatial_axes_ + 1); - col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { - col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; + if (bias_term_) { + Tensor bias = in_data[conv::kBias].get(s); + Tensor output_3d = out_data[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + // has bias term, broadcast it to the same shape of output_3d in channel dim + output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + size_t expected = param_.no_bias == 0 ? 
4 : 3; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); + LayerSetUp(in_grad[conv::kData].shape_, + in_grad[conv::kOffset].shape_, + out_grad[conv::kOut].shape_); + Stream *s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + // For computing dLoss/d(in_data[kData]) + index_t M = kernel_dim_; + index_t N = conv_out_spatial_dim_; + index_t K = conv_out_channels_ / group_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + Tensor out_grad_4d = out_grad[conv::kOut].get_with_shape( + Shape4(num_, group_, K, N), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, M, N), s); + // For computing dLoss/dWeight + Tensor dweight_3d = in_grad[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + + Tensor data_grad = in_grad[conv::kData].FlatTo1D(s); + data_grad = 0; + + + for (index_t n = 0; n < num_; ++n) { + Tensor out_grad_3d = out_grad_4d[n]; + for (index_t g = 0; g < group_; ++g) { + col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); } - // create a column buffer using workspace and col_buffer_shape - TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); - - // initialize weight and col_buffer 3D tensors for using gemm - // For computing dLoss/d(in_data[kData]) - index_t M = kernel_dim_; - index_t N = conv_out_spatial_dim_; - index_t K = conv_out_channels_ / group_; - Tensor weight_3d = in_data[conv::kWeight].get_with_shape( - Shape3(group_, K, M), s); - Tensor out_grad_4d = out_grad[conv::kOut].get_with_shape( - Shape4(num_, group_, K, N), s); - Tensor col_buffer_3d = col_buffer.get_with_shape( - Shape3(group_, M, N), s); - // For computing dLoss/dWeight - Tensor dweight_3d = in_grad[conv::kWeight].get_with_shape( - Shape3(group_, K, M), s); - - Tensor data_grad = in_grad[conv::kData].FlatTo1D(s); - data_grad = 0; - - - for (index_t n = 0; n < num_; ++n) { - Tensor out_grad_3d = out_grad_4d[n]; - for (index_t g = 0; g < group_; ++g) { - col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); - } - // gradient w.r.t. input coordinate data - deformable_col2im_coord(s, col_buffer.dptr(), - in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kOffset].dptr() + n*input_offset_dim_, - in_grad[conv::kData].shape_, col_buffer.shape_, - param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, - in_grad[conv::kOffset].dptr() + n*input_offset_dim_, - req[conv::kData]); - - // gradient w.r.t. input data - deformable_col2im(s, col_buffer.dptr(), - in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_grad[conv::kData].shape_, col_buffer.shape_, - param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, - in_grad[conv::kData].dptr() + n*input_dim_, - req[conv::kData]); - - // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group - im2col(s, in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kData].shape_, - col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, - col_buffer.dptr()); - for (index_t g = 0; g < group_; ++g) { - if (0 == n) { - ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], - dot(out_grad_3d[g], col_buffer_3d[g].T())); - } else { - dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); - } + // gradient w.r.t. input coordinate data + deformable_col2im_coord(s, col_buffer.dptr(), + in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kOffset].dptr() + n*input_offset_dim_, + req[conv::kData]); + + // gradient w.r.t. input data + deformable_col2im(s, col_buffer.dptr(), + in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kData].dptr() + n*input_dim_, + req[conv::kData]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and group + im2col(s, in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + col_buffer.dptr()); + for (index_t g = 0; g < group_; ++g) { + if (0 == n) { + ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], + dot(out_grad_3d[g], col_buffer_3d[g].T())); + } else { + dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); } } - - // gradient w.r.t bias - if (bias_term_) { - Tensor dbias = in_grad[conv::kBias].get(s); - Tensor dout = out_grad[conv::kOut].get_with_shape( - Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); - ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); - } - } - private: - void LayerSetUp(const TShape& ishape, const TShape& offset_shape, const TShape& oshape) { - channel_axis_ = 1; // hard code channel axis - const index_t first_spatial_axis = channel_axis_ + 1; - const index_t num_axes = param_.kernel.ndim() + 2; - num_spatial_axes_ = num_axes - first_spatial_axis; - is_1x1_ = true; - for (index_t i = 0; i < param_.kernel.ndim(); ++i) { - is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; - if (!is_1x1_) break; - } + // gradient w.r.t bias + if (bias_term_) { + Tensor dbias = in_grad[conv::kBias].get(s); + Tensor dout = out_grad[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); + } - // batch size - num_ = ishape[0]; - // number of input channels - channels_ = ishape[1]; - group_ = param_.num_group; - conv_out_channels_ = param_.num_filter; - conv_in_channels_ = channels_; - bias_term_ = !param_.no_bias; - kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); - weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; - conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); - col_offset_ = kernel_dim_ * conv_out_spatial_dim_; - output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; - // size of the column buffer used for storing im2col-ed pixels - col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; - // input/output image size (#channels * height * width) - input_dim_ = 
ishape.ProdShape(1, ishape.ndim()); - input_offset_dim_ = ishape.ProdShape(1, offset_shape.ndim()); - output_dim_ = oshape.ProdShape(1, oshape.ndim()); - num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; - num_kernels_col2im_ = input_dim_; + } + + private: + void LayerSetUp(const TShape& ishape, const TShape& offset_shape, const TShape& oshape) { + channel_axis_ = 1; // hard code channel axis + const index_t first_spatial_axis = channel_axis_ + 1; + const index_t num_axes = param_.kernel.ndim() + 2; + num_spatial_axes_ = num_axes - first_spatial_axis; + is_1x1_ = true; + for (index_t i = 0; i < param_.kernel.ndim(); ++i) { + is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; + if (!is_1x1_) break; } - private: - DeformableConvolutionParam param_; - index_t channel_axis_; // channel axis of the input - index_t channels_; // number of channels of input image - index_t num_spatial_axes_; // number of spatial axes - index_t num_; // batch size - index_t group_; // number of groups - index_t conv_out_channels_; // number of output channels (num_filter) - index_t conv_out_spatial_dim_; // number of pixels of output images per channel - index_t conv_in_channels_; // number of input channels - index_t kernel_dim_; // number of input channels per group * kernel size - index_t weight_offset_; // number of output channels per group * kernel_dim_ - index_t col_offset_; - index_t output_offset_; - index_t col_buffer_size_; - index_t input_dim_; - index_t input_offset_dim_; - index_t output_dim_; - index_t num_kernels_im2col_; - index_t num_kernels_col2im_; - bool bias_term_; // has bias term? - bool is_1x1_; - }; // class ConvolutionOp - - template - Operator* CreateOp(DeformableConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); + // batch size + num_ = ishape[0]; + // number of input channels + channels_ = ishape[1]; + group_ = param_.num_group; + conv_out_channels_ = param_.num_filter; + conv_in_channels_ = channels_; + bias_term_ = !param_.no_bias; + kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; + conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); + col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // size of the column buffer used for storing im2col-ed pixels + col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; + // input/output image size (#channels * height * width) + input_dim_ = ishape.ProdShape(1, ishape.ndim()); + input_offset_dim_ = ishape.ProdShape(1, offset_shape.ndim()); + output_dim_ = oshape.ProdShape(1, oshape.ndim()); + num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; + num_kernels_col2im_ = input_dim_; + } + + private: + DeformableConvolutionParam param_; + index_t channel_axis_; // channel axis of the input + index_t channels_; // number of channels of input image + index_t num_spatial_axes_; // number of spatial axes + index_t num_; // batch size + index_t group_; // number of groups + index_t conv_out_channels_; // number of output channels (num_filter) + index_t conv_out_spatial_dim_; // number of pixels of output images per channel + index_t conv_in_channels_; // number of input channels + index_t kernel_dim_; // number of input channels per group * kernel size + index_t weight_offset_; // number of output channels per group * kernel_dim_ + index_t col_offset_; + index_t output_offset_; + 
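
The counters computed in LayerSetUp above fix the im2col/GEMM geometry of the layer: per group the forward pass computes output(M x N) = weight(M x K) * col_buffer(K x N), with K = kernel_dim_, N = conv_out_spatial_dim_, and the column workspace holding kernel_dim_ * group_ * conv_out_spatial_dim_ elements. A tiny standalone calculation with hypothetical layer sizes:

    #include <cstdio>

    int main() {
      // Hypothetical layer: 256 -> 512 channels, 3x3 kernel, 4 groups,
      // 28x28 output feature map.
      const int conv_in_channels = 256, conv_out_channels = 512, group = 4;
      const int kh = 3, kw = 3, out_h = 28, out_w = 28;

      const int kernel_dim = conv_in_channels / group * kh * kw;  // K (per group)
      const int conv_out_spatial_dim = out_h * out_w;             // N
      const int M = conv_out_channels / group;                    // rows per group
      const long col_buffer_size =
          static_cast<long>(kernel_dim) * group * conv_out_spatial_dim;

      // Forward GEMM per group: output(M x N) = weight(M x K) * col_buffer(K x N).
      printf("M=%d K=%d N=%d, col_buffer elements=%ld\n",
             M, kernel_dim, conv_out_spatial_dim, col_buffer_size);
      return 0;
    }
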
index_t col_buffer_size_; + index_t input_dim_; + index_t input_offset_dim_; + index_t output_dim_; + index_t num_kernels_im2col_; + index_t num_kernels_col2im_; + bool bias_term_; // has bias term? + bool is_1x1_; +}; // class ConvolutionOp + +template +Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); #if DMLC_USE_CXX11 - class DeformableConvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return{ "data", "offset", "weight", "bias" }; - } else { - return{ "data", "offset", "weight" }; - } +class DeformableConvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return{ "data", "offset", "weight", "bias" }; + } else { + return{ "data", "offset", "weight" }; } - - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - } else { - LOG(FATAL) << "not implemented"; - } + } + + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + LOG(FATAL) << "not implemented"; } + } - std::map GetParams() const override { - return param_.__DICT__(); - } + std::map GetParams() const override { + return param_.__DICT__(); + } - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 4U) << "Input:[data, offset, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, offset, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + const TShape &oshp = (*in_shape)[conv::kOffset]; + if (dshp.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + CHECK_EQ(oshp.ndim(), 4U) \ + << "Input offset should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 4U) << "Input:[data, offset, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, offset, weight]"; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); } - out_shape->resize(1, TShape()); - const TShape &dshp = 
(*in_shape)[conv::kData]; - const TShape &oshp = (*in_shape)[conv::kOffset]; - if (dshp.ndim() == 0) return false; - if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshp.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - CHECK_EQ(oshp.ndim(), 4U) \ - << "Input offset should be 4D in batch-num_filter-y-x"; - Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - const index_t ksize_y = static_cast(param_.kernel[0]); - const index_t ksize_x = static_cast(param_.kernel[1]); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \ - << "input num_filter must divide deformable group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<4> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = (dshape[2] + 2 * param_.pad[0] - - (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; - oshape[3] = (dshape[3] + 2 * param_.pad[1] - - (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - CHECK_EQ(oshape[1] % param_.num_deformable_group, 0U) \ - << "output num_filter must divide deformable group size"; - CHECK_EQ(oshape[2], offsetshape[2]) \ - << "output height must equal to offset map height"; - CHECK_EQ(oshape[3], offsetshape[3]) \ - << "output width must equal to offset map width"; - CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ - << "offset filter must divide deformable group size"; - CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), param_.num_deformable_group) \ - << "offset filter must divide deformable group size"; - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
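
The oshape[2]/oshape[3] expressions above are the usual dilated-convolution output-size formula, and the stride-1 back-calculation that follows simply inverts it. A quick standalone check with hypothetical sizes:

    #include <cstdio>

    // out = (in + 2*pad - (dilate*(kernel-1) + 1)) / stride + 1,
    // the expression used for oshape[2] and oshape[3] above.
    int conv_out_size(int in, int kernel, int pad, int stride, int dilate) {
      return (in + 2 * pad - (dilate * (kernel - 1) + 1)) / stride + 1;
    }

    int main() {
      // Hypothetical 3x3 deformable convolution, pad 1, stride 1, dilation 1,
      // on a 14x14 input: the spatial size is preserved.
      const int out = conv_out_size(14, 3, 1, 1, 1);    // 14
      // Back-calculation performed by the incomplete shape inference when
      // stride == 1: in = out + dilate*(kernel-1) - 2*pad.
      const int in_back = out + 1 * (3 - 1) - 2 * 1;    // 14
      printf("out=%d, back-calculated in=%d\n", out, in_back);
      return 0;
    }
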
- oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); - dshape[0] = oshape[0]; - if (param_.stride[0] == 1) { - dshape[2] = oshape[2] + param_.dilate[0] * (ksize_y - 1) - 2 * param_.pad[0]; - } - if (param_.stride[1] == 1) { - dshape[3] = oshape[3] + param_.dilate[1] * (ksize_x - 1) - 2 * param_.pad[1]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(ksize_y, dshape[2] + 2 * param_.pad[0]) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(ksize_x, dshape[3] + 2 * param_.pad[1]) << "kernel size exceed input"; - } - return true; - } else { - LOG(FATAL) << "not implemented"; - return false; + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \ + << "input num_filter must divide deformable group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = (dshape[2] + 2 * param_.pad[0] - + (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; + oshape[3] = (dshape[3] + 2 * param_.pad[1] - + (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + CHECK_EQ(oshape[1] % param_.num_deformable_group, 0U) \ + << "output num_filter must divide deformable group size"; + CHECK_EQ(oshape[2], offsetshape[2]) \ + << "output height must equal to offset map height"; + CHECK_EQ(oshape[3], offsetshape[3]) \ + << "output width must equal to offset map width"; + CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ + << "offset filter must divide deformable group size"; + CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), + param_.num_deformable_group) \ + << "offset filter must divide deformable group size"; + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (param_.stride[0] == 1) { + dshape[2] = oshape[2] + param_.dilate[0] * (ksize_y - 1) - 2 * param_.pad[0]; } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " - << "Expected " << dtype << " v.s. 
given " - << (*in_type)[i] << " at " << ListArguments()[i]; - } + if (param_.stride[1] == 1) { + dshape[3] = oshape[3] + param_.dilate[1] * (ksize_x - 1) - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(ksize_y, dshape[2] + 2 * param_.pad[0]) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(ksize_x, dshape[3] + 2 * param_.pad[1]) << "kernel size exceed input"; } - out_type->clear(); - out_type->push_back(dtype); return true; + } else { + LOG(FATAL) << "not implemented"; + return false; } - - OperatorProperty* Copy() const override { - auto ptr = new DeformableConvolutionProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "_contrib_DeformableConvolution"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return{ out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kOffset], in_data[conv::kWeight] }; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - return{ ResourceRequest::kTempSpace }; - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - return{ ResourceRequest::kTempSpace }; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - DeformableConvolutionParam param_; - }; // class ConvolutionProp + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DeformableConvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "_contrib_DeformableConvolution"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return{ out_grad[conv::kOut], in_data[conv::kData], + in_data[conv::kOffset], in_data[conv::kWeight] }; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + DeformableConvolutionParam param_; +}; // class ConvolutionProp #endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ diff --git a/src/operator/contrib/deformable_psroi_pooling-inl.h b/src/operator/contrib/deformable_psroi_pooling-inl.h index 0f191dc673af..bac557b7bedf 100644 --- a/src/operator/contrib/deformable_psroi_pooling-inl.h +++ b/src/operator/contrib/deformable_psroi_pooling-inl.h @@ -5,8 +5,8 @@ * \brief deformable psroi pooling operator and symbol * \author Yi Li, Guodong Zhang, Jifeng Dai */ -#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ -#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ #include #include @@ -29,248 +29,255 @@ namespace deformablepsroipool { enum DeformablePSROIPoolingOpOutputs { kOut, kTopCount }; } // deformablepsroipool - struct DeformablePSROIPoolingParam : public dmlc::Parameter { - // TShape pooled_size; - float spatial_scale; - int output_dim; - int group_size; - int pooled_size; - int part_size; - int sample_per_part; - float trans_std; - bool no_trans; - DMLC_DECLARE_PARAMETER(DeformablePSROIPoolingParam) { - DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) - .describe("Ratio of input feature map height (or w) to raw image height (or w). 
" - "Equals the reciprocal of total stride in convolutional layers"); - DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); - DMLC_DECLARE_FIELD(group_size).describe("fix group size"); - DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); - DMLC_DECLARE_FIELD(part_size).set_default(0).describe("fix part size"); - DMLC_DECLARE_FIELD(sample_per_part).set_default(1).describe("fix samples per part"); - DMLC_DECLARE_FIELD(trans_std).set_default(0.0).set_range(0.0, 1.0).describe("fix transition std"); - DMLC_DECLARE_FIELD(no_trans).set_default(false) - .describe("Whether to disable trans parameter."); +struct DeformablePSROIPoolingParam : public dmlc::Parameter { + // TShape pooled_size; + float spatial_scale; + int output_dim; + int group_size; + int pooled_size; + int part_size; + int sample_per_part; + float trans_std; + bool no_trans; + DMLC_DECLARE_PARAMETER(DeformablePSROIPoolingParam) { + DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) + .describe("Ratio of input feature map height (or w) to raw image height (or w). " + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(group_size).describe("fix group size"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(part_size).set_default(0).describe("fix part size"); + DMLC_DECLARE_FIELD(sample_per_part).set_default(1).describe("fix samples per part"); + DMLC_DECLARE_FIELD(trans_std).set_default(0.0).set_range(0.0, 1.0) + .describe("fix transition std"); + DMLC_DECLARE_FIELD(no_trans).set_default(false) + .describe("Whether to disable trans parameter."); + } +}; + +template +class DeformablePSROIPoolingOp : public Operator { + public: + explicit DeformablePSROIPoolingOp(DeformablePSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans? 
2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data[deformablepsroipool::kOut].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor out = out_data[deformablepsroipool::kOut].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + out = -FLT_MAX; + top_count = 0.0f; + + Tensor trans; + if (!param_.no_trans) { + trans = in_data[deformablepsroipool::kTrans].get(s); } - }; - - template - class DeformablePSROIPoolingOp : public Operator { - public: - explicit DeformablePSROIPoolingOp(DeformablePSROIPoolingParam p) { - this->param_ = p; + DeformablePSROIPoolForward(out, data, bbox, trans, top_count, param_.no_trans, + param_.spatial_scale, param_.output_dim, param_.group_size, param_.pooled_size, + param_.part_size, param_.sample_per_part, param_.trans_std); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans ? 2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_grad[deformablepsroipool::kOut].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_NE(req[deformablepsroipool::kData], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[deformablepsroipool::kBox], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + // CHECK_NE(req[deformablepsroipool::kTrans], kWriteInplace) << + // "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); + Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); + Tensor grad_trans; + Tensor trans; + if (!param_.no_trans) { + CHECK_EQ(in_grad.size(), 3); + trans = in_data[deformablepsroipool::kTrans].get(s); + grad_trans = in_grad[deformablepsroipool::kTrans].get(s); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - size_t in_expected = param_.no_trans? 
2 : 3; - size_t out_expected = 2; - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_data[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - Stream *s = ctx.get_stream(); - - Tensor data = in_data[deformablepsroipool::kData].get(s); - Tensor bbox = in_data[deformablepsroipool::kBox].get(s); - Tensor out = out_data[deformablepsroipool::kOut].get(s); - Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(bbox.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(top_count.CheckContiguous(), true); - out = -FLT_MAX; - top_count = 0.0f; - - Tensor trans; - if (!param_.no_trans) { - trans = in_data[deformablepsroipool::kTrans].get(s); - } - DeformablePSROIPoolForward(out, data, bbox, trans, top_count, param_.no_trans, param_.spatial_scale, - param_.output_dim, param_.group_size, param_.pooled_size, param_.part_size, param_.sample_per_part, param_.trans_std); - } + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - size_t in_expected = param_.no_trans ? 2 : 3; - size_t out_expected = 2; - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(out_grad[deformablepsroipool::kOut].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], in_data[deformablepsroipool::kBox].shape_[0]); - CHECK_NE(req[deformablepsroipool::kData], kWriteInplace) << - "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; - CHECK_NE(req[deformablepsroipool::kBox], kWriteInplace) << - "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; - // CHECK_NE(req[deformablepsroipool::kTrans], kWriteInplace) << - // "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; - Stream *s = ctx.get_stream(); - - Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); - Tensor data = in_data[deformablepsroipool::kData].get(s); - Tensor bbox = in_data[deformablepsroipool::kBox].get(s); - Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); - Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); - Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); - Tensor grad_trans; - Tensor trans; - if (!param_.no_trans) { - CHECK_EQ(in_grad.size(), 3); - trans = in_data[deformablepsroipool::kTrans].get(s); - grad_trans = in_grad[deformablepsroipool::kTrans].get(s); - } - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(bbox.CheckContiguous(), true); - CHECK_EQ(top_count.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[deformablepsroipool::kData], 0); - if (!param_.no_trans) { - Assign(grad_trans, req[deformablepsroipool::kTrans], 0); - } - DeformablePSROIPoolBackwardAcc(grad_in, grad_trans, grad_out, data, bbox, trans, top_count, param_.no_trans, - param_.spatial_scale, param_.output_dim, 
param_.group_size, param_.pooled_size, param_.part_size, - param_.sample_per_part, param_.trans_std); - Assign(grad_roi, req[deformablepsroipool::kBox], 0); + Assign(grad_in, req[deformablepsroipool::kData], 0); + if (!param_.no_trans) { + Assign(grad_trans, req[deformablepsroipool::kTrans], 0); } + DeformablePSROIPoolBackwardAcc(grad_in, grad_trans, grad_out, data, bbox, trans, + top_count, param_.no_trans, param_.spatial_scale, param_.output_dim, param_.group_size, + param_.pooled_size, param_.part_size, param_.sample_per_part, param_.trans_std); + Assign(grad_roi, req[deformablepsroipool::kBox], 0); + } - private: - DeformablePSROIPoolingParam param_; - }; // class DeformablePSROIPoolingOp + private: + DeformablePSROIPoolingParam param_; +}; // class DeformablePSROIPoolingOp - // Decalre Factory function, used for dispatch specialization - template - Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype); +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype); #if DMLC_USE_CXX11 - class DeformablePSROIPoolingProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.no_trans) { - return{ "data", "rois" }; - } else { - return{ "data", "rois", "trans" }; - } +class DeformablePSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.no_trans) { + return{ "data", "rois" }; + } else { + return{ "data", "rois", "trans" }; } + } - std::vector ListOutputs() const override { - return{ "output", "top_count" }; - } + std::vector ListOutputs() const override { + return{ "output", "top_count" }; + } - int NumOutputs() const override { - return 2; - } + int NumOutputs() const override { + return 2; + } - int NumVisibleOutputs() const override { - return 1; - } + int NumVisibleOutputs() const override { + return 1; + } - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - if (param_.part_size == 0) { - param_.part_size = param_.pooled_size; - } + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.part_size == 0) { + param_.part_size = param_.pooled_size; } - - std::map GetParams() const override { - return param_.__DICT__(); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.no_trans) { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, rois, trans]"; + // trans: [num_rois, 2, pooled_h, pooled_w] + TShape tshape = in_shape->at(deformablepsroipool::kTrans); + CHECK_EQ(tshape.ndim(), 4) << "trans should be a 4D tensor of shape"; } - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - if (param_.no_trans) { - CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; - } else { - CHECK_EQ(in_shape->size(), 3) << "Input:[data, rois, trans]"; - // trans: [num_rois, 2, pooled_h, pooled_w] - TShape tshape = in_shape->at(deformablepsroipool::kTrans); - CHECK_EQ(tshape.ndim(), 4) << "trans should be a 4D tensor of shape"; - } - - // data: [batch_size, c, h, w] - TShape dshape = in_shape->at(deformablepsroipool::kData); - CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; - - // bbox: [num_rois, 5] - TShape bshape = 
in_shape->at(deformablepsroipool::kBox); - CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; - CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; - - // out: [num_rois, c, pooled_h, pooled_w] - // top_count: [num_rois, c, pooled_h, pooled_w] - out_shape->clear(); - out_shape->push_back( - Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); - out_shape->push_back( - Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); - return true; + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(deformablepsroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(deformablepsroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + // top_count: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + DeformablePSROIPoolingProp* deformable_psroi_pooling_sym = new DeformablePSROIPoolingProp(); + deformable_psroi_pooling_sym->param_ = this->param_; + return deformable_psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_DeformablePSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.no_trans) { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], + in_data[deformablepsroipool::kBox], out_data[deformablepsroipool::kTopCount] }; + } else { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], + in_data[deformablepsroipool::kBox], in_data[deformablepsroipool::kTrans], + out_data[deformablepsroipool::kTopCount] }; } + } - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 2); - int dtype = (*in_type)[0]; - CHECK_EQ(dtype, (*in_type)[1]); - CHECK_NE(dtype, -1) << "Input must have specified type"; - - out_type->clear(); - out_type->push_back(dtype); - out_type->push_back(dtype); - return true; - } - OperatorProperty* Copy() const override { - DeformablePSROIPoolingProp* deformable_psroi_pooling_sym = new DeformablePSROIPoolingProp(); - deformable_psroi_pooling_sym->param_ = this->param_; - return deformable_psroi_pooling_sym; - } - - std::string TypeString() const override { - return "_contrib_DeformablePSROIPooling"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - if (param_.no_trans) { - return{ 
out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], - out_data[deformablepsroipool::kTopCount] }; - } else { - return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], in_data[deformablepsroipool::kBox], - in_data[deformablepsroipool::kTrans], out_data[deformablepsroipool::kTopCount] }; - } - } - - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; - private: - DeformablePSROIPoolingParam param_; - }; // class DeformablePSROIPoolingProp + private: + DeformablePSROIPoolingParam param_; +}; // class DeformablePSROIPoolingProp #endif } // namespace op } // namespace mxnet diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu index 94cd1f985507..f9eb01a26e38 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cu +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -83,7 +83,7 @@ namespace cuda { DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 DType roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom @@ -97,9 +97,13 @@ namespace cuda { int part_w = floor(static_cast(pw) / pooled_width*part_size); int class_id = ctop / channels_each_class; DType trans_x = no_trans ? static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; + bottom_trans[(((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w] * trans_std; DType trans_y = no_trans ? static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; + bottom_trans[(((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w] * trans_std; DType wstart = static_cast(pw)* bin_size_w + roi_start_w; @@ -214,7 +218,7 @@ namespace cuda { DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 DType roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom @@ -228,9 +232,13 @@ namespace cuda { int part_w = floor(static_cast(pw) / pooled_width*part_size); int class_id = ctop / channels_each_class; DType trans_x = no_trans ? static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; + bottom_trans[(((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w] * trans_std; DType trans_y = no_trans ? 
static_cast(0) : - bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; + bottom_trans[(((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w] * trans_std; DType wstart = static_cast(pw)* bin_size_w + roi_start_w; @@ -291,8 +299,12 @@ namespace cuda { *trans_std*diff_val; diff_y *= roi_height; - atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w, diff_x); - atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1)*part_size + part_h)*part_size + part_w, diff_y); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w, diff_y); } } } @@ -379,8 +391,9 @@ namespace cuda { const int part_size, const int sample_per_part, const float trans_std) { - cuda::DeformablePSROIPoolBackwardAcc(in_grad, trans_grad, out_grad, data, bbox, trans, top_count, no_trans, - spatial_scale, output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); + cuda::DeformablePSROIPoolBackwardAcc(in_grad, trans_grad, out_grad, data, bbox, trans, + top_count, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); } } // namespace mshadow diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu index e5f877cc937f..052d777d5fac 100644 --- a/src/operator/contrib/multi_proposal.cu +++ b/src/operator/contrib/multi_proposal.cu @@ -63,9 +63,8 @@ __global__ void ProposalGridKernel(const int count, workspace_proposals[index * 5 + 1] = workspace_proposals[a * 5 + 1] + h * feature_stride; workspace_proposals[index * 5 + 2] = workspace_proposals[a * 5 + 2] + w * feature_stride; workspace_proposals[index * 5 + 3] = workspace_proposals[a * 5 + 3] + h * feature_stride; - workspace_proposals[index * 5 + 4] = + workspace_proposals[index * 5 + 4] = scores[((b * (2 * num_anchors) + a + num_anchors) * height + h) * width + w]; - //workspace_proposals[index * 5 + 4] = scores[(a * height + h) * width + w]; } } @@ -432,7 +431,8 @@ class MultiProposalGPUOp : public Operator{ int count_anchors = num_anchors * height * width; // count of total anchors int count = num_images * count_anchors; // set to -1 for max - int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count_anchors; + int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? 
param_.rpn_pre_nms_top_n + : count_anchors; rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors); int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n); @@ -451,11 +451,12 @@ class MultiProposalGPUOp : public Operator{ // Copy generated anchors to GPU float* workspace_proposals_ptr = NULL; - FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, sizeof(float) * num_images * count_anchors * 5)); - Tensor workspace_proposals(workspace_proposals_ptr, Shape3(num_images, count_anchors, 5)); - FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_, - &anchors[0], sizeof(float) * anchors.size(), - cudaMemcpyHostToDevice)); + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, + sizeof(float) * num_images * count_anchors * 5)); + Tensor workspace_proposals(workspace_proposals_ptr, + Shape3(num_images, count_anchors, 5)); + FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_, &anchors[0], + sizeof(float) * anchors.size(), cudaMemcpyHostToDevice)); // Copy proposals to a mesh grid dim3 dimGrid((count + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); @@ -509,7 +510,8 @@ class MultiProposalGPUOp : public Operator{ for (int b = 0; b < num_images; b++) { CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); CopyScoreKernel << > >( - count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, score.dptr_, order.dptr_); + count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, + score.dptr_, order.dptr_); FRCNN_CUDA_CHECK(cudaPeekAtLastError()); // argsort score, save order @@ -525,7 +527,8 @@ class MultiProposalGPUOp : public Operator{ dimGrid.x = (rpn_pre_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals"); ReorderProposalsKernel << > >( - rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, order.dptr_, workspace_ordered_proposals.dptr_); + rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, + order.dptr_, workspace_ordered_proposals.dptr_); FRCNN_CUDA_CHECK(cudaPeekAtLastError()); // perform nms diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu index 67c2d55303fd..b7d7c547e59e 100644 --- a/src/operator/contrib/psroi_pooling.cu +++ b/src/operator/contrib/psroi_pooling.cu @@ -152,7 +152,7 @@ __global__ void PSROIPoolBackwardAccKernel( DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 DType roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom @@ -183,8 +183,8 @@ __global__ void PSROIPoolBackwardAccKernel( DType* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; DType bin_area = (hend - hstart)*(wend - wstart); DType diff_val = is_empty ? (DType)0. 
: top_diff[index] / bin_area; - for (int h = hstart; h < hend; ++h){ - for (int w = wstart; w < wend; ++w){ + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { int bottom_index = h*width + w; atomicAdd(offset_bottom_diff + bottom_index, diff_val); } From 4045fb4888079555fe37f2cfee2c17636125e7ff Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Wed, 17 May 2017 17:15:07 +0800 Subject: [PATCH 04/18] fix programming rule to meet pr rule --- src/operator/contrib/channel_operator-inl.h | 5 +---- src/operator/contrib/deformable_convolution-inl.h | 7 +++---- src/operator/contrib/deformable_psroi_pooling-inl.h | 6 ++++-- src/operator/contrib/psroi_pooling-inl.h | 3 ++- src/operator/contrib/psroi_pooling.cu | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/operator/contrib/channel_operator-inl.h b/src/operator/contrib/channel_operator-inl.h index 38192d302651..c9894897a921 100644 --- a/src/operator/contrib/channel_operator-inl.h +++ b/src/operator/contrib/channel_operator-inl.h @@ -129,7 +129,6 @@ class ChannelOperatorOp : public Operator { } else { LOG(FATAL) << "No that operation type."; } - } virtual void Backward(const OpContext &ctx, @@ -198,8 +197,8 @@ class ChannelOperatorOp : public Operator { } else { LOG(FATAL) << "No that operation type."; } - } + private: ChannelOperatorParam param_; }; // class ChannelOperatorOp @@ -309,7 +308,6 @@ class ChannelOperatorProp : public OperatorProperty { LOG(FATAL) << "No that operation type."; return false; } - } bool InferType(std::vector *in_type, @@ -341,7 +339,6 @@ class ChannelOperatorProp : public OperatorProperty { LOG(FATAL) << "No that operation type."; return false; } - } OperatorProperty* Copy() const override { diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index c2ca88ef532d..6c8d9440f8eb 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -7,8 +7,8 @@ * \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai */ -#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ #include #include @@ -238,7 +238,6 @@ class DeformableConvolutionOp : public Operator { Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); } - } private: @@ -486,4 +485,4 @@ class DeformableConvolutionProp : public OperatorProperty { #endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ diff --git a/src/operator/contrib/deformable_psroi_pooling-inl.h b/src/operator/contrib/deformable_psroi_pooling-inl.h index bac557b7bedf..16a98f76bcff 100644 --- a/src/operator/contrib/deformable_psroi_pooling-inl.h +++ b/src/operator/contrib/deformable_psroi_pooling-inl.h @@ -81,7 +81,8 @@ class DeformablePSROIPoolingOp : public Operator { Tensor data = in_data[deformablepsroipool::kData].get(s); Tensor bbox = in_data[deformablepsroipool::kBox].get(s); Tensor out = out_data[deformablepsroipool::kOut].get(s); - Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount] + .get(s); 
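// [Editor's note — illustrative only, not part of this patch] Tensor layout assumed by
// this Forward pass, as fixed by InferShape elsewhere in this header: data is
// [batch, channels, height, width]; each bbox row is expected to be
// (batch_index, x1, y1, x2, y2), with the coordinates scaled onto the feature map by
// spatial_scale inside the kernel; out and top_count are both
// [num_rois, output_dim, pooled_size, pooled_size]. With the GPU test configuration
// from this series (2 ROIs, output_dim 2, pooled_size 3) that gives a (2, 2, 3, 3)
// output and top_count.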
CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(bbox.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); @@ -125,7 +126,8 @@ class DeformablePSROIPoolingOp : public Operator { Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); Tensor data = in_data[deformablepsroipool::kData].get(s); Tensor bbox = in_data[deformablepsroipool::kBox].get(s); - Tensor top_count = out_data[deformablepsroipool::kTopCount].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount] + .get(s); Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); Tensor grad_trans; diff --git a/src/operator/contrib/psroi_pooling-inl.h b/src/operator/contrib/psroi_pooling-inl.h index 560b09c5a73e..3a3a9c34927c 100644 --- a/src/operator/contrib/psroi_pooling-inl.h +++ b/src/operator/contrib/psroi_pooling-inl.h @@ -104,7 +104,8 @@ class PSROIPoolingOp : public Operator { if (kWriteTo == req[psroipool::kData]) { grad_in = 0.0f; } - PSROIPoolBackwardAcc(grad_in, grad_out, bbox, param_.spatial_scale, param_.output_dim, param_.group_size); + PSROIPoolBackwardAcc(grad_in, grad_out, bbox, param_.spatial_scale, + param_.output_dim, param_.group_size); } if (kWriteTo == req[psroipool::kBox]) { grad_roi = 0.0f; diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu index b7d7c547e59e..962c874c6d1a 100644 --- a/src/operator/contrib/psroi_pooling.cu +++ b/src/operator/contrib/psroi_pooling.cu @@ -56,7 +56,7 @@ __global__ void PSROIPoolForwardKernel( DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; // Force too small ROIs to be 1x1 - DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 DType roi_height = max(roi_end_h - roi_start_h, 0.1); // Compute w and h at bottom From b39a767bf39fcda70db1880999a795a82fb595cb Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Wed, 17 May 2017 17:19:53 +0800 Subject: [PATCH 05/18] minor fix channel operator --- src/operator/contrib/channel_operator-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/channel_operator-inl.h b/src/operator/contrib/channel_operator-inl.h index c9894897a921..0f27f3ae3846 100644 --- a/src/operator/contrib/channel_operator-inl.h +++ b/src/operator/contrib/channel_operator-inl.h @@ -251,7 +251,7 @@ class ChannelOperatorProp : public OperatorProperty { return 2; } else { LOG(FATAL) << "No that operation type."; - return NULL; + return 0; } } From dfa83ef6772735db3fc62464ef6fb903c8dc3076 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Wed, 17 May 2017 17:29:58 +0800 Subject: [PATCH 06/18] minor fix deformable conv --- src/operator/contrib/deformable_convolution-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index 6c8d9440f8eb..ba2707d64391 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -393,7 +393,7 @@ class DeformableConvolutionProp : public OperatorProperty { << "output width must equal to offset map width"; CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ << "offset filter must divide deformable group size"; - CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), + CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), \ param_.num_deformable_group) \ 
<< "offset filter must divide deformable group size"; // Perform incomplete shape inference. Fill in the missing values in data shape. From ec209607624a53fbf8ef2df5750f77383360a57c Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Wed, 17 May 2017 18:56:01 +0800 Subject: [PATCH 07/18] fix a stupid error --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 4ce449c1c130..7526a9234e3d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -135,6 +135,7 @@ List of Contributors * [David Salinas](https://github.com/geoalgo) * [Asmus Hetzel](https://github.com/asmushetzel) * [Roshani Nagmote](https://github.com/Roshrini) +* [Chetan Khatri](https://github.com/chetkhatri/) * [Yuwen Xiong](https://github.com/Orpine/) * [Haozhi Qi](https://github.com/Oh233/) * [Yi Li](https://github.com/liyi14/) From bd7a89da0d6767808e0bd80bf1f9f7b7e6dd00b7 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Tue, 23 May 2017 17:01:15 +0800 Subject: [PATCH 08/18] add test code --- tests/python/gpu/test_operator_gpu.py | 157 ++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index fd3dd9289836..f66dc0aa12a0 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1093,6 +1093,158 @@ def test_unfuse(): check_rnn_consistency(fused, stack) check_rnn_consistency(stack, fused) +def test_psroipooling_with_type(): + np.random.seed(1234) + arg_params = { + 'psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} + + # plain psroipooling + sym = mx.contrib.sym.PSROIPooling(spatial_scale=0.0625, output_dim=2, pooled_size=3, name='psroipool') + ctx_list = [{'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float64, 'psroipool_rois': np.float64}}, + {'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float32, 'psroipool_rois': np.float32}}, + {'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float16, 'psroipool_rois': np.float16}}, + ] + + check_consistency(sym, ctx_list, grad_req={'psroipool_data': 'write', + 'psroipool_rois': 'null'}, arg_params=arg_params) + +def test_deformable_psroipooling_with_type(): + np.random.seed(1234) + arg_params = { + 'deformable_psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} + + # deformable psroipooling + sym = mx.contrib.sym.DeformablePSROIPooling(spatial_scale=0.0625, sample_per_part=4, group_size=3, pooled_size=3, + output_dim=2, trans_std=0.1, no_trans=False, name='deformable_psroipool') + + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float64, 'deformable_psroipool_rois': np.float64, + 'deformable_psroipool_trans': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float32, 'deformable_psroipool_rois': np.float32, + 'deformable_psroipool_trans': np.float32}}, + {'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': 
{'deformable_psroipool_data': np.float16, 'deformable_psroipool_rois': np.float16, + 'deformable_psroipool_trans': np.float16}}, + ] + + check_consistency(sym, ctx_list, grad_req={'deformable_psroipool_data': 'write', + 'deformable_psroipool_rois': 'null', + 'deformable_psroipool_trans': 'write'}, arg_params=arg_params) + +def test_deformable_convolution_with_type(): + np.random.seed(1234) + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), name='deformable_conv') + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 10, 10), + 'deformable_conv_offset': (2, 18, 8, 8), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 10, 10), + 'deformable_conv_offset': (2, 18, 8, 8), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 10, 10), + # 'deformable_conv_offset': (2, 18, 8, 8), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_conv_offset': np.float16}}, + ] + # wider tolerance needed for true-fp16 NCHW test above + tol = {np.dtype(np.float16): 0.5, + np.dtype(np.float32): 1e-3, + np.dtype(np.float64): 1e-5, + np.dtype(np.uint8): 0, + np.dtype(np.int32): 0} + check_consistency(sym, ctx_list, tol=tol) + # test ability to turn off training on bias + check_consistency(sym, ctx_list, grad_req={'deformable_conv_data': 'write', + 'deformable_conv_offset': 'write', + 'deformable_conv_weight': 'write', + 'deformable_conv_bias': 'null'}, tol=tol) +def test_deformable_convolution_options(): + # 2D convolution + + # Pad > 0 + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 7, 7), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 7, 7), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), pad=(1,1), name='deformable_conv') + check_consistency(sym, ctx_list) + + # Stride > 1 + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), stride=(2,2), name='deformable_conv') + check_consistency(sym, ctx_list) + + # Dilate > 1 + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 
'deformable_conv_data': (2, 2, 7, 7), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), dilate=(2,2), name='deformable_conv') + check_consistency(sym, ctx_list) + + # Deformable group > 1 + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 36, 5, 5), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 36, 5, 5), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=4, kernel=(3,3), num_deformable_group=2, + name='deformable_conv') + if __name__ == '__main__': test_countsketch() test_ifft() @@ -1124,3 +1276,8 @@ def test_unfuse(): test_take_with_type() test_bilinear_sampler_with_type() test_grid_generator_with_type() + test_psroipooling_with_type() + test_deformable_psroipooling_with_type() + test_deformable_convolution_options() + test_deformable_convolution_with_type() + From 1349588488cc73fe19129ce62c48ffe34e334b1d Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Tue, 23 May 2017 17:01:38 +0800 Subject: [PATCH 09/18] add test code --- tests/python/gpu/test_operator_gpu.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index f66dc0aa12a0..ed3c23321135 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1153,6 +1153,7 @@ def test_deformable_psroipooling_with_type(): def test_deformable_convolution_with_type(): np.random.seed(1234) sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), name='deformable_conv') + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here ctx_list = [{'ctx': mx.gpu(0), 'deformable_conv_data': (2, 2, 10, 10), 'deformable_conv_offset': (2, 18, 8, 8), @@ -1182,6 +1183,7 @@ def test_deformable_convolution_options(): # 2D convolution # Pad > 0 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here ctx_list = [{'ctx': mx.gpu(0), 'deformable_conv_data': (2, 2, 7, 7), 'deformable_conv_offset': (2, 18, 7, 7), @@ -1192,12 +1194,14 @@ def test_deformable_convolution_options(): 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, # {'ctx': mx.gpu(0), # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_offset': (2, 18, 7, 7), # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, ] sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), pad=(1,1), name='deformable_conv') check_consistency(sym, ctx_list) # Stride > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here ctx_list = [{'ctx': mx.gpu(0), 'deformable_conv_data': (2, 2, 7, 7), 'deformable_conv_offset': (2, 18, 3, 3), @@ -1208,12 +1212,14 @@ def test_deformable_convolution_options(): 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, # {'ctx': mx.gpu(0), # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 18, 3, 3), # 
'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, ] sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), stride=(2,2), name='deformable_conv') check_consistency(sym, ctx_list) # Dilate > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here ctx_list = [{'ctx': mx.gpu(0), 'deformable_conv_data': (2, 2, 7, 7), 'deformable_conv_offset': (2, 18, 3, 3), @@ -1224,12 +1230,14 @@ def test_deformable_convolution_options(): 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, # {'ctx': mx.gpu(0), # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 18, 3, 3), # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, ] sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), dilate=(2,2), name='deformable_conv') check_consistency(sym, ctx_list) # Deformable group > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here ctx_list = [{'ctx': mx.gpu(0), 'deformable_conv_data': (2, 2, 7, 7), 'deformable_conv_offset': (2, 36, 5, 5), @@ -1240,6 +1248,7 @@ def test_deformable_convolution_options(): 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, # {'ctx': mx.gpu(0), # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 36, 5, 5), # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, ] sym = mx.contrib.sym.DeformableConvolution(num_filter=4, kernel=(3,3), num_deformable_group=2, From 9109283eb2052d633648e3a7441e2a8d21103a81 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 27 May 2017 18:27:51 +0800 Subject: [PATCH 10/18] remove channel operator and add gradient check --- src/operator/contrib/channel_operator-inl.h | 390 -------------------- src/operator/contrib/channel_operator.cc | 96 ----- src/operator/contrib/channel_operator.cu | 301 --------------- tests/python/unittest/test_operator.py | 103 ++++++ 4 files changed, 103 insertions(+), 787 deletions(-) delete mode 100644 src/operator/contrib/channel_operator-inl.h delete mode 100644 src/operator/contrib/channel_operator.cc delete mode 100644 src/operator/contrib/channel_operator.cu diff --git a/src/operator/contrib/channel_operator-inl.h b/src/operator/contrib/channel_operator-inl.h deleted file mode 100644 index 0f27f3ae3846..000000000000 --- a/src/operator/contrib/channel_operator-inl.h +++ /dev/null @@ -1,390 +0,0 @@ -/*! - * Copyright (c) 2017 Microsoft - * Licensed under The Apache-2.0 License [see LICENSE for details] - * \file channel_operator-inl.h - * \brief - * \author Haozhi Qi, Yi Li, Guodong Zhang, Jifeng Dai -*/ -#ifndef MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ -#define MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" - -namespace mxnet { -namespace op { - // Declare enumeration of input order to make code more intuitive. 
- // // These enums are only visible within this header -namespace channel_operator { - enum ChannelOperatorOpInputs { kData, kPick }; - enum ChannelOperatorOpOutputs { kOut, kTemp }; - enum ChannelOperatorOpType { kGroupMax, kGroupPick, kGroupSoftmax }; - enum ChannelOperatorOpPickType { kLabel, kScore }; -} // activation - -struct ChannelOperatorParam : public dmlc::Parameter { - // use int for enumeration - int op_type; - int group; - int pick_type; - DMLC_DECLARE_PARAMETER(ChannelOperatorParam) { - DMLC_DECLARE_FIELD(op_type) - .add_enum("Group_Max", channel_operator::kGroupMax) - .add_enum("Group_Pick", channel_operator::kGroupPick) - .add_enum("Group_Softmax", channel_operator::kGroupSoftmax) - .describe("Channel operator to be applied."); - DMLC_DECLARE_FIELD(group).describe("group size"); - DMLC_DECLARE_FIELD(pick_type) - .add_enum("Label_Pick", channel_operator::kLabel) - .add_enum("Score_Pick", channel_operator::kScore) - .set_default(channel_operator::kLabel) - .describe("pick type"); - } -}; - -/** -* \brief This is the implementation of channel operator. -* \tparam xpu The device that the op will be executed on. -*/ -template -class ChannelOperatorOp : public Operator { - public: - explicit ChannelOperatorOp(ChannelOperatorParam p) { - this->param_ = p; - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - size_t in_expected; - size_t out_expected; - if (param_.op_type == channel_operator::kGroupMax) { - in_expected = 1; - out_expected = 2; - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - in_expected = 1; - out_expected = 1; - } else if (param_.op_type == channel_operator::kGroupPick) { - in_expected = 2; - if (param_.pick_type == channel_operator::kLabel) { - out_expected = 1; - } else { - out_expected = 2; - } - } else { - LOG(FATAL) << "No that operation type."; - } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - if (param_.op_type == channel_operator::kGroupSoftmax) { - int total_size = in_data[channel_operator::kData].Size(); - int batch_size = in_data[channel_operator::kData].shape_[0]; - int channel_num = in_data[channel_operator::kData].shape_[1]; - int rest_size = total_size / (batch_size * channel_num); - const Shape<3> data_shape = - Shape3(batch_size*param_.group, channel_num / param_.group, rest_size); - - Tensor data = - in_data[channel_operator::kData].get_with_shape(data_shape, s); - Tensor out = - out_data[channel_operator::kOut].get_with_shape(data_shape, s); - Softmax(out, data); - } else if (param_.op_type == channel_operator::kGroupMax) { - Tensor data = in_data[channel_operator::kData].get(s); - Tensor out = out_data[channel_operator::kOut].get(s); - Tensor max_idx = out_data[channel_operator::kTemp].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(max_idx.CheckContiguous(), true); - - GroupMaxForward(out, data, max_idx, param_.group); - } else if (param_.op_type == channel_operator::kGroupPick) { - Tensor data = in_data[channel_operator::kData].get(s); - Tensor out = out_data[channel_operator::kOut].get(s); - Tensor pick_idx = in_data[channel_operator::kPick].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(pick_idx.CheckContiguous(), true); - - if (param_.pick_type == 
channel_operator::kScore) { - Tensor argmax_data = - out_data[channel_operator::kTemp].get(s); - GetMaxIdx(pick_idx, argmax_data, param_.group); - GroupPickForward(out, data, argmax_data, param_.group); - } else { - GroupPickForward(out, data, pick_idx, param_.group); - } - } else { - LOG(FATAL) << "No that operation type."; - } - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - - size_t in_expected; - size_t out_expected; - if (param_.op_type == channel_operator::kGroupMax) { - in_expected = 1; - out_expected = 2; - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - in_expected = 1; - out_expected = 1; - } else if (param_.op_type == channel_operator::kGroupPick) { - in_expected = 2; - if (param_.pick_type == channel_operator::kLabel) { - out_expected = 1; - } else { - out_expected = 2; - } - } else { - LOG(FATAL) << "No that operation type."; - } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - - if (param_.op_type == channel_operator::kGroupMax) { - Tensor grad_out = out_grad[channel_operator::kOut].get(s); - Tensor max_idx = out_data[channel_operator::kTemp].get(s); - Tensor grad_in = in_grad[channel_operator::kData].get(s); - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(max_idx.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[channel_operator::kData], 0); - GroupMaxBackwardAcc(grad_in, grad_out, max_idx, param_.group); - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - LOG(FATAL) << "Not Implemented."; - } else if (param_.op_type == channel_operator::kGroupPick) { - Tensor grad_out = out_grad[channel_operator::kOut].get(s); - Tensor pick_idx = in_data[channel_operator::kPick].get(s); - Tensor grad_in = in_grad[channel_operator::kData].get(s); - Tensor pick_diff = in_grad[channel_operator::kPick].get(s); - - CHECK_EQ(grad_out.CheckContiguous(), true); - CHECK_EQ(pick_idx.CheckContiguous(), true); - CHECK_EQ(grad_in.CheckContiguous(), true); - - Assign(grad_in, req[channel_operator::kData], 0); - Assign(pick_diff, req[channel_operator::kPick], 0); - if (param_.pick_type == channel_operator::kScore) { - LOG(FATAL) << "Not Implemented."; - } else { - GroupPickBackwardAcc(grad_in, grad_out, pick_idx, param_.group); - } - - } else { - LOG(FATAL) << "No that operation type."; - } - } - - private: - ChannelOperatorParam param_; -}; // class ChannelOperatorOp - - // Decalre Factory function, used for dispatch specialization -template -Operator* CreateOp(ChannelOperatorParam type, int dtype); - -#if DMLC_USE_CXX11 -class ChannelOperatorProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.op_type == channel_operator::kGroupMax || - param_.op_type == channel_operator::kGroupSoftmax) { - return{ "data" }; - } else if (param_.op_type == channel_operator::kGroupPick) { - return{ "data", "pick_idx" }; - } else { - LOG(FATAL) << "No that operation type."; - return{}; - } - } - - std::vector ListOutputs() const override { - if (param_.op_type == channel_operator::kGroupSoftmax) { - return{ "output" }; - } else if (param_.op_type == channel_operator::kGroupPick) { - if (param_.pick_type == channel_operator::kLabel) - return{ "output" }; - else - return{ 
"output", "argmax_data" }; - } else if (param_.op_type == channel_operator::kGroupMax) { - return{ "output", "max_idx" }; - } else { - LOG(FATAL) << "No that operation type."; - return{}; - } - } - - int NumOutputs() const override { - if (param_.op_type == channel_operator::kGroupSoftmax) { - return 1; - } else if (param_.op_type == channel_operator::kGroupPick) { - if (param_.pick_type == channel_operator::kLabel) { - return 1; - } else { - return 2; - } - } else if (param_.op_type == channel_operator::kGroupMax) { - return 2; - } else { - LOG(FATAL) << "No that operation type."; - return 0; - } - } - - int NumVisibleOutputs() const override { - return 1; - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - - - // data: [batch_size, c, h, w] - TShape dshape = in_shape->at(channel_operator::kData); - CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; - - if (param_.op_type == channel_operator::kGroupMax) { - CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - // out: [num_rois, group, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], param_.group, dshape[2], dshape[3])); - out_shape->push_back( - Shape4(dshape[0], param_.group, dshape[2], dshape[3])); - return true; - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; - // out: [num_rois, c, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], dshape[1], dshape[2], dshape[3])); - return true; - } else if (param_.op_type == channel_operator::kGroupPick) { - CHECK_EQ(in_shape->size(), 2) << "Input:[data, pick_idx]"; - // out: [num_rois, c/group, h, w] - out_shape->clear(); - out_shape->push_back( - Shape4(dshape[0], dshape[1] / param_.group, dshape[2], dshape[3])); - if (param_.pick_type == channel_operator::kScore) { - out_shape->push_back( - Shape4(dshape[0], 1, 1, 1)); - } - return true; - } else { - LOG(FATAL) << "No that operation type."; - return false; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "Input must have specified type"; - - if (param_.op_type == channel_operator::kGroupMax) { - CHECK_EQ(in_type->size(), 1); - out_type->clear(); - out_type->push_back(dtype); - out_type->push_back(dtype); - return true; - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - CHECK_EQ(in_type->size(), 1); - out_type->clear(); - out_type->push_back(dtype); - return true; - } else if (param_.op_type == channel_operator::kGroupPick) { - CHECK_EQ(in_type->size(), 2); - out_type->clear(); - out_type->push_back(dtype); - if (param_.pick_type == channel_operator::kScore) { - out_type->push_back(dtype); - } - return true; - } else { - LOG(FATAL) << "No that operation type."; - return false; - } - } - - OperatorProperty* Copy() const override { - ChannelOperatorProp* channel_operator_sym = new ChannelOperatorProp(); - channel_operator_sym->param_ = this->param_; - return channel_operator_sym; - } - - std::string TypeString() const override { - return "_contrib_ChannelOperator"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector 
&out_data) const override { - if (param_.op_type == channel_operator::kGroupMax) { - return{ out_grad[channel_operator::kOut], - out_data[channel_operator::kTemp] }; - } else if (param_.op_type == channel_operator::kGroupSoftmax) { - return{ out_grad[channel_operator::kOut], - out_data[channel_operator::kOut] }; - } else if (param_.op_type == channel_operator::kGroupPick) { - return{ out_grad[channel_operator::kOut], - in_data[channel_operator::kPick] }; - } else { - LOG(FATAL) << "No that operation type."; - return{}; - } - } - - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - - private: - ChannelOperatorParam param_; -}; // class PSROIPoolingAlignProp -#endif -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_CHANNEL_OPERATOR_INL_H_ diff --git a/src/operator/contrib/channel_operator.cc b/src/operator/contrib/channel_operator.cc deleted file mode 100644 index ae49244e0866..000000000000 --- a/src/operator/contrib/channel_operator.cc +++ /dev/null @@ -1,96 +0,0 @@ -/*! - * Copyright (c) 2017 Microsoft - * Licensed under The Apache-2.0 License [see LICENSE for details] - * \file channel_operator.cc - * \brief - * \author Haozhi Qi, Yi Li, Guodong Zhang, Jifeng Dai -*/ -#include "./channel_operator-inl.h" -#include -#include -#include -#include -#include - -using std::max; -using std::min; -using std::floor; -using std::ceil; - -namespace mshadow { - template - inline void GroupMaxForward(const Tensor &out, - const Tensor &data, - const Tensor &max_idx, - const int group) { - // NOT_IMPLEMENTED; - return; - } - template - inline void GroupPickForward(const Tensor &out, - const Tensor &data, - const Tensor &pick_idx, - const int group) { - // NOT_IMPLEMENTED; - return; - } - - template - inline void GroupMaxBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &max_idx, - const int group) { - // NOT_IMPLEMENTED; - return; - } - - template - inline void GroupPickBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &pick_idx, - const int group) { - // NOT_IMPLEMENTED; - return; - } - - template - inline void GetMaxIdx(const Tensor &pick_score, - const Tensor &argmax, - const int group) { - // NOT_IMPLEMENTED; - return; - } -} // namespace mshadow - -namespace mxnet { -namespace op { - - template<> - Operator *CreateOp(ChannelOperatorParam param, int dtype) { - Operator* op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ChannelOperatorOp(param); - }); - return op; - } - - Operator *ChannelOperatorProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - std::vector out_type, aux_type; - CHECK(InferType(in_type, &out_type, &aux_type)); - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); - } - - DMLC_REGISTER_PARAMETER(ChannelOperatorParam); - - MXNET_REGISTER_OP_PROPERTY(_contrib_ChannelOperator, ChannelOperatorProp) - .describe("Performs channel operation on inputs, including GroupMax, GroupSoftmax, GroupPick " - "and ChannelPick. 
This layer is designed for FCIS ") - .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") - .add_argument("pick_idx", "Symbol", "In GroupPick or ChannelPick mode, pick_idx is used to" - "pick specific group or channel") - .add_arguments(ChannelOperatorParam::__FIELDS__()); -} // namespace op -} // namespace mxnet diff --git a/src/operator/contrib/channel_operator.cu b/src/operator/contrib/channel_operator.cu deleted file mode 100644 index 30f51e338754..000000000000 --- a/src/operator/contrib/channel_operator.cu +++ /dev/null @@ -1,301 +0,0 @@ -/*! - * Copyright (c) 2017 Microsoft - * Licensed under The Apache-2.0 License [see LICENSE for details] - * \file channel_operator.cu - * \brief - * \author Haozhi Qi, Yi Li, Guodong Zhang, Jifeng Dai -*/ -#include "./channel_operator-inl.h" -#include -#include -#include -#include -#include "../../common/cuda_utils.h" -#include "../mxnet_op.h" - -#define ChannelOperator_CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) -#define CUDA_KERNEL_LOOP(i, n) \ -for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -namespace mshadow { -namespace cuda { - template - __global__ void GroupMaxForwardKernel( - const int count, - const DType* bottom_data, - const int channels, - const int group, - const int channels_in_group, - const int spatial_dim, - DType* top_data, - DType* max_idx_data) { - CUDA_KERNEL_LOOP(index, count) { - int s = index % spatial_dim; - int g = (index / spatial_dim) % group; - int n = index / spatial_dim / group; - - DType max_val = -FLT_MAX; - int max_idx = -1; - for (int i = 0; i < channels_in_group; ++i) { - int c = g*channels_in_group + i; - int bottom_index = (n*channels + c)*spatial_dim + s; - if (bottom_data[bottom_index] > max_val) { - max_val = bottom_data[bottom_index]; - max_idx = c; - } - } - top_data[index] = max_val; - max_idx_data[index] = max_idx; - } - } - - template - __global__ void GroupPickForwardKernel( - const int count, - const DType* bottom_data, - const int channels, - const int group, - const int channels_in_group, - const int spatial_dim, - DType* top_data, - const DType* pick_idx_data) { - CUDA_KERNEL_LOOP(index, count) { - int s = index % spatial_dim; - int c = (index / spatial_dim) % channels_in_group; - int n = index / spatial_dim / channels_in_group; - int g = pick_idx_data[n]; - int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; - - top_data[index] = (g < group && g >= 0) ? 
bottom_data[bottom_index] : DType(0); - } - } - - template - inline void GroupMaxForward(const Tensor &out, - const Tensor &data, - const Tensor &max_idx, - const int group) { - // LOG(INFO) << "GroupMaxForward"; - const DType *bottom_data = data.dptr_; - DType *top_data = out.dptr_; - DType *max_idx_data = max_idx.dptr_; - const int count = out.shape_.Size(); - const int channels = data.size(1); - const int height = data.size(2); - const int width = data.size(3); - const int spatial_dim = height * width; - const int channels_in_group = channels / group; - cudaStream_t stream = Stream::GetStream(out.stream_); - GroupMaxForwardKernel << > >( - count, bottom_data, channels, group, - channels_in_group, spatial_dim, top_data, max_idx_data); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } - - template - inline void GroupPickForward(const Tensor &out, - const Tensor &data, - const Tensor &pick_idx, - const int group) { - // LOG(INFO) << "GroupPickForward"; - const DType *bottom_data = data.dptr_; - DType *top_data = out.dptr_; - const DType *pick_idx_data = pick_idx.dptr_; - const int count = out.shape_.Size(); - const int channels = data.size(1); - const int height = data.size(2); - const int width = data.size(3); - const int spatial_dim = height * width; - const int channels_in_group = channels / group; - cudaStream_t stream = Stream::GetStream(out.stream_); - GroupPickForwardKernel << > >( - count, bottom_data, channels, group, - channels_in_group, spatial_dim, top_data, pick_idx_data); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } - - - template - __global__ void GroupMaxBackwardAccKernel( - const int count, - const DType* top_diff, - const DType* max_idx_data, - const int channels, - const int group, - const int spatial_dim, - DType* bottom_diff) { - CUDA_KERNEL_LOOP(index, count) { - int s = index % spatial_dim; - int n = index / spatial_dim / group; - - int c = max_idx_data[index]; - int bottom_index = (n*channels + c)*spatial_dim + s; - bottom_diff[bottom_index] = top_diff[index]; - } - } - - template - __global__ void GroupPickBackwardAccKernel( - const int count, - const DType* top_diff, - const DType* pick_idx_data, - const int channels, - const int group, - const int channels_in_group, - const int spatial_dim, - DType* bottom_diff) { - CUDA_KERNEL_LOOP(index, count) { - int s = index % spatial_dim; - int c = (index / spatial_dim) % channels_in_group; - int n = index / spatial_dim / channels_in_group; - int g = pick_idx_data[n]; - - int bottom_index = (n*channels + g*channels_in_group + c)*spatial_dim + s; - bottom_diff[bottom_index] = (g < group && g >= 0) ? 
top_diff[index] : DType(0); - } - } - - - template - inline void GroupMaxBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &max_idx, - const int group) { - // LOG(INFO) << "GroupMaxBackward"; - const DType *top_diff = out_grad.dptr_; - DType *bottom_diff = in_grad.dptr_; - const DType *max_idx_data = max_idx.dptr_; - const int count = out_grad.shape_.Size(); - const int channels = in_grad.size(1); - const int height = in_grad.size(2); - const int width = in_grad.size(3); - const int spatial_dim = height * width; - cudaStream_t stream = Stream::GetStream(in_grad.stream_); - GroupMaxBackwardAccKernel << > >( - count, top_diff, max_idx_data, channels, group, spatial_dim, bottom_diff); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } - - template - inline void GroupPickBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &pick_idx, - const int group) { - // LOG(INFO) << "GroupPickBackward"; - const DType *top_diff = out_grad.dptr_; - DType *bottom_diff = in_grad.dptr_; - const DType *pick_idx_data = pick_idx.dptr_; - const int count = out_grad.shape_.Size(); - const int channels = in_grad.size(1); - const int height = in_grad.size(2); - const int width = in_grad.size(3); - const int spatial_dim = height * width; - const int channels_in_group = channels / group; - cudaStream_t stream = Stream::GetStream(in_grad.stream_); - GroupPickBackwardAccKernel << > >( - count, top_diff, pick_idx_data, channels, group, - channels_in_group, spatial_dim, bottom_diff); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } - // GetMaxIdx - template - __global__ void GetMaxIdxKernel( - const int count, - const DType* pick_score_data, - DType* argmax_data, - const int group) { - CUDA_KERNEL_LOOP(index, count) { - const DType* offset_pick_score_data = pick_score_data + index*group; - int max_idx = -1; - DType max_val = -FLT_MAX; - for (int i = 1; i < group; ++i) { - max_idx = offset_pick_score_data[i] > max_val ? i : max_idx; - max_val = offset_pick_score_data[i] > max_val ? 
offset_pick_score_data[i] : max_val; - } - argmax_data[index] = static_cast(max_idx); - } - } - - template - inline void GetMaxIdx(const Tensor &pick_score, - const Tensor &argmax, - const int group) { - // LOG(INFO) << "GroupPickBackward"; - const DType *pick_score_data = pick_score.dptr_; - DType *argmax_data = argmax.dptr_; - const int count = argmax.shape_.Size(); - - cudaStream_t stream = Stream::GetStream(argmax.stream_); - GetMaxIdxKernel << > >( - count, pick_score_data, argmax_data, group); - ChannelOperator_CUDA_CHECK(cudaPeekAtLastError()); - } -} // namespace cuda - - template - inline void GroupMaxForward(const Tensor &out, - const Tensor &data, - const Tensor &max_idx, - const int group) { - cuda::GroupMaxForward(out, data, max_idx, group); - } - template - inline void GroupPickForward(const Tensor &out, - const Tensor &data, - const Tensor &pick_idx, - const int group) { - cuda::GroupPickForward(out, data, pick_idx, group); - } - - template - inline void GroupMaxBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &max_idx, - const int group) { - cuda::GroupMaxBackwardAcc(in_grad, out_grad, max_idx, group); - } - - template - inline void GroupPickBackwardAcc(const Tensor &in_grad, - const Tensor &out_grad, - const Tensor &pick_idx, - const int group) { - cuda::GroupPickBackwardAcc(in_grad, out_grad, pick_idx, group); - } - - template - inline void GetMaxIdx(const Tensor &pick_score, - const Tensor &argmax, - const int group) { - cuda::GetMaxIdx(pick_score, argmax, group); - } - -} // namespace mshadow - - -namespace mxnet { -namespace op { - - template<> - Operator* CreateOp(ChannelOperatorParam param, int dtype) { - Operator* op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ChannelOperatorOp(param); - }); - return op; - } - -} // namespace op -} // namespace mxnet diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 82c20cdb17df..e30ff26397a4 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -2,6 +2,7 @@ import numpy as np import mxnet as mx import random +import itertools from numpy.testing import assert_allclose from mxnet.test_utils import * @@ -3109,6 +3110,104 @@ def create_operator(self, ctx, shapes, dtypes): check_numeric_gradient(op, [x]) +def test_psroipooling(): + for num_rois in [1, 2]: + for num_classes, num_group in itertools.product([2, 3], [2, 3]): + for image_height, image_width in itertools.product([168, 224], [168, 224]): + print num_rois, num_classes, num_group, (image_height, image_width) + for grad_nodes in [['im_data']]: + spatial_scale = 0.0625 + feat_height = image_height * spatial_scale + feat_width = image_width * spatial_scale + im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) + rois_data = np.zeros([num_rois, 5]) + rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) + rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1)) + + im_data_var = mx.symbol.Variable(name="im_data") + rois_data_var = mx.symbol.Variable(name="rois_data") + op = mx.contrib.sym.PSROIPooling(data=im_data_var, rois=rois_data_var, spatial_scale=spatial_scale, + group_size=num_group, pooled_size=num_group, + output_dim=num_classes, name='test_op') + rtol, atol = 1e-2, 1e-4 + check_numeric_gradient(op, [im_data, rois_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + +def test_deformable_convolution(): + + for num_batch in [1, 2]: + for 
num_channel_data, num_deformable_group in itertools.product([4, 8], [1, 2]): + for input_height, input_width in itertools.product([5, 6], [5, 6]): + for dilate in [(1, 1), (2, 2)]: + print num_batch, num_channel_data, num_deformable_group, (input_height, input_width), dilate + for grad_nodes in [['im_data'], ['offset_data']]: + output_height = input_height + output_width = input_width + im_data = np.random.rand(num_batch, num_channel_data, input_height, input_width) + offset_data = \ + np.random.rand(num_batch, num_deformable_group * 3 * 3 * 2, output_height, output_width)\ + * 0.8 + 0.1 + + weight = np.random.normal(0, 0.001, (num_channel_data, num_channel_data, 3, 3)) + bias = np.zeros(num_channel_data) + + im_data_var = mx.symbol.Variable(name="im_data") + offset_data_var = mx.symbol.Variable(name="offset_data") + weight_var = mx.symbol.Variable(name="weight") + bias_var = mx.symbol.Variable(name="bias") + op = mx.contrib.sym.DeformableConvolution(name='test_op', data=im_data_var, + offset=offset_data_var, + weight=weight_var, bias=bias_var, + num_filter=num_channel_data, pad=dilate, + kernel=(3, 3), stride=(1, 1), dilate=dilate, + num_deformable_group=num_deformable_group) + if grad_nodes[0] == 'offset_data': + # wider tolerance needed for coordinate differential + rtol, atol = 1.0, 1e-2 + else: + rtol, atol = 0.05, 1e-4 + print rtol, atol + check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + + +def test_deformable_psroipooling(): + + for num_rois in [1, 2]: + for num_classes, num_group in itertools.product([2, 3], [2, 3]): + for image_height, image_width in itertools.product([168, 224], [168, 224]): + print num_rois, num_classes, num_group, (image_height, image_width) + for grad_nodes in [['im_data'], ['offset_data']]: + spatial_scale = 0.0625 + feat_height = image_height * spatial_scale + feat_width = image_width * spatial_scale + im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) + rois_data = np.zeros([num_rois, 5]) + rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) + rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1)) + offset_data = np.random.rand(num_rois, 2*num_classes, num_group, num_group) * 0.1 + + im_data_var = mx.symbol.Variable(name="im_data") + rois_data_var = mx.symbol.Variable(name="rois_data") + offset_data_var = mx.symbol.Variable(name="offset_data") + op = mx.contrib.sym.DeformablePSROIPooling(data=im_data_var, rois=rois_data_var, + trans=offset_data_var, spatial_scale=spatial_scale, + sample_per_part=4, group_size=num_group, + pooled_size=num_group, output_dim=num_classes, + trans_std=0.1, no_trans=False, name='test_op') + if grad_nodes[0] == 'offset_data': + # wider tolerance needed for coordinate differential + rtol, atol = 1.0, 1e-2 + else: + rtol, atol = 1e-2, 1e-4 + print rtol, atol + check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + + + + + if __name__ == '__main__': test_custom_op() test_log_softmax() @@ -3178,3 +3277,7 @@ def create_operator(self, ctx, shapes, dtypes): test_quantization_op() test_relu() test_sigmoid() + test_psroipooling() + test_deformable_convolution() + test_deformable_psroipooling() + From ea06bfdfffaf6317a52b3f63928827def93eebb0 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 27 May 2017 18:53:51 +0800 Subject: [PATCH 11/18] remove redundant print --- tests/python/unittest/test_operator.py | 5 
----- 1 file changed, 5 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e30ff26397a4..994e08e9dd17 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3114,7 +3114,6 @@ def test_psroipooling(): for num_rois in [1, 2]: for num_classes, num_group in itertools.product([2, 3], [2, 3]): for image_height, image_width in itertools.product([168, 224], [168, 224]): - print num_rois, num_classes, num_group, (image_height, image_width) for grad_nodes in [['im_data']]: spatial_scale = 0.0625 feat_height = image_height * spatial_scale @@ -3139,7 +3138,6 @@ def test_deformable_convolution(): for num_channel_data, num_deformable_group in itertools.product([4, 8], [1, 2]): for input_height, input_width in itertools.product([5, 6], [5, 6]): for dilate in [(1, 1), (2, 2)]: - print num_batch, num_channel_data, num_deformable_group, (input_height, input_width), dilate for grad_nodes in [['im_data'], ['offset_data']]: output_height = input_height output_width = input_width @@ -3166,7 +3164,6 @@ def test_deformable_convolution(): rtol, atol = 1.0, 1e-2 else: rtol, atol = 0.05, 1e-4 - print rtol, atol check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, grad_nodes=grad_nodes, ctx=mx.gpu(0)) @@ -3176,7 +3173,6 @@ def test_deformable_psroipooling(): for num_rois in [1, 2]: for num_classes, num_group in itertools.product([2, 3], [2, 3]): for image_height, image_width in itertools.product([168, 224], [168, 224]): - print num_rois, num_classes, num_group, (image_height, image_width) for grad_nodes in [['im_data'], ['offset_data']]: spatial_scale = 0.0625 feat_height = image_height * spatial_scale @@ -3200,7 +3196,6 @@ def test_deformable_psroipooling(): rtol, atol = 1.0, 1e-2 else: rtol, atol = 1e-2, 1e-4 - print rtol, atol check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, grad_nodes=grad_nodes, ctx=mx.gpu(0)) From 5ed2600a31f50e3a924ec999f2b3ce22d52ee2a4 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 27 May 2017 19:37:16 +0800 Subject: [PATCH 12/18] fix unittest code --- tests/python/unittest/test_operator.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 994e08e9dd17..e12ab4526890 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3116,8 +3116,8 @@ def test_psroipooling(): for image_height, image_width in itertools.product([168, 224], [168, 224]): for grad_nodes in [['im_data']]: spatial_scale = 0.0625 - feat_height = image_height * spatial_scale - feat_width = image_width * spatial_scale + feat_height = np.int(image_height * spatial_scale) + feat_width = np.int(image_width * spatial_scale) im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) rois_data = np.zeros([num_rois, 5]) rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) @@ -3129,8 +3129,10 @@ def test_psroipooling(): group_size=num_group, pooled_size=num_group, output_dim=num_classes, name='test_op') rtol, atol = 1e-2, 1e-4 - check_numeric_gradient(op, [im_data, rois_data], rtol=rtol, atol=atol, - grad_nodes=grad_nodes, ctx=mx.gpu(0)) + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, rois_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) 
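# Illustrative sketch (not part of the patch): the PSROIPooling shape contract that the
# gradient check above exercises can be seen with symbolic shape inference alone, assuming
# the contrib operator is registered in the build. The ROI count becomes the output batch
# axis and the channel axis collapses from output_dim * group_size**2 to output_dim; the
# concrete sizes below are hypothetical.
import mxnet as mx
im_data = mx.symbol.Variable(name="im_data")      # (1, output_dim * group_size**2, h, w)
rois_data = mx.symbol.Variable(name="rois_data")  # (num_rois, 5): [batch_index, x1, y1, x2, y2]
psroi = mx.contrib.sym.PSROIPooling(data=im_data, rois=rois_data, spatial_scale=0.0625,
                                    group_size=2, pooled_size=2, output_dim=3,
                                    name='psroi_example')
_, out_shapes, _ = psroi.infer_shape(im_data=(1, 3 * 2 * 2, 14, 14), rois_data=(4, 5))
assert out_shapes[0] == (4, 3, 2, 2)  # (num_rois, output_dim, pooled_size, pooled_size)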
def test_deformable_convolution(): @@ -3164,8 +3166,10 @@ def test_deformable_convolution(): rtol, atol = 1.0, 1e-2 else: rtol, atol = 0.05, 1e-4 - check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, - grad_nodes=grad_nodes, ctx=mx.gpu(0)) + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) def test_deformable_psroipooling(): @@ -3175,8 +3179,8 @@ def test_deformable_psroipooling(): for image_height, image_width in itertools.product([168, 224], [168, 224]): for grad_nodes in [['im_data'], ['offset_data']]: spatial_scale = 0.0625 - feat_height = image_height * spatial_scale - feat_width = image_width * spatial_scale + feat_height = np.int(image_height * spatial_scale) + feat_width = np.int(image_width * spatial_scale) im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) rois_data = np.zeros([num_rois, 5]) rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) @@ -3196,8 +3200,10 @@ def test_deformable_psroipooling(): rtol, atol = 1.0, 1e-2 else: rtol, atol = 1e-2, 1e-4 - check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, - grad_nodes=grad_nodes, ctx=mx.gpu(0)) + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) From 5548f67c984044bbafba2e3fe40eafd87c0a5864 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 27 May 2017 20:45:55 +0800 Subject: [PATCH 13/18] dummy commit to trigger building check --- tests/python/unittest/test_operator.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e12ab4526890..d73728178300 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3135,7 +3135,6 @@ def test_psroipooling(): grad_nodes=grad_nodes, ctx=mx.gpu(0)) def test_deformable_convolution(): - for num_batch in [1, 2]: for num_channel_data, num_deformable_group in itertools.product([4, 8], [1, 2]): for input_height, input_width in itertools.product([5, 6], [5, 6]): @@ -3173,7 +3172,6 @@ def test_deformable_convolution(): def test_deformable_psroipooling(): - for num_rois in [1, 2]: for num_classes, num_group in itertools.product([2, 3], [2, 3]): for image_height, image_width in itertools.product([168, 224], [168, 224]): @@ -3206,9 +3204,6 @@ def test_deformable_psroipooling(): grad_nodes=grad_nodes, ctx=mx.gpu(0)) - - - if __name__ == '__main__': test_custom_op() test_log_softmax() From 2dd44db7db992eee104dbecbdd26b9d3e7780960 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 27 May 2017 21:52:47 +0800 Subject: [PATCH 14/18] dummy commit to trigger building check --- tests/python/unittest/test_operator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d73728178300..651c20d43b28 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3204,6 +3204,7 @@ def test_deformable_psroipooling(): grad_nodes=grad_nodes, ctx=mx.gpu(0)) + if __name__ == '__main__': test_custom_op() test_log_softmax() From fae6a13594302a9bde41f0729bbe0add54d0673a Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: 
Fri, 9 Jun 2017 10:13:11 +0800 Subject: [PATCH 15/18] Update deformable_convolution-inl.h --- src/operator/contrib/deformable_convolution-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index ba2707d64391..da979e707aee 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -269,7 +269,7 @@ class DeformableConvolutionOp : public Operator { col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; // input/output image size (#channels * height * width) input_dim_ = ishape.ProdShape(1, ishape.ndim()); - input_offset_dim_ = ishape.ProdShape(1, offset_shape.ndim()); + input_offset_dim_ = offset_shape.ProdShape(1, offset_shape.ndim()); output_dim_ = oshape.ProdShape(1, oshape.ndim()); num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; num_kernels_col2im_ = input_dim_; From 320998a17aea17719a9a22704505f4abdc73fe66 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 17 Jun 2017 22:29:34 +0800 Subject: [PATCH 16/18] Update deformable_im2col.h --- src/operator/contrib/nn/deformable_im2col.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/contrib/nn/deformable_im2col.h b/src/operator/contrib/nn/deformable_im2col.h index aeed47b8b1ba..9d6180034c71 100644 --- a/src/operator/contrib/nn/deformable_im2col.h +++ b/src/operator/contrib/nn/deformable_im2col.h @@ -91,7 +91,7 @@ inline void deformable_im2col(mshadow::Stream* s, const TShape& pad, const TShape& stride, const TShape& dilation, const uint32_t deformable_group, DType* data_col) { if (2 == kernel_shape.ndim()) { - LOG(FATAL) << "not implemented"; + LOG(FATAL) << "only implemented in GPU"; } else { LOG(FATAL) << "not implemented"; } @@ -120,7 +120,7 @@ inline void deformable_col2im(mshadow::Stream* s, const TShape& dilation, const uint32_t deformable_group, DType* grad_im, OpReqType req) { index_t num_spatial_axes = kernel_shape.ndim(); - LOG(FATAL) << "not implemented"; + LOG(FATAL) << "only implemented in GPU"; } @@ -146,7 +146,7 @@ inline void deformable_col2im_coord(mshadow::Stream* s, const TShape& col_shape, const TShape& kernel_shape, const TShape& pad, const TShape& stride, const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) { - LOG(FATAL) << "not implemented"; + LOG(FATAL) << "only implemented in GPU"; } } // namespace op From 3d188c2de556fd1d785bed6da4a25833c6d4a6c4 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 17 Jun 2017 22:39:22 +0800 Subject: [PATCH 17/18] Update deformable_convolution.cc --- .../contrib/deformable_convolution.cc | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/operator/contrib/deformable_convolution.cc b/src/operator/contrib/deformable_convolution.cc index a5916a578304..5af91a0aa407 100644 --- a/src/operator/contrib/deformable_convolution.cc +++ b/src/operator/contrib/deformable_convolution.cc @@ -36,21 +36,14 @@ Operator *DeformableConvolutionProp::CreateOperatorEx(Context ctx, } MXNET_REGISTER_OP_PROPERTY(_contrib_DeformableConvolution, DeformableConvolutionProp) -.describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. +.describe(R"code(Compute 2-D deformable convolution on 4-D input. 
-In the 2-D convolution, given input data with shape *(batch_size, -channel, height, width)*, the output is computed by +The deformable convolution operation is described in https://arxiv.org/abs/1703.06211 -.. math:: - - out[n,i,:,:] = bias[i] + \sum_{j=0}^{num\_filter} data[n,j,:,:] \star - weight[i,j,:,:] - -where :math:`\star` is the 2-D cross-correlation operator. - -For general 2-D convolution, the shapes are +For 2-D deformable convolution, the shapes are - **data**: *(batch_size, channel, height, width)* +- **offset**: *(batch_size, num_deformable_group * kernel[0] * kernel[1] * 2, out_height, out_width)* - **weight**: *(num_filter, channel, kernel[0], kernel[1])* - **bias**: *(num_filter,)* - **out**: *(batch_size, num_filter, out_height, out_width)*. @@ -67,7 +60,7 @@ then we have:: If ``no_bias`` is set to be true, then the ``bias`` term is ignored. The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height, -width)*. We can choose other layouts such as *NHWC*. +width)*. If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` evenly into *g* parts along the channel axis, and also evenly split ``weight`` @@ -75,6 +68,13 @@ along the first dimension. Next compute the convolution on the *i*-th part of the data with the *i*-th weight part. The output is obtained by concatenating all the *g* results. +If ``num_deformable_group`` is larger than 1, denoted by *dg*, then split the +input ``offset`` evenly into *dg* parts along the channel axis, and also evenly +split ``out`` into *dg* parts along the channel axis. Next compute the +deformable convolution, applying the *i*-th part of the offset on the *i*-th +part of the output. + + Both ``weight`` and ``bias`` are learnable parameters. From b8da6adc25e9ded1aa807a20774f4706ceb09f47 Mon Sep 17 00:00:00 2001 From: Yuwen Xiong Date: Sat, 17 Jun 2017 22:44:21 +0800 Subject: [PATCH 18/18] Update deformable_psroi_pooling.cc --- src/operator/contrib/deformable_psroi_pooling.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/operator/contrib/deformable_psroi_pooling.cc b/src/operator/contrib/deformable_psroi_pooling.cc index 7dce9be9b866..290bad2a76cd 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cc +++ b/src/operator/contrib/deformable_psroi_pooling.cc @@ -82,10 +82,9 @@ namespace op { DMLC_REGISTER_PARAMETER(DeformablePSROIPoolingParam); MXNET_REGISTER_OP_PROPERTY(_contrib_DeformablePSROIPooling, DeformablePSROIPoolingProp) - .describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " - "spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " - "by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " - "the number of region bounding boxes after DeformablePSROIPooling") + .describe("Performs deformable position-sensitive region-of-interest pooling on inputs. " + "The DeformablePSROIPooling operation is described in https://arxiv.org/abs/1703.06211. " + "batch_size will change to the number of region bounding boxes after DeformablePSROIPooling") .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and bottom right corners 
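As a rough usage sketch of the two operators documented above (illustrative only, not part of the patch; it assumes the contrib operators are registered in the build, and the parameter values are made up for the example), the symbols compose the same way the unit tests construct them:

import mxnet as mx

# Deformable convolution: the offset input carries 2 * kernel_h * kernel_w values per
# deformable group at each output location (an x/y offset for every sampling point).
data = mx.symbol.Variable('data')        # (batch, channel, height, width)
offset = mx.symbol.Variable('offset')    # (batch, num_deformable_group * 3 * 3 * 2, out_h, out_w)
dconv = mx.contrib.sym.DeformableConvolution(data=data, offset=offset, num_filter=36,
                                              kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                                              dilate=(1, 1), num_deformable_group=1,
                                              name='dconv_example')

# Deformable PS-ROI pooling: data carries output_dim * group_size**2 channels (36 here),
# rois holds [batch_index, x1, y1, x2, y2] rows, and trans holds the per-part offsets.
rois = mx.symbol.Variable('rois')         # (num_rois, 5)
trans = mx.symbol.Variable('trans')       # (num_rois, 2 * output_dim, pooled_size, pooled_size), as in the unit tests
dpool = mx.contrib.sym.DeformablePSROIPooling(data=dconv, rois=rois, trans=trans,
                                              spatial_scale=0.0625, sample_per_part=4,
                                              group_size=3, pooled_size=3, output_dim=4,
                                              trans_std=0.1, no_trans=False,
                                              name='dpool_example')
# The pooled output has shape (num_rois, output_dim, pooled_size, pooled_size).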