diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index dbbb73b1361a..843ad8a056fc 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -137,3 +137,8 @@ List of Contributors * [Roshani Nagmote](https://github.com/Roshrini) * [Chetan Khatri](https://github.com/chetkhatri/) * [James Liu](https://github.com/jamesliu/) +* [Yuwen Xiong](https://github.com/Orpine/) +* [Haozhi Qi](https://github.com/Oh233/) +* [Yi Li](https://github.com/liyi14/) +* [Guodong Zhang](https://github.com/gd-zhang/) +* [Xizhou Zhu](https://github.com/einsiedler0408/) diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h new file mode 100644 index 000000000000..da979e707aee --- /dev/null +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -0,0 +1,488 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution-inl.h + * \brief + * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../nn/im2col.h" +#include "./nn/deformable_im2col.h" + + +namespace mxnet { +namespace op { + +namespace conv { + enum DeformableConvolutionOpInputs { kData, kOffset, kWeight, kBias }; + enum DeformableConvolutionOpOutputs { kOut }; + enum DeformableConvolutionOpResource { kTempSpace }; +} + +struct DeformableConvolutionParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape dilate; + TShape pad; + uint32_t num_filter; + uint32_t num_group; + uint32_t num_deformable_group; + uint64_t workspace; + bool no_bias; + dmlc::optional layout; + DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { + DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(stride).set_default(TShape()) + .describe("convolution stride: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(dilate).set_default(TShape()) + .describe("convolution dilate: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(pad).set_default(TShape()) + .describe("pad for convolution: (h, w) or (d, h, w)"); + DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) + .describe("convolution filter(channel) number"); + DMLC_DECLARE_FIELD(num_group).set_default(1) + .describe("Number of group partitions."); + DMLC_DECLARE_FIELD(num_deformable_group).set_default(1) + .describe("Number of deformable group partitions."); + DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) + .describe("Maximum temperal workspace allowed for convolution (MB)."); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(layout) + .add_enum("NCW", mshadow::kNCW) + .add_enum("NCHW", mshadow::kNCHW) + .add_enum("NCDHW", mshadow::kNCDHW) + .set_default(dmlc::optional()) + .describe("Set layout for input, output and weight. Empty for\n " + "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); + } +}; + +template +class DeformableConvolutionOp : public Operator { + public: + explicit DeformableConvolutionOp(DeformableConvolutionParam p) { + this->param_ = p; + // convert MBytes first to Bytes and then to elements. 
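+    // For example, with the default workspace of 1024 MB and DType = float
+    // (4 bytes), this gives (1024 << 20) / 4 = 268435456 elements.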
+ param_.workspace = (param_.workspace << 20) / sizeof(DType); + CHECK(param_.layout.value() == mshadow::kNCW || + param_.layout.value() == mshadow::kNCHW || + param_.layout.value() == mshadow::kNCDHW) + << "Only support NCW, NCHW and NCDHW layout"; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[conv::kOut], kWriteTo); + size_t expected = param_.no_bias ? 3 : 4; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + LayerSetUp(in_data[conv::kData].shape_, + in_data[conv::kOffset].shape_, + out_data[conv::kOut].shape_); + Stream* s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_data[0].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + index_t M = conv_out_channels_ / group_; + index_t N = conv_out_spatial_dim_; + index_t K = kernel_dim_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, M, K), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, K, N), s); + Tensor output_4d = out_data[conv::kOut].get_with_shape( + Shape4(num_, group_, M, N), s); + for (index_t n = 0; n < num_; ++n) { + // transform image to col_buffer in order to use gemm + deformable_im2col(s, in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + param_.num_deformable_group, col_buffer.dptr()); + Tensor output_3d = output_4d[n]; + for (index_t g = 0; g < group_; ++g) { + ASSIGN_DISPATCH(output_3d[g], req[conv::kOut], dot(weight_3d[g], col_buffer_3d[g])); + } + } + if (bias_term_) { + Tensor bias = in_data[conv::kBias].get(s); + Tensor output_3d = out_data[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + // has bias term, broadcast it to the same shape of output_3d in channel dim + output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + size_t expected = param_.no_bias == 0 ? 
4 : 3; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); + LayerSetUp(in_grad[conv::kData].shape_, + in_grad[conv::kOffset].shape_, + out_grad[conv::kOut].shape_); + Stream *s = ctx.get_stream(); + // allocate workspace for col_buffer + Tensor workspace = ctx.requested[conv::kTempSpace] + .get_space_typed(Shape1(col_buffer_size_), s); + // calculate the shape of col_buffer + TShape col_buffer_shape(num_spatial_axes_ + 1); + col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); + for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; + } + // create a column buffer using workspace and col_buffer_shape + TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType::kFlag); + + // initialize weight and col_buffer 3D tensors for using gemm + // For computing dLoss/d(in_data[kData]) + index_t M = kernel_dim_; + index_t N = conv_out_spatial_dim_; + index_t K = conv_out_channels_ / group_; + Tensor weight_3d = in_data[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + Tensor out_grad_4d = out_grad[conv::kOut].get_with_shape( + Shape4(num_, group_, K, N), s); + Tensor col_buffer_3d = col_buffer.get_with_shape( + Shape3(group_, M, N), s); + // For computing dLoss/dWeight + Tensor dweight_3d = in_grad[conv::kWeight].get_with_shape( + Shape3(group_, K, M), s); + + Tensor data_grad = in_grad[conv::kData].FlatTo1D(s); + data_grad = 0; + + + for (index_t n = 0; n < num_; ++n) { + Tensor out_grad_3d = out_grad_4d[n]; + for (index_t g = 0; g < group_; ++g) { + col_buffer_3d[g] = dot(weight_3d[g].T(), out_grad_3d[g]); + } + + // gradient w.r.t. input coordinate data + deformable_col2im_coord(s, col_buffer.dptr(), + in_data[conv::kData].dptr() + n*input_dim_, + in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kOffset].dptr() + n*input_offset_dim_, + req[conv::kData]); + + // gradient w.r.t. input data + deformable_col2im(s, col_buffer.dptr(), + in_data[conv::kOffset].dptr() + n*input_offset_dim_, + in_grad[conv::kData].shape_, col_buffer.shape_, + param_.kernel, param_.pad, param_.stride, param_.dilate, param_.num_deformable_group, + in_grad[conv::kData].dptr() + n*input_dim_, + req[conv::kData]); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group + im2col(s, in_data[conv::kData].dptr() + n*input_dim_, in_data[conv::kData].shape_, + col_buffer.shape_, param_.kernel, param_.pad, param_.stride, param_.dilate, + col_buffer.dptr()); + for (index_t g = 0; g < group_; ++g) { + if (0 == n) { + ASSIGN_DISPATCH(dweight_3d[g], req[conv::kWeight], + dot(out_grad_3d[g], col_buffer_3d[g].T())); + } else { + dweight_3d[g] += dot(out_grad_3d[g], col_buffer_3d[g].T()); + } + } + } + + // gradient w.r.t bias + if (bias_term_) { + Tensor dbias = in_grad[conv::kBias].get(s); + Tensor dout = out_grad[conv::kOut].get_with_shape( + Shape3(num_, conv_out_channels_, conv_out_spatial_dim_), s); + ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout)); + } + } + + private: + void LayerSetUp(const TShape& ishape, const TShape& offset_shape, const TShape& oshape) { + channel_axis_ = 1; // hard code channel axis + const index_t first_spatial_axis = channel_axis_ + 1; + const index_t num_axes = param_.kernel.ndim() + 2; + num_spatial_axes_ = num_axes - first_spatial_axis; + is_1x1_ = true; + for (index_t i = 0; i < param_.kernel.ndim(); ++i) { + is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; + if (!is_1x1_) break; + } + + // batch size + num_ = ishape[0]; + // number of input channels + channels_ = ishape[1]; + group_ = param_.num_group; + conv_out_channels_ = param_.num_filter; + conv_in_channels_ = channels_; + bias_term_ = !param_.no_bias; + kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size(); + weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; + conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim()); + col_offset_ = kernel_dim_ * conv_out_spatial_dim_; + output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; + // size of the column buffer used for storing im2col-ed pixels + col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; + // input/output image size (#channels * height * width) + input_dim_ = ishape.ProdShape(1, ishape.ndim()); + input_offset_dim_ = offset_shape.ProdShape(1, offset_shape.ndim()); + output_dim_ = oshape.ProdShape(1, oshape.ndim()); + num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; + num_kernels_col2im_ = input_dim_; + } + + private: + DeformableConvolutionParam param_; + index_t channel_axis_; // channel axis of the input + index_t channels_; // number of channels of input image + index_t num_spatial_axes_; // number of spatial axes + index_t num_; // batch size + index_t group_; // number of groups + index_t conv_out_channels_; // number of output channels (num_filter) + index_t conv_out_spatial_dim_; // number of pixels of output images per channel + index_t conv_in_channels_; // number of input channels + index_t kernel_dim_; // number of input channels per group * kernel size + index_t weight_offset_; // number of output channels per group * kernel_dim_ + index_t col_offset_; + index_t output_offset_; + index_t col_buffer_size_; + index_t input_dim_; + index_t input_offset_dim_; + index_t output_dim_; + index_t num_kernels_im2col_; + index_t num_kernels_col2im_; + bool bias_term_; // has bias term? 
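+  // Illustrative example of the bookkeeping above: for data of shape
+  // (1, 64, 32, 32), a 3x3 kernel, pad 1, stride 1, dilate 1, num_filter = 128
+  // and num_group = 1, the output is (1, 128, 32, 32), so
+  //   kernel_dim_           = 64 * 3 * 3     = 576
+  //   conv_out_spatial_dim_ = 32 * 32        = 1024
+  //   col_buffer_size_      = 576 * 1 * 1024 = 589824 elements,
+  // and each forward GEMM multiplies a (128 x 576) weight slice by a
+  // (576 x 1024) column buffer to give a (128 x 1024) output slice.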
+ bool is_1x1_; +}; // class ConvolutionOp + +template +Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); + +#if DMLC_USE_CXX11 +class DeformableConvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return{ "data", "offset", "weight", "bias" }; + } else { + return{ "data", "offset", "weight" }; + } + } + + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + LOG(FATAL) << "not implemented"; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 4U) << "Input:[data, offset, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, offset, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + const TShape &oshp = (*in_shape)[conv::kOffset]; + if (dshp.ndim() == 0) return false; + if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + CHECK_EQ(oshp.ndim(), 4U) \ + << "Input offset should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \ + << "input num_filter must divide deformable group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = (dshape[2] + 2 * param_.pad[0] - + (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; + oshape[3] = (dshape[3] + 2 * param_.pad[1] - + (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + CHECK_EQ(oshape[1] % param_.num_deformable_group, 0U) \ + << "output num_filter must divide deformable group size"; + CHECK_EQ(oshape[2], offsetshape[2]) \ + << "output height must equal to offset map height"; + 
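+      // For example, with a 3x3 kernel and num_deformable_group = 1, the
+      // offset input must have 2 * 1 * 3 * 3 = 18 channels and the same
+      // spatial size as the output, i.e. (batch_size, 18, out_height, out_width).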
CHECK_EQ(oshape[3], offsetshape[3]) \ + << "output width must equal to offset map width"; + CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \ + << "offset filter must divide deformable group size"; + CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), \ + param_.num_deformable_group) \ + << "offset filter must divide deformable group size"; + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (param_.stride[0] == 1) { + dshape[2] = oshape[2] + param_.dilate[0] * (ksize_y - 1) - 2 * param_.pad[0]; + } + if (param_.stride[1] == 1) { + dshape[3] = oshape[3] + param_.dilate[1] * (ksize_x - 1) - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(ksize_y, dshape[2] + 2 * param_.pad[0]) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(ksize_x, dshape[3] + 2 * param_.pad[1]) << "kernel size exceed input"; + } + return true; + } else { + LOG(FATAL) << "not implemented"; + return false; + } + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DeformableConvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "_contrib_DeformableConvolution"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return{ out_grad[conv::kOut], in_data[conv::kData], + in_data[conv::kOffset], in_data[conv::kWeight] }; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return{ ResourceRequest::kTempSpace }; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + DeformableConvolutionParam param_; +}; // class ConvolutionProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_INL_H_ diff --git a/src/operator/contrib/deformable_convolution.cc b/src/operator/contrib/deformable_convolution.cc new file mode 100644 index 000000000000..5af91a0aa407 --- /dev/null +++ b/src/operator/contrib/deformable_convolution.cc @@ -0,0 +1,89 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution.cc + * \brief + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ + +#include "./deformable_convolution-inl.h" + +namespace mxnet { +namespace op { +DMLC_REGISTER_PARAMETER(DeformableConvolutionParam); + +template<> +Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformableConvolutionOp(param); + }) + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *DeformableConvolutionProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); +} + +MXNET_REGISTER_OP_PROPERTY(_contrib_DeformableConvolution, DeformableConvolutionProp) +.describe(R"code(Compute 2-D deformable convolution on 4-D input. + +The deformable convolution operation is described in https://arxiv.org/abs/1703.06211 + +For 2-D deformable convolution, the shapes are + +- **data**: *(batch_size, channel, height, width)* +- **offset**: *(batch_size, num_deformable_group * kernel[0] * kernel[1], height, width)* +- **weight**: *(num_filter, channel, kernel[0], kernel[1])* +- **bias**: *(num_filter,)* +- **out**: *(batch_size, num_filter, out_height, out_width)*. + +Define:: + + f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1 + +then we have:: + + out_height=f(height, kernel[0], pad[0], stride[0], dilate[0]) + out_width=f(width, kernel[1], pad[1], stride[1], dilate[1]) + +If ``no_bias`` is set to be true, then the ``bias`` term is ignored. + +The default data ``layout`` is *NCHW*, namely *(batch_size, channle, height, +width)*. + +If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` +evenly into *g* parts along the channel axis, and also evenly split ``weight`` +along the first dimension. Next compute the convolution on the *i*-th part of +the data with the *i*-th weight part. The output is obtained by concating all +the *g* results. + +If ``num_deformable_group`` is larger than 1, denoted by *dg*, then split the +input ``offset`` evenly into *dg* parts along the channel axis, and also evenly +split ``out`` evenly into *dg* parts along the channel axis. Next compute the +deformable convolution, apply the *i*-th part of the offset part on the *i*-th +out. + + +Both ``weight`` and ``bias`` are learnable parameters. + + +)code" ADD_FILELINE) +.add_argument("data", "NDArray-or-Symbol", "Input data to the DeformableConvolutionOp.") +.add_argument("offset", "NDArray-or-Symbol", "Input offset to the DeformableConvolutionOp.") +.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") +.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") +.add_arguments(DeformableConvolutionParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/deformable_convolution.cu b/src/operator/contrib/deformable_convolution.cu new file mode 100644 index 000000000000..f690cc1ce24c --- /dev/null +++ b/src/operator/contrib/deformable_convolution.cu @@ -0,0 +1,29 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_convolution.cu + * \brief + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai +*/ + +#include "./deformable_convolution-inl.h" +#include + +namespace mxnet { +namespace op { + + template<> + Operator* CreateOp(DeformableConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformableConvolutionOp(param); + }) + return op; + } + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/contrib/deformable_psroi_pooling-inl.h b/src/operator/contrib/deformable_psroi_pooling-inl.h new file mode 100644 index 000000000000..16a98f76bcff --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling-inl.h @@ -0,0 +1,286 @@ +/*! +* Copyright (c) 2017 Microsoft +* Licensed under The Apache-2.0 License [see LICENSE for details] +* \file deformable_psroi_pooling-inl.h +* \brief deformable psroi pooling operator and symbol +* \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "../mshadow_op.h" +#include "../operator_common.h" + + +namespace mxnet { +namespace op { + + // Declare enumeration of input order to make code more intuitive. + // These enums are only visible within this header +namespace deformablepsroipool { + enum DeformablePSROIPoolingOpInputs { kData, kBox, kTrans }; + enum DeformablePSROIPoolingOpOutputs { kOut, kTopCount }; +} // deformablepsroipool + +struct DeformablePSROIPoolingParam : public dmlc::Parameter { + // TShape pooled_size; + float spatial_scale; + int output_dim; + int group_size; + int pooled_size; + int part_size; + int sample_per_part; + float trans_std; + bool no_trans; + DMLC_DECLARE_PARAMETER(DeformablePSROIPoolingParam) { + DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) + .describe("Ratio of input feature map height (or w) to raw image height (or w). " + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(group_size).describe("fix group size"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(part_size).set_default(0).describe("fix part size"); + DMLC_DECLARE_FIELD(sample_per_part).set_default(1).describe("fix samples per part"); + DMLC_DECLARE_FIELD(trans_std).set_default(0.0).set_range(0.0, 1.0) + .describe("fix transition std"); + DMLC_DECLARE_FIELD(no_trans).set_default(false) + .describe("Whether to disable trans parameter."); + } +}; + +template +class DeformablePSROIPoolingOp : public Operator { + public: + explicit DeformablePSROIPoolingOp(DeformablePSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans? 
2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_data[deformablepsroipool::kOut].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor out = out_data[deformablepsroipool::kOut].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount] + .get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + out = -FLT_MAX; + top_count = 0.0f; + + Tensor trans; + if (!param_.no_trans) { + trans = in_data[deformablepsroipool::kTrans].get(s); + } + DeformablePSROIPoolForward(out, data, bbox, trans, top_count, param_.no_trans, + param_.spatial_scale, param_.output_dim, param_.group_size, param_.pooled_size, + param_.part_size, param_.sample_per_part, param_.trans_std); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.no_trans ? 2 : 3; + size_t out_expected = 2; + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(out_grad[deformablepsroipool::kOut].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_EQ(out_data[deformablepsroipool::kTopCount].shape_[0], + in_data[deformablepsroipool::kBox].shape_[0]); + CHECK_NE(req[deformablepsroipool::kData], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[deformablepsroipool::kBox], kWriteInplace) << + "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + // CHECK_NE(req[deformablepsroipool::kTrans], kWriteInplace) << + // "DeformablePSROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[deformablepsroipool::kOut].get(s); + Tensor data = in_data[deformablepsroipool::kData].get(s); + Tensor bbox = in_data[deformablepsroipool::kBox].get(s); + Tensor top_count = out_data[deformablepsroipool::kTopCount] + .get(s); + Tensor grad_in = in_grad[deformablepsroipool::kData].get(s); + Tensor grad_roi = in_grad[deformablepsroipool::kBox].get(s); + Tensor grad_trans; + Tensor trans; + if (!param_.no_trans) { + CHECK_EQ(in_grad.size(), 3); + trans = in_data[deformablepsroipool::kTrans].get(s); + grad_trans = in_grad[deformablepsroipool::kTrans].get(s); + } + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(top_count.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + Assign(grad_in, req[deformablepsroipool::kData], 0); + if (!param_.no_trans) { + Assign(grad_trans, req[deformablepsroipool::kTrans], 0); + } + DeformablePSROIPoolBackwardAcc(grad_in, grad_trans, grad_out, data, bbox, trans, + top_count, param_.no_trans, param_.spatial_scale, param_.output_dim, param_.group_size, + param_.pooled_size, param_.part_size, param_.sample_per_part, param_.trans_std); + Assign(grad_roi, req[deformablepsroipool::kBox], 0); + } + + private: + DeformablePSROIPoolingParam 
param_; +}; // class DeformablePSROIPoolingOp + +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype); + +#if DMLC_USE_CXX11 +class DeformablePSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.no_trans) { + return{ "data", "rois" }; + } else { + return{ "data", "rois", "trans" }; + } + } + + std::vector ListOutputs() const override { + return{ "output", "top_count" }; + } + + int NumOutputs() const override { + return 2; + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.part_size == 0) { + param_.part_size = param_.pooled_size; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.no_trans) { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, rois, trans]"; + // trans: [num_rois, 2, pooled_h, pooled_w] + TShape tshape = in_shape->at(deformablepsroipool::kTrans); + CHECK_EQ(tshape.ndim(), 4) << "trans should be a 4D tensor of shape"; + } + + // data: [batch_size, c, h, w] + TShape dshape = in_shape->at(deformablepsroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(deformablepsroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + // top_count: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + DeformablePSROIPoolingProp* deformable_psroi_pooling_sym = new DeformablePSROIPoolingProp(); + deformable_psroi_pooling_sym->param_ = this->param_; + return deformable_psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_DeformablePSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.no_trans) { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], + in_data[deformablepsroipool::kBox], out_data[deformablepsroipool::kTopCount] }; + } else { + return{ out_grad[deformablepsroipool::kOut], in_data[deformablepsroipool::kData], + in_data[deformablepsroipool::kBox], in_data[deformablepsroipool::kTrans], + out_data[deformablepsroipool::kTopCount] }; + } + } + + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* 
CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + DeformablePSROIPoolingParam param_; +}; // class DeformablePSROIPoolingProp +#endif +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_DEFORMABLE_PSROI_POOLING_INL_H_ diff --git a/src/operator/contrib/deformable_psroi_pooling.cc b/src/operator/contrib/deformable_psroi_pooling.cc new file mode 100644 index 000000000000..290bad2a76cd --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling.cc @@ -0,0 +1,96 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_psroi_pooling.cc + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./deformable_psroi_pooling-inl.h" +#include +#include +#include +#include +#include + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace mshadow { + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // NOT_IMPLEMENTED; + return; + } + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // NOT_IMPLEMENTED; + return; + } +} // namespace mshadow + +namespace mxnet { +namespace op { + + template<> + Operator *CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } + + Operator *DeformablePSROIPoolingProp::CreateOperatorEx( + Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); + } + + DMLC_REGISTER_PARAMETER(DeformablePSROIPoolingParam); + + MXNET_REGISTER_OP_PROPERTY(_contrib_DeformablePSROIPooling, DeformablePSROIPoolingProp) + .describe("Performs deformable position-sensitive region-of-interest pooling on inputs." + "The DeformablePSROIPooling operation is described in https://arxiv.org/abs/1703.06211." + "batch_size will change to the number of region bounding boxes after DeformablePSROIPooling") + .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") + .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " + "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " + "of designated region of interest. 
batch_index indicates the index of corresponding image " + "in the input data") + .add_argument("trans", "Symbol", "transition parameter") + .add_arguments(DeformablePSROIPoolingParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu new file mode 100644 index 000000000000..f9eb01a26e38 --- /dev/null +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -0,0 +1,415 @@ +/*! + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +#include "./deformable_psroi_pooling-inl.h" +#include +#include +#include +#include +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" + +#define DeformablePSROIPOOLING_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) +#define CUDA_KERNEL_LOOP(i, n) \ +for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +namespace mshadow { +namespace cuda { + template + __device__ DType bilinear_interp( + const DType* data, + const DType x, + const DType y, + const int width, + const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + DType dist_x = static_cast(x - x1); + DType dist_y = static_cast(y - y1); + DType value11 = data[y1*width + x1]; + DType value12 = data[y2*width + x1]; + DType value21 = data[y1*width + x2]; + DType value22 = data[y2*width + x2]; + DType value = (1 - dist_x)*(1 - dist_y)*value11 + (1 - dist_x)*dist_y*value12 + + dist_x*(1 - dist_y)*value21 + dist_x*dist_y*value22; + return value; + } + + template + __global__ void DeformablePSROIPoolForwardKernel( + const int count, + const DType* bottom_data, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const DType* bottom_rois, const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + DType* top_data, + DType* top_count) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w] * trans_std; + DType trans_y = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + DType sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + DType val = bilinear_interp(offset_bottom_data + c*height*width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } + } + + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolForward"; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *top_data = out.dptr_; + DType *top_count_data = top_count.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans.size(1) / 2; + const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(out.stream_); + DeformablePSROIPoolForwardKernel << > >( + count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } + + + template + __global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const DType* top_diff, + const DType* top_count, + const int num_rois, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + DType* bottom_data_diff, DType* bottom_trans_diff, + const DType* bottom_data, + const DType* bottom_rois, + const DType* bottom_trans, + const bool no_trans, + const DType trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor(static_cast(ph) / pooled_height*part_size); + int part_w = floor(static_cast(pw) / pooled_width*part_size); + int class_id = ctop / channels_each_class; + DType trans_x = no_trans ? static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w] * trans_std; + DType trans_y = no_trans ? 
static_cast(0) : + bottom_trans[(((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w] * trans_std; + + DType wstart = static_cast(pw)* bin_size_w + + roi_start_w; + wstart += trans_x * roi_width; + DType hstart = static_cast(ph) * bin_size_h + + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + DType diff_val = top_diff[index] / top_count[index]; + const DType* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + DType* offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + DType w = wstart + iw*sub_bin_size_w; + DType h = hstart + ih*sub_bin_size_h; + // bilinear interpolation + if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop*group_size + gh)*group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + DType dist_x = w - x0, dist_y = h - y0; + DType q00 = (1 - dist_x)*(1 - dist_y); + DType q01 = (1 - dist_x)*dist_y; + DType q10 = dist_x*(1 - dist_y); + DType q11 = dist_x*dist_y; + int bottom_index_base = c * height *width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x0, q00*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x0, q01*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x1, q10*diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x1, q11*diff_val); + + if (no_trans) { + continue; + } + DType U00 = offset_bottom_data[bottom_index_base + y0*width + x0]; + DType U01 = offset_bottom_data[bottom_index_base + y1*width + x0]; + DType U10 = offset_bottom_data[bottom_index_base + y0*width + x1]; + DType U11 = offset_bottom_data[bottom_index_base + y1*width + x1]; + DType diff_x = (U11*dist_y + U10*(1 - dist_y) - U01*dist_y - U00*(1 - dist_y)) + *trans_std*diff_val; + diff_x *= roi_width; + DType diff_y = (U11*dist_x + U01*(1 - dist_x) - U10*dist_x - U00*(1 - dist_x)) + *trans_std*diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) + * part_size + part_h) + * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) + * part_size + part_h) + * part_size + part_w, diff_y); + } + } + } + } + + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const DType *top_diff = out_grad.dptr_; + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + const DType *bottom_trans = no_trans ? NULL : trans.dptr_; + DType *bottom_data_diff = in_grad.dptr_; + DType *bottom_trans_diff = no_trans ? 
NULL : trans_grad.dptr_; + const DType *top_count_data = top_count.dptr_; + const int count = out_grad.shape_.Size(); + const int num_rois = bbox.size(0); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int num_classes = no_trans ? 1 : trans_grad.size(1) / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + DeformablePSROIPoolBackwardAccKernel << > >( + count, top_diff, top_count_data, num_rois, spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); + } + +} // namespace cuda + + template + inline void DeformablePSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + cuda::DeformablePSROIPoolForward(out, data, bbox, trans, top_count, no_trans, spatial_scale, + output_dim, group_size, pooled_size, part_size, sample_per_part, trans_std); + } + + template + inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &trans_grad, + const Tensor &out_grad, + const Tensor &data, + const Tensor &bbox, + const Tensor &trans, + const Tensor &top_count, + const bool no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) { + cuda::DeformablePSROIPoolBackwardAcc(in_grad, trans_grad, out_grad, data, bbox, trans, + top_count, no_trans, spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); + } + +} // namespace mshadow + + +namespace mxnet { +namespace op { + + template<> + Operator* CreateOp(DeformablePSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeformablePSROIPoolingOp(param); + }); + return op; + } + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h new file mode 100644 index 000000000000..48f3535b5eab --- /dev/null +++ b/src/operator/contrib/multi_proposal-inl.h @@ -0,0 +1,301 @@ +/*! + * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file multi_proposal-inl.h + * \brief MultiProposal Operator + * \author Piotr Teterwak, Bing Xu, Jian Guo, Xizhou Zhu +*/ +#ifndef MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_ +#define MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" + +// extend NumericalParam +namespace mxnet { +namespace op { + +/*! 
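+* Example: the string "(0.5, 1, 2)" is parsed by operator>> below into
+* info = {0.5, 1, 2}; operator<< prints a one-element tuple as "(2,)",
+* mirroring Python tuple syntax.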
+* \brief structure for numerical tuple input +* \tparam VType data type of param +*/ +template +struct NumericalParam { + NumericalParam() {} + explicit NumericalParam(VType *begin, VType *end) { + int32_t size = static_cast(end - begin); + info.resize(size); + for (int i = 0; i < size; ++i) { + info[i] = *(begin + i); + } + } + inline size_t ndim() const { + return info.size(); + } + std::vector info; +}; + +template +inline std::istream &operator>>(std::istream &is, NumericalParam ¶m) { + while (true) { + char ch = is.get(); + if (ch == '(') break; + if (!isspace(ch)) { + is.setstate(std::ios::failbit); + return is; + } + } + VType idx; + std::vector tmp; + // deal with empty case + size_t pos = is.tellg(); + char ch = is.get(); + if (ch == ')') { + param.info = tmp; + return is; + } + is.seekg(pos); + // finish deal + while (is >> idx) { + tmp.push_back(idx); + char ch; + do { + ch = is.get(); + } while (isspace(ch)); + if (ch == ',') { + while (true) { + ch = is.peek(); + if (isspace(ch)) { + is.get(); continue; + } + if (ch == ')') { + is.get(); break; + } + break; + } + if (ch == ')') break; + } else if (ch == ')') { + break; + } else { + is.setstate(std::ios::failbit); + return is; + } + } + param.info = tmp; + return is; +} + +template +inline std::ostream &operator<<(std::ostream &os, const NumericalParam ¶m) { + os << '('; + for (index_t i = 0; i < param.info.size(); ++i) { + if (i != 0) os << ','; + os << param.info[i]; + } + // python style tuple + if (param.info.size() == 1) os << ','; + os << ')'; + return os; +} + +} // namespace op +} // namespace mxnet + +namespace mxnet { +namespace op { + +namespace proposal { +enum MultiProposalOpInputs {kClsProb, kBBoxPred, kImInfo}; +enum MultiProposalOpOutputs {kOut, kScore}; +enum MultiProposalForwardResource {kTempResource}; +} // proposal + +struct MultiProposalParam : public dmlc::Parameter { + int rpn_pre_nms_top_n; + int rpn_post_nms_top_n; + float threshold; + int rpn_min_size; + NumericalParam scales; + NumericalParam ratios; + int feature_stride; + bool output_score; + bool iou_loss; + DMLC_DECLARE_PARAMETER(MultiProposalParam) { + float tmp[] = {0, 0, 0, 0}; + DMLC_DECLARE_FIELD(rpn_pre_nms_top_n).set_default(6000) + .describe("Number of top scoring boxes to keep after applying NMS to RPN proposals"); + DMLC_DECLARE_FIELD(rpn_post_nms_top_n).set_default(300) + .describe("Overlap threshold used for non-maximum" + "suppresion(suppress boxes with IoU >= this threshold"); + DMLC_DECLARE_FIELD(threshold).set_default(0.7) + .describe("NMS value, below which to suppress."); + DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) + .describe("Minimum height or width in proposal"); + tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; + DMLC_DECLARE_FIELD(scales).set_default(NumericalParam(tmp, tmp + 4)) + .describe("Used to generate anchor windows by enumerating scales"); + tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; + DMLC_DECLARE_FIELD(ratios).set_default(NumericalParam(tmp, tmp + 3)) + .describe("Used to generate anchor windows by enumerating ratios"); + DMLC_DECLARE_FIELD(feature_stride).set_default(16) + .describe("The size of the receptive field each unit in the convolution layer of the rpn," + "for example the product of all stride's prior to this layer."); + DMLC_DECLARE_FIELD(output_score).set_default(false) + .describe("Add score to outputs"); + DMLC_DECLARE_FIELD(iou_loss).set_default(false) + .describe("Usage of IoU Loss"); + } +}; + +template +Operator *CreateOp(MultiProposalParam param); + +#if DMLC_USE_CXX11 
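+// Shape example, derived from InferShape below and assuming the default
+// 4 scales x 3 ratios = 12 anchors per location: cls_prob is
+// (batch, 2 * 12, H, W), bbox_pred is (batch, 4 * 12, H, W), im_info is
+// (batch, 3), and the outputs are (batch * rpn_post_nms_top_n, 5) proposals
+// and (batch * rpn_post_nms_top_n, 1) scores.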
+class MultiProposalProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3) << "Input:[cls_prob, bbox_pred, im_info]"; + const TShape &dshape = in_shape->at(proposal::kClsProb); + if (dshape.ndim() == 0) return false; + Shape<4> bbox_pred_shape; + bbox_pred_shape = Shape4(dshape[0], dshape[1] * 2, dshape[2], dshape[3]); + SHAPE_ASSIGN_CHECK(*in_shape, proposal::kBBoxPred, + bbox_pred_shape); + Shape<2> im_info_shape; + im_info_shape = Shape2(dshape[0], 3); + SHAPE_ASSIGN_CHECK(*in_shape, proposal::kImInfo, im_info_shape); + out_shape->clear(); + // output + out_shape->push_back(Shape2(dshape[0] * param_.rpn_post_nms_top_n, 5)); + // score + out_shape->push_back(Shape2(dshape[0] * param_.rpn_post_nms_top_n, 1)); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new MultiProposalProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "_contrib_MultiProposal"; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {}; + } + + int NumVisibleOutputs() const override { + if (param_.output_score) { + return 2; + } else { + return 1; + } + } + + int NumOutputs() const override { + return 2; + } + + std::vector ListArguments() const override { + return {"cls_prob", "bbox_pred", "im_info"}; + } + + std::vector ListOutputs() const override { + return {"output", "score"}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + MultiProposalParam param_; +}; // class MultiProposalProp + +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +//======================== +// Anchor Generation Utils +//======================== +namespace mxnet { +namespace op { +namespace utils { + +inline void _MakeAnchor(float w, + float h, + float x_ctr, + float y_ctr, + std::vector *out_anchors) { + out_anchors->push_back(x_ctr - 0.5f * (w - 1.0f)); + out_anchors->push_back(y_ctr - 0.5f * (h - 1.0f)); + out_anchors->push_back(x_ctr + 0.5f * (w - 1.0f)); + out_anchors->push_back(y_ctr + 0.5f * (h - 1.0f)); + out_anchors->push_back(0.0f); +} + +inline void _Transform(float scale, + float ratio, + const std::vector& base_anchor, + std::vector *out_anchors) { + float w = base_anchor[2] - base_anchor[1] + 1.0f; + float h = base_anchor[3] - base_anchor[1] + 1.0f; + float x_ctr = base_anchor[0] + 0.5 * (w - 1.0f); + float y_ctr = base_anchor[1] + 0.5 * (h - 1.0f); + float size = w * h; + float size_ratios = std::floor(size / ratio); + float new_w = std::floor(std::sqrt(size_ratios) + 0.5f) * scale; + float new_h = std::floor((new_w / scale * ratio) + 0.5f) * scale; + + _MakeAnchor(new_w, new_h, x_ctr, + y_ctr, out_anchors); +} + +// out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() +inline void GenerateAnchors(const std::vector& base_anchor, + const std::vector& ratios, + const std::vector& scales, + std::vector *out_anchors) { + for (size_t j = 0; j < ratios.size(); ++j) { + for (size_t k = 0; k < scales.size(); ++k) { + _Transform(scales[k], ratios[j], 
+                 base_anchor, out_anchors);
+    }
+  }
+}
+
+}  // namespace utils
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_MULTI_PROPOSAL_INL_H_
diff --git a/src/operator/contrib/multi_proposal.cc b/src/operator/contrib/multi_proposal.cc
new file mode 100644
index 000000000000..c8f75eaec547
--- /dev/null
+++ b/src/operator/contrib/multi_proposal.cc
@@ -0,0 +1,63 @@
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file multi_proposal.cc
+ * \brief MultiProposal Operator
+ * \author Xizhou Zhu
+*/
+
+#include "./multi_proposal-inl.h"
+
+
+namespace mxnet {
+namespace op {
+
+template<typename xpu>
+class MultiProposalOp : public Operator {
+ public:
+  explicit MultiProposalOp(MultiProposalParam param) {
+    this->param_ = param;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    LOG(FATAL) << "not implemented";
+  }
+
+ private:
+  MultiProposalParam param_;
+};  // class MultiProposalOp
+
+template<>
+Operator *CreateOp<cpu>(MultiProposalParam param) {
+  return new MultiProposalOp<cpu>(param);
+}
+
+Operator* MultiProposalProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(MultiProposalParam);
+
+MXNET_REGISTER_OP_PROPERTY(_contrib_MultiProposal, MultiProposalProp)
+.describe("Generate region proposals via RPN")
+.add_argument("cls_prob", "NDArray-or-Symbol", "Score of how likely proposal is object.")
+.add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals")
+.add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.")
+.add_arguments(MultiProposalParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/multi_proposal.cu b/src/operator/contrib/multi_proposal.cu
new file mode 100644
index 000000000000..052d777d5fac
--- /dev/null
+++ b/src/operator/contrib/multi_proposal.cu
@@ -0,0 +1,593 @@
+/*!
+ * Copyright (c) 2015 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file multi_proposal.cu + * \brief MultiProposal Operator + * \author Shaoqing Ren, Xizhou Zhu, Jian Guo +*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../operator_common.h" +#include "../mshadow_op.h" +#include "./multi_proposal-inl.h" + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#define FRCNN_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ +} while (0) + +namespace mshadow { +namespace cuda { +namespace multi_proposal { + +// scores are (b, 2 * anchor, h, w) +// workspace_proposals are (b, h * w * anchor, 5) +// w defines "x" and h defines "y" +// count should be total anchors numbers, h * w * anchors +template +__global__ void ProposalGridKernel(const int count, + const int num_anchors, + const int height, + const int width, + const int feature_stride, + const Dtype* scores, + Dtype* workspace_proposals) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % width; + int h = (index / num_anchors / width) % height; + int b = index / num_anchors / width / height; + + workspace_proposals[index * 5 + 0] = workspace_proposals[a * 5 + 0] + w * feature_stride; + workspace_proposals[index * 5 + 1] = workspace_proposals[a * 5 + 1] + h * feature_stride; + workspace_proposals[index * 5 + 2] = workspace_proposals[a * 5 + 2] + w * feature_stride; + workspace_proposals[index * 5 + 3] = workspace_proposals[a * 5 + 3] + h * feature_stride; + workspace_proposals[index * 5 + 4] = + scores[((b * (2 * num_anchors) + a + num_anchors) * height + h) * width + w]; + } +} + +// boxes are (b, h * w * anchor, 5) +// deltas are (b, 4 * anchor, h, w) +// out_pred_boxes are (b, h * w * anchor, 5) +// count should be total anchors numbers, b * h * w * anchors +// in-place write: boxes and out_pred_boxes are the same location +template +__global__ void BBoxPredKernel(const int count, + const int num_anchors, + const int feat_height, + const int feat_width, + const int feature_stride, + const Dtype* im_infos, + const Dtype* boxes, + const Dtype* deltas, + Dtype* out_pred_boxes) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % feat_width; + int h = (index / num_anchors / feat_width) % feat_height; + int b = index / num_anchors / feat_width / feat_height; + + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); + + float width = boxes[index * 5 + 2] - boxes[index * 5 + 0] + 1.0f; + float height = boxes[index * 5 + 3] - boxes[index * 5 + 1] + 1.0f; + float ctr_x = boxes[index * 5 + 0] + 0.5f * (width - 1.0f); + float ctr_y = boxes[index * 5 + 1] + 0.5f * (height - 1.0f); + + int ba = (b * num_anchors + a); + float dx = deltas[((ba * 4) * feat_height + h) * feat_width + w]; + float dy = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; + float dw = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; + float 
dh = deltas[((ba * 4 + 3) * feat_height + h) * feat_width + w]; + + float pred_ctr_x = dx * width + ctr_x; + float pred_ctr_y = dy * height + ctr_y; + float pred_w = exp(dw) * width; + float pred_h = exp(dh) * height; + + float pred_x1 = pred_ctr_x - 0.5f * (pred_w - 1.0f); + float pred_y1 = pred_ctr_y - 0.5f * (pred_h - 1.0f); + float pred_x2 = pred_ctr_x + 0.5f * (pred_w - 1.0f); + float pred_y2 = pred_ctr_y + 0.5f * (pred_h - 1.0f); + + pred_x1 = max(min(pred_x1, im_width - 1.0f), 0.0f); + pred_y1 = max(min(pred_y1, im_height - 1.0f), 0.0f); + pred_x2 = max(min(pred_x2, im_width - 1.0f), 0.0f); + pred_y2 = max(min(pred_y2, im_height - 1.0f), 0.0f); + + out_pred_boxes[index * 5 + 0] = pred_x1; + out_pred_boxes[index * 5 + 1] = pred_y1; + out_pred_boxes[index * 5 + 2] = pred_x2; + out_pred_boxes[index * 5 + 3] = pred_y2; + + if (h >= real_height || w >= real_width) { + out_pred_boxes[index * 5 + 4] = -1.0f; + } + } +} + +// boxes are (b, h * w * anchor, 5) +// deltas are (b, 4 * anchor, h, w) +// out_pred_boxes are (b, h * w * anchor, 5) +// count should be total anchors numbers, b * h * w * anchors +// in-place write: boxes and out_pred_boxes are the same location +template +__global__ void IoUPredKernel(const int count, + const int num_anchors, + const int feat_height, + const int feat_width, + const int feature_stride, + const Dtype* im_infos, + const Dtype* boxes, + const Dtype* deltas, + Dtype* out_pred_boxes) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int a = index % num_anchors; + int w = (index / num_anchors) % feat_width; + int h = (index / num_anchors / feat_width) % feat_height; + int b = index / num_anchors / feat_width / feat_height; + + float im_height = im_infos[b * 3]; + float im_width = im_infos[b * 3 + 1]; + int real_height = static_cast(im_height / feature_stride); + int real_width = static_cast(im_width / feature_stride); + + float x1 = boxes[index * 5 + 0]; + float y1 = boxes[index * 5 + 1]; + float x2 = boxes[index * 5 + 2]; + float y2 = boxes[index * 5 + 3]; + + int ba = (b * num_anchors + a); + float dx1 = deltas[((ba * 4) * feat_height + h) * feat_width + w]; + float dy1 = deltas[((ba * 4 + 1) * feat_height + h) * feat_width + w]; + float dx2 = deltas[((ba * 4 + 2) * feat_height + h) * feat_width + w]; + float dy2 = deltas[((ba * 4 + 3) * feat_height + h) * feat_width + w]; + + float pred_x1 = max(min(x1 + dx1, im_width - 1.0f), 0.0f); + float pred_y1 = max(min(y1 + dy1, im_height - 1.0f), 0.0f); + float pred_x2 = max(min(x2 + dx2, im_width - 1.0f), 0.0f); + float pred_y2 = max(min(y2 + dy2, im_height - 1.0f), 0.0f); + + out_pred_boxes[index * 5 + 0] = pred_x1; + out_pred_boxes[index * 5 + 1] = pred_y1; + out_pred_boxes[index * 5 + 2] = pred_x2; + out_pred_boxes[index * 5 + 3] = pred_y2; + + if (h >= real_height || w >= real_width) { + out_pred_boxes[index * 5 + 4] = -1.0f; + } + } +} + +// filter box with stride less than rpn_min_size +// filter: set score to zero +// dets (b, n, 5) +template +__global__ void FilterBoxKernel(const int count, + const int count_anchors, + const float original_min_size, + const Dtype* im_infos, + Dtype* dets) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + int b = index / count_anchors; + float iw = dets[index * 5 + 2] - dets[index * 5 + 0] + 1.0f; + float ih = dets[index * 5 + 3] - dets[index * 5 + 1] + 1.0f; + float min_size = original_min_size * im_infos[b * 3 + 2]; + if (iw < min_size || ih 
< min_size) { + dets[index * 5 + 0] -= min_size / 2; + dets[index * 5 + 1] -= min_size / 2; + dets[index * 5 + 2] += min_size / 2; + dets[index * 5 + 3] += min_size / 2; + dets[index * 5 + 4] = -1.0f; + } + } +} + +// copy score and init order +// dets (n, 5); score (n, ); order (n, ) +// count should be n (total anchors or proposals) +template +__global__ void CopyScoreKernel(const int count, + const Dtype* dets, + Dtype* score, + int* order) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + score[index] = dets[index * 5 + 4]; + order[index] = index; + } +} + +// reorder proposals according to order and keep the top_n proposals +// prev_dets (n, 5); order (n, ); dets (n, 5) +// count should be output anchor numbers (top_n) +template +__global__ void ReorderProposalsKernel(const int count, + const Dtype* prev_dets, + const int* order, + Dtype* dets) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + const int order_i = order[index]; + for (int j = 0; j < 5; j ++) { + dets[index * 5 + j] = prev_dets[order_i * 5 + j]; + } + } +} + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int threadsPerBlock = sizeof(uint64_t) * 8; + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +void _nms(const mshadow::Tensor& boxes, + const float nms_overlap_thresh, + int *keep, + int *num_out) { + const int threadsPerBlock = sizeof(uint64_t) * 8; + const int boxes_num = boxes.size(0); + const int boxes_dim = boxes.size(1); + + float* boxes_dev = boxes.dptr_; + uint64_t* mask_dev = NULL; + + const 
int col_blocks = DIVUP(boxes_num, threadsPerBlock); + FRCNN_CUDA_CHECK(cudaMalloc(&mask_dev, + boxes_num * col_blocks * sizeof(uint64_t))); + + dim3 blocks(DIVUP(boxes_num, threadsPerBlock), + DIVUP(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + std::vector mask_host(boxes_num * col_blocks); + FRCNN_CUDA_CHECK(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(uint64_t) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep[num_to_keep++] = i; + uint64_t *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + *num_out = num_to_keep; + + FRCNN_CUDA_CHECK(cudaFree(mask_dev)); +} + +// copy proposals to output +// dets (top_n, 5); keep (top_n, ); out (top_n, ) +// count should be top_n (total anchors or proposals) +template +__global__ void PrepareOutput(const int count, + const Dtype* dets, + const int* keep, + const int out_size, + const int image_index, + Dtype* out, + Dtype* score) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; + index < count; + index += blockDim.x * gridDim.x) { + out[index * 5] = image_index; + if (index < out_size) { + int keep_i = keep[index]; + for (int j = 0; j < 4; ++j) { + out[index * 5 + j + 1] = dets[keep_i * 5 + j]; + } + score[index] = dets[keep_i * 5 + 4]; + } else { + int keep_i = keep[index % out_size]; + for (int j = 0; j < 4; ++j) { + out[index * 5 + j + 1] = dets[keep_i * 5 + j]; + } + score[index] = dets[keep_i * 5 + 4]; + } + } +} +} // namespace multi_proposal +} // namespace cuda +} // namespace mshadow + +namespace mxnet { +namespace op { + +template +class MultiProposalGPUOp : public Operator{ + public: + explicit MultiProposalGPUOp(MultiProposalParam param) { + this->param_ = param; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mshadow::cuda; + using namespace mshadow::cuda::multi_proposal; + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 2); + CHECK_GT(req.size(), 1); + CHECK_EQ(req[proposal::kOut], kWriteTo); + /*CHECK_EQ(in_data[proposal::kClsProb].shape_[0], 1) + << "Sorry, multiple images each device is not implemented.";*/ + + Stream *s = ctx.get_stream(); + + Tensor scores = in_data[proposal::kClsProb].get(s); + Tensor bbox_deltas = in_data[proposal::kBBoxPred].get(s); + Tensor im_info = in_data[proposal::kImInfo].get(s); + + Tensor out = out_data[proposal::kOut].get(s); + Tensor out_score = out_data[proposal::kScore].get(s); + + int num_images = scores.size(0); + int num_anchors = scores.size(1) / 2; + int height = scores.size(2); + int width = scores.size(3); + int count_anchors = num_anchors * height * width; // count of total anchors + int count = num_images * count_anchors; + // set to -1 for max + int rpn_pre_nms_top_n = (param_.rpn_pre_nms_top_n > 0) ? 
param_.rpn_pre_nms_top_n + : count_anchors; + rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors); + int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n); + + // Generate first anchors based on base anchor + std::vector base_anchor(4); + base_anchor[0] = 0.0; + base_anchor[1] = 0.0; + base_anchor[2] = param_.feature_stride - 1.0; + base_anchor[3] = param_.feature_stride - 1.0; + CHECK_EQ(num_anchors, param_.ratios.info.size() * param_.scales.info.size()); + std::vector anchors; + utils::GenerateAnchors(base_anchor, + param_.ratios.info, + param_.scales.info, + &anchors); + + // Copy generated anchors to GPU + float* workspace_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_proposals_ptr, + sizeof(float) * num_images * count_anchors * 5)); + Tensor workspace_proposals(workspace_proposals_ptr, + Shape3(num_images, count_anchors, 5)); + FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_, &anchors[0], + sizeof(float) * anchors.size(), cudaMemcpyHostToDevice)); + + // Copy proposals to a mesh grid + dim3 dimGrid((count + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); + dim3 dimBlock(kMaxThreadsPerBlock); + CheckLaunchParam(dimGrid, dimBlock, "ProposalGrid"); + ProposalGridKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, + scores.dptr_, workspace_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // Transform anchors and bbox_deltas into bboxes + CheckLaunchParam(dimGrid, dimBlock, "BBoxPred"); + if (param_.iou_loss) { + IoUPredKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); + } else { + BBoxPredKernel<<>>( + count, num_anchors, height, width, param_.feature_stride, im_info.dptr_, + workspace_proposals.dptr_, bbox_deltas.dptr_, workspace_proposals.dptr_); + } + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // filter boxes with less than rpn_min_size + CheckLaunchParam(dimGrid, dimBlock, "FilterBox"); + FilterBoxKernel<<>>( + count, count_anchors, param_.rpn_min_size, im_info.dptr_, workspace_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + + + dimGrid = dim3((count_anchors + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock); + dimBlock = dim3(kMaxThreadsPerBlock); + // Copy score to a continuous memory + float* score_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&score_ptr, sizeof(float) * count_anchors)); + Tensor score(score_ptr, Shape1(count_anchors)); + int* order_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&order_ptr, sizeof(int) * count_anchors)); + Tensor order(order_ptr, Shape1(count_anchors)); + + float* workspace_ordered_proposals_ptr = NULL; + FRCNN_CUDA_CHECK(cudaMalloc(&workspace_ordered_proposals_ptr, + sizeof(float) * rpn_pre_nms_top_n * 5)); + Tensor workspace_ordered_proposals(workspace_ordered_proposals_ptr, + Shape2(rpn_pre_nms_top_n, 5)); + + int* keep; + FRCNN_CUDA_CHECK(cudaMalloc(&keep, sizeof(int) * rpn_pre_nms_top_n)); + + for (int b = 0; b < num_images; b++) { + CheckLaunchParam(dimGrid, dimBlock, "CopyScore"); + CopyScoreKernel << > >( + count_anchors, workspace_proposals.dptr_ + b * count_anchors * 5, + score.dptr_, order.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // argsort score, save order + thrust::stable_sort_by_key(thrust::device, + score.dptr_, + score.dptr_ + score.size(0), + order.dptr_, + thrust::greater()); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // Reorder proposals according to order + + dimGrid.x = (rpn_pre_nms_top_n + 
kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, dimBlock, "ReorderProposals"); + ReorderProposalsKernel << > >( + rpn_pre_nms_top_n, workspace_proposals.dptr_ + b * count_anchors * 5, + order.dptr_, workspace_ordered_proposals.dptr_); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + + // perform nms + std::vector _keep(workspace_ordered_proposals.size(0)); + int out_size = 0; + _nms(workspace_ordered_proposals, + param_.threshold, + &_keep[0], + &out_size); + + // copy nms result to gpu + FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(), + cudaMemcpyHostToDevice)); + + // copy results after nms + dimGrid.x = (rpn_post_nms_top_n + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + CheckLaunchParam(dimGrid, dimBlock, "PrepareOutput"); + PrepareOutput << > >( + rpn_post_nms_top_n, workspace_ordered_proposals.dptr_, keep, out_size, b, + out.dptr_ + b * rpn_post_nms_top_n * 5, out_score.dptr_ + b * rpn_post_nms_top_n); + FRCNN_CUDA_CHECK(cudaPeekAtLastError()); + } + // free temporary memory + FRCNN_CUDA_CHECK(cudaFree(keep)); + FRCNN_CUDA_CHECK(cudaFree(workspace_ordered_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(workspace_proposals_ptr)); + FRCNN_CUDA_CHECK(cudaFree(score_ptr)); + FRCNN_CUDA_CHECK(cudaFree(order_ptr)); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), 3); + + Stream *s = ctx.get_stream(); + Tensor gscores = in_grad[proposal::kClsProb].get(s); + Tensor gbbox = in_grad[proposal::kBBoxPred].get(s); + Tensor ginfo = in_grad[proposal::kImInfo].get(s); + + // can not assume the grad would be zero + Assign(gscores, req[proposal::kClsProb], 0); + Assign(gbbox, req[proposal::kBBoxPred], 0); + Assign(ginfo, req[proposal::kImInfo], 0); + } + + private: + MultiProposalParam param_; +}; // class MultiProposalGPUOp + +template<> +Operator* CreateOp(MultiProposalParam param) { + return new MultiProposalGPUOp(param); +} +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/nn/deformable_im2col.cuh b/src/operator/contrib/nn/deformable_im2col.cuh new file mode 100644 index 000000000000..d9e7b970ca84 --- /dev/null +++ b/src/operator/contrib/nn/deformable_im2col.cuh @@ -0,0 +1,525 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
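+ * Illustrative example (assumed square 3x3 kernel, pad 1, stride 1, dilation 1):
+ * for a (C, H, W) input, deformable_im2col fills a column buffer of shape
+ * (C * 9, H, W); each of the 9 taps of an output location is read at its
+ * regular grid position plus the learned (offset_h, offset_w) taken from the
+ * offset map, using bilinear interpolation of the four surrounding pixels.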
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai + */ + +#ifndef MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ +#define MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ + +#include +#include +#include +#include +#include +#include "../../mxnet_op.h" +#include "../../../common/cuda_utils.h" + + + +namespace mxnet { +namespace op { + +template +__device__ DType deformable_im2col_bilinear(const DType* bottom_data, const int data_width, + const int height, const int width, DType h, DType w) { + + int h_low = floor(h); + int w_low = floor(w); + int h_high; + int w_high; + if (h_low >= height - 1) { + h_high = h_low = height - 1; + h = (DType)h_low; + } + else { + h_high = h_low + 1; + } + + if (w_low >= width - 1) { + w_high = w_low = width - 1; + w = (DType)w_low; + } + else { + w_high = w_low + 1; + } + + DType lh = h - h_low; + DType lw = w - w_low; + DType hh = 1 - lh, hw = 1 - lw; + + DType v1 = bottom_data[h_low * data_width + w_low]; + DType v2 = bottom_data[h_low * data_width + w_high]; + DType v3 = bottom_data[h_high * data_width + w_low]; + DType v4 = bottom_data[h_high * data_width + w_high]; + DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ DType get_gradient_weight(DType argmax_h, DType argmax_w, + const int h, const int w, const int height, const int width) { + + if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) { + //empty + return 0; + } + + argmax_h = max(argmax_h, (DType)0.0f); + argmax_w = max(argmax_w, (DType)0.0f); + + int argmax_h_low = (int)argmax_h; + int argmax_w_low = (int)argmax_w; + int argmax_h_high; + int argmax_w_high; + if (argmax_h_low >= height - 1) { + argmax_h_high = argmax_h_low = height - 1; + argmax_h = (DType)argmax_h_low; + } else { + argmax_h_high = argmax_h_low + 1; + } + if (argmax_w_low >= width - 1) + { + argmax_w_high = argmax_w_low = width - 1; + argmax_w = (DType)argmax_w_low; + } else { + argmax_w_high = argmax_w_low + 1; + } + DType weight = 0; + if (h == argmax_h_low) { + if (w == argmax_w_low) { + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + } else if (w == argmax_w_high) { + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + } + } else if (h == argmax_h_high) { + if (w == argmax_w_low) { + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + } else if (w == argmax_w_high) { + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + } + } + return weight; +} + + +template +__device__ DType get_coordinate_weight(DType argmax_h, DType argmax_w, + const int height, const int width, const DType* im_data, + const int data_width, const int bp_dir) { + + if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) + { + //empty + return 0; + } + + if (argmax_h < 0) argmax_h = 0; + if (argmax_w < 0) argmax_w = 0; + + int argmax_h_low = (int)argmax_h; + int argmax_w_low = (int)argmax_w; + int argmax_h_high; + int argmax_w_high; + if (argmax_h_low >= height - 1) { + argmax_h_high = argmax_h_low = height - 1; + argmax_h = (DType)argmax_h_low; + } else { + argmax_h_high = argmax_h_low + 1; + } + if (argmax_w_low >= width - 1) { + argmax_w_high = argmax_w_low = width - 1; + argmax_w = (DType)argmax_w_low; + } else { + argmax_w_high = argmax_w_low + 1; + } + DType weight = 0; + + if (bp_dir == 0) { + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low 
* data_width + argmax_w_high]; + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + + +/*! + * \brief deformable_im2col gpu kernel. + * DO NOT call this directly. Use wrapper function im2col() instead; + */ +template +__global__ void deformable_im2col_gpu_kernel(const int n, const DType* data_im, const DType* data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* data_col) { + CUDA_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int c_im = (index / width_col) / height_col; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + DType* data_col_ptr = data_col + (c_col * height_col + h_col) * width_col + w_col; + const DType* data_im_ptr = data_im + (c_im * height + h_in) * width + w_in; + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + DType val = static_cast(0); + const DType h_im = h_in + i * dilation_h + offset_h; + const DType w_im = w_in + j * dilation_w + offset_w; + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + const DType map_h = i * dilation_h + offset_h; + const DType map_w = j * dilation_w + offset_w; + const int cur_height = height - h_in; + const int cur_width = width - w_in; + val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + } + *data_col_ptr = val; + data_col_ptr += height_col * width_col; + } + } + } +} + + + + + + +/*!\brief + * cpu function of deformable_im2col algorithm + * \param s device stream + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) 
+ * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param data_col column buffer pointer + */ +template +inline void deformable_im2col(mshadow::Stream* s, + const DType* data_im, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, const TShape& dilation, + const uint32_t deformable_group, DType* data_col) { + // num_axes should be smaller than block size + index_t num_spatial_axes = kernel_shape.ndim(); + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + index_t channel_per_deformable_group = im_shape[1] / deformable_group; + index_t num_kernels = im_shape[1] * col_shape.ProdShape(1, col_shape.ndim()); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + deformable_im2col_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) + <<::GetStream(s)>>>( + num_kernels, data_im, data_offset, im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], + pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], channel_per_deformable_group, + col_shape[1], col_shape[2], data_col); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_im2col_gpu_kernel); + break; + default: + LOG(FATAL) << "im2col_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +/*! +* \brief deformable_col2im gpu kernel. +* \brief DO NOT call this directly. Use wrapper function deformable_col2im() instead; +*/ +template +__global__ void deformable_col2im_gpu_kernel(const int n, const DType* data_col, const DType* data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* grad_im, OpReqType req) { + CUDA_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col) % kernel_w; + const int i = (index / width_col / height_col / kernel_w) % kernel_h; + const int c = index / width_col / height_col / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; + const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const DType cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1 
+ ) { + int cur_bottom_grad_pos = (c * height + cur_h + dy) * width + cur_w + dx; + DType weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + + +/*!\brief + * gpu function of deformable_col2im algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_im pointer of a image (C, H, W,...) in the image batch + */ +template +inline void deformable_col2im(mshadow::Stream* s, + const DType* data_col, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, + DType* grad_im, OpReqType req) { + index_t num_spatial_axes = kernel_shape.ndim(); + index_t im_size = im_shape.ProdShape(1, im_shape.ndim()); + index_t channel_per_deformable_group = im_shape[1] / deformable_group; + index_t num_kernels = col_shape.ProdShape(0, col_shape.ndim()); + // num_axes should be smaller than block size + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + deformable_col2im_gpu_kernel<<::GetStream(s)>>>( + num_kernels, data_col, data_offset, im_shape[1], im_shape[2], im_shape[3], + kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_im, req); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +/*! + * \brief deformable_col2im_coord gpu kernel. + * \brief DO NOT call this directly. 
Use wrapper function deformable_col2im_coord() instead; + */ +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const DType* data_col, + const DType* data_im, const DType* data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int height_col, const int width_col, + DType* grad_offset, OpReqType req) { + CUDA_KERNEL_LOOP(index, n) { + DType val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = index / width_col / height_col; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const DType* data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * width_col * height_col; + const DType* data_im_ptr = data_im + deformable_group_index * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { + const int col_pos = ((col_c * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col) % kernel_w; + int i = (col_pos / width_col / height_col / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + DType inv_h = h_in + i * dilation_h + offset_h; + DType inv_w = w_in + j * dilation_w + offset_w; + if (inv_h < 0 || inv_w < 0 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -1; + } + const DType weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +/*!\brief + * gpu function of deformable_col2im_coord algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_offset pointer of the offset (C, H, W,...) 
in the offset batch + */ +template +inline void deformable_col2im_coord(mshadow::Stream* s, + const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape, + const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) { + index_t num_spatial_axes = kernel_shape.ndim(); + index_t num_kernels = col_shape[1] * col_shape[2] * 2 * kernel_shape[0] * kernel_shape[1] * deformable_group; + index_t channel_per_deformable_group = col_shape[0] / deformable_group; + // num_axes should be smaller than block size + CHECK_LT(num_spatial_axes, mshadow::cuda::kBaseThreadNum); + using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + + deformable_col2im_coord_gpu_kernel << ::GetStream(s) >> >( + num_kernels, data_col, data_im, data_offset, im_shape[1], im_shape[2], im_shape[3], + kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_offset, req); + MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_CUH_ diff --git a/src/operator/contrib/nn/deformable_im2col.h b/src/operator/contrib/nn/deformable_im2col.h new file mode 100644 index 000000000000..9d6180034c71 --- /dev/null +++ b/src/operator/contrib/nn/deformable_im2col.h @@ -0,0 +1,157 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file deformable_im2col.h + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai + */ + +#ifndef MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ +#define MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ + +#include +#include +#include +#include +#include "../../mxnet_op.h" + +namespace mxnet { +namespace op { + +/*!\brief + * cpu function of deformable_im2col algorithm + * \param s device stream + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param data_col column buffer pointer + */ +template +inline void deformable_im2col(mshadow::Stream* s, + const DType* data_im, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, const TShape& dilation, + const uint32_t deformable_group, DType* data_col) { + if (2 == kernel_shape.ndim()) { + LOG(FATAL) << "only implemented in GPU"; + } else { + LOG(FATAL) << "not implemented"; + } +} + + +/*!\brief + * cpu function of deformable_col2im algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_im pointer of a image (C, H, W,...) 
in the image batch + */ +template +inline void deformable_col2im(mshadow::Stream* s, + const DType* data_col, const DType* data_offset, + const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, + DType* grad_im, OpReqType req) { + index_t num_spatial_axes = kernel_shape.ndim(); + LOG(FATAL) << "only implemented in GPU"; +} + + +/*!\brief + * cpu function of deformable_col2im_coord algorithm + * \param s device stream + * \param data_col start pointer of the column buffer to be filled + * \param data_im pointer of an image (C, H, W, ...) in the image batch + * \param data_offset pointer of offset (C, H, W, ...) in the offset batch + * \param im_shape input image shape in dimensions (N, C, H, W,) + * \param col_shape column buffer shape + * \param kernel_shape kernel filter shape + * \param pad pad shape + * \param stride stride shape + * \param dilation dilation shape + * \param deformable_group #offset group that deformable convolution use + * \param grad_offset pointer of the offset (C, H, W,...) in the offset batch + */ + +template +inline void deformable_col2im_coord(mshadow::Stream* s, + const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape, + const TShape& col_shape, const TShape& kernel_shape, + const TShape& pad, const TShape& stride, + const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) { + LOG(FATAL) << "only implemented in GPU"; +} + +} // namespace op +} // namespace mxnet +#ifdef __CUDACC__ +#include "./deformable_im2col.cuh" +#endif +#endif // MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_ diff --git a/src/operator/contrib/psroi_pooling-inl.h b/src/operator/contrib/psroi_pooling-inl.h new file mode 100644 index 000000000000..3a3a9c34927c --- /dev/null +++ b/src/operator/contrib/psroi_pooling-inl.h @@ -0,0 +1,222 @@ +/*! + * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling-inl.h + * \brief psroi pooling operator and symbol + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#ifndef MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ +#define MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "../mshadow_op.h" +#include "../operator_common.h" + + +namespace mxnet { +namespace op { + +// Declare enumeration of input order to make code more intuitive. +// These enums are only visible within this header +namespace psroipool { +enum PSROIPoolingOpInputs {kData, kBox}; +enum PSROIPoolingOpOutputs {kOut}; +} // psroipool + +struct PSROIPoolingParam : public dmlc::Parameter { + // TShape pooled_size; + float spatial_scale; + int output_dim; + int pooled_size; + int group_size; + DMLC_DECLARE_PARAMETER(PSROIPoolingParam) { + DMLC_DECLARE_FIELD(spatial_scale).set_range(0.0, 1.0) + .describe("Ratio of input feature map height (or w) to raw image height (or w). 
" + "Equals the reciprocal of total stride in convolutional layers"); + DMLC_DECLARE_FIELD(output_dim).describe("fix output dim"); + DMLC_DECLARE_FIELD(pooled_size).describe("fix pooled size"); + DMLC_DECLARE_FIELD(group_size).set_default(0).describe("fix group size"); + } +}; + +template +class PSROIPoolingOp : public Operator { + public: + explicit PSROIPoolingOp(PSROIPoolingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_data[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); + Stream *s = ctx.get_stream(); + + Tensor data = in_data[psroipool::kData].get(s); + Tensor bbox = in_data[psroipool::kBox].get(s); + Tensor out = out_data[psroipool::kOut].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + out = -FLT_MAX; + PSROIPoolForward(out, data, bbox, param_.spatial_scale, param_.output_dim, param_.group_size); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(out_grad[psroipool::kOut].shape_[0], in_data[psroipool::kBox].shape_[0]); + CHECK_NE(req[psroipool::kData], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; + CHECK_NE(req[psroipool::kBox], kWriteInplace) << + "ROIPooling: Backward doesn't support kWriteInplace."; + Stream *s = ctx.get_stream(); + + Tensor grad_out = out_grad[psroipool::kOut].get(s); + Tensor bbox = in_data[psroipool::kBox].get(s); + Tensor grad_in = in_grad[psroipool::kData].get(s); + Tensor grad_roi = in_grad[psroipool::kBox].get(s); + + CHECK_EQ(grad_out.CheckContiguous(), true); + CHECK_EQ(bbox.CheckContiguous(), true); + CHECK_EQ(grad_in.CheckContiguous(), true); + + if (kAddTo == req[psroipool::kData] || kWriteTo == req[psroipool::kData]) { + if (kWriteTo == req[psroipool::kData]) { + grad_in = 0.0f; + } + PSROIPoolBackwardAcc(grad_in, grad_out, bbox, param_.spatial_scale, + param_.output_dim, param_.group_size); + } + if (kWriteTo == req[psroipool::kBox]) { + grad_roi = 0.0f; + } + } + + private: + PSROIPoolingParam param_; +}; // class PSROIPoolingOp + +// Decalre Factory function, used for dispatch specialization +template +Operator* CreateOp(PSROIPoolingParam param, int dtype); + +#if DMLC_USE_CXX11 +class PSROIPoolingProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + return {"data", "rois"}; + } + + std::vector ListOutputs() const override { + return {"output"}; + } + + int NumOutputs() const override { + return 1; + } + + int NumVisibleOutputs() const override { + return 1; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + if (param_.group_size == 0) { + param_.group_size = param_.pooled_size; + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 2) << "Input:[data, rois]"; + + // data: [batch_size, c, h, w] + TShape dshape = 
in_shape->at(psroipool::kData); + CHECK_EQ(dshape.ndim(), 4) << "data should be a 4D tensor"; + + // bbox: [num_rois, 5] + TShape bshape = in_shape->at(psroipool::kBox); + CHECK_EQ(bshape.ndim(), 2) << "bbox should be a 2D tensor of shape [batch, 5]"; + CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]"; + + // out: [num_rois, c, pooled_h, pooled_w] + out_shape->clear(); + out_shape->push_back( + Shape4(bshape[0], param_.output_dim, param_.pooled_size, param_.pooled_size)); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 2); + int dtype = (*in_type)[0]; + CHECK_EQ(dtype, (*in_type)[1]); + CHECK_NE(dtype, -1) << "Input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + PSROIPoolingProp* psroi_pooling_sym = new PSROIPoolingProp(); + psroi_pooling_sym->param_ = this->param_; + return psroi_pooling_sym; + } + + std::string TypeString() const override { + return "_contrib_PSROIPooling"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[psroipool::kOut], in_data[psroipool::kBox]}; + } + + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + + private: + PSROIPoolingParam param_; +}; // class PSROIPoolingProp +#endif +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONTRIB_PSROI_POOLING_INL_H_ diff --git a/src/operator/contrib/psroi_pooling.cc b/src/operator/contrib/psroi_pooling.cc new file mode 100644 index 000000000000..ad25aec8eee8 --- /dev/null +++ b/src/operator/contrib/psroi_pooling.cc @@ -0,0 +1,80 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling.cc + * \brief psroi pooling operator + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#include "./psroi_pooling-inl.h" +#include +#include +#include +#include +#include + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace mshadow { +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale_, + const int output_dim_, + const int group_size_) { + // NOT_IMPLEMENTED; + return; +} + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale_, + const int output_dim_, + const int group_size_) { + // NOT_IMPLEMENTED; + return; +} +} // namespace mshadow + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(PSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new PSROIPoolingOp(param); + }); + return op; +} + +Operator *PSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +} + +DMLC_REGISTER_PARAMETER(PSROIPoolingParam); + +MXNET_REGISTER_OP_PROPERTY(_contrib_PSROIPooling, PSROIPoolingProp) +.describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " +"spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " +"by max pooling to a fixed size output indicated by pooled_size. batch_size will change to " +"the number of region bounding boxes after PSROIPooling") +.add_argument("data", "Symbol", "Input data to the pooling operator, a 4D Feature maps") +.add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of " +"[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and down right corners " +"of designated region of interest. batch_index indicates the index of corresponding image " +"in the input data") +.add_arguments(PSROIPoolingParam::__FIELDS__()); +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu new file mode 100644 index 000000000000..962c874c6d1a --- /dev/null +++ b/src/operator/contrib/psroi_pooling.cu @@ -0,0 +1,260 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 Microsoft + * Licensed under The Apache-2.0 License [see LICENSE for details] + * \file psroi_pooling.cu + * \brief psroi pooling operator + * \author Yi Li, Tairui Chen, Guodong Zhang, Haozhi Qi, Jifeng Dai +*/ +#include "./psroi_pooling-inl.h" +#include +#include +#include +#include +#include "../../common/cuda_utils.h" +#include "../mxnet_op.h" + +#define PSROIPOOLING_CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) +#define CUDA_KERNEL_LOOP(i, n) \ +for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +namespace mshadow { +namespace cuda { + +template +__global__ void PSROIPoolForwardKernel( + const int count, + const DType* bottom_data, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const DType* bottom_rois, + const int output_dim, + const int group_size, + DType* top_data) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph) * bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + int c = (ctop*group_size + gh)*group_size + gw; + + const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; + DType out_sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h*width + w; + out_sum += offset_bottom_data[bottom_index]; + } + } + + DType bin_area = (hend - hstart)*(wend - wstart); + top_data[index] = is_empty? (DType)0. 
: out_sum/bin_area; + } +} + +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + const DType *bottom_data = data.dptr_; + const DType *bottom_rois = bbox.dptr_; + DType *top_data = out.dptr_; + const int count = out.shape_.Size(); + const int channels = data.size(1); + const int height = data.size(2); + const int width = data.size(3); + const int pooled_height = out.size(2); + const int pooled_width = out.size(3); + cudaStream_t stream = Stream::GetStream(out.stream_); + PSROIPoolForwardKernel << > >( + count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, output_dim_, group_size_, top_data); + PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); +} + + +template +__global__ void PSROIPoolBackwardAccKernel( + const int count, + const DType* top_diff, + const int num_rois, + const DType spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int group_size, + const int output_dim, + DType* bottom_diff, + const DType* bottom_rois) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const DType* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale; + DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale; + DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale; + DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + DType roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + DType roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + DType bin_size_h = roi_height / static_cast(pooled_height); + DType bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph)* bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Compute c at bottom + int gw = floor(static_cast(pw)* group_size / pooled_width); + int gh = floor(static_cast(ph)* group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + int c = (ctop*group_size + gh)*group_size + gw; + DType* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; + DType bin_area = (hend - hstart)*(wend - wstart); + DType diff_val = is_empty ? (DType)0. 
: top_diff[index] / bin_area; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h*width + w; + atomicAdd(offset_bottom_diff + bottom_index, diff_val); + } + } + } +} + + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + // LOG(INFO) << "PSROIPoolBackward"; + const DType *top_diff = out_grad.dptr_; + const DType *bottom_rois = bbox.dptr_; + DType *bottom_diff = in_grad.dptr_; + const int count = out_grad.shape_.Size(); + const int num_rois = bbox.size(0); + const int channels = in_grad.size(1); + const int height = in_grad.size(2); + const int width = in_grad.size(3); + const int pooled_height = out_grad.size(2); + const int pooled_width = out_grad.size(3); + cudaStream_t stream = Stream::GetStream(in_grad.stream_); + PSROIPoolBackwardAccKernel << > >( + count, top_diff, num_rois, spatial_scale, channels, height, width, + pooled_height, pooled_width, group_size_, output_dim_, bottom_diff, bottom_rois); + PSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); +} + +} // namespace cuda + +template +inline void PSROIPoolForward(const Tensor &out, + const Tensor &data, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + cuda::PSROIPoolForward(out, data, bbox, spatial_scale, output_dim_, group_size_); +} + +template +inline void PSROIPoolBackwardAcc(const Tensor &in_grad, + const Tensor &out_grad, + const Tensor &bbox, + const float spatial_scale, + const int output_dim_, + const int group_size_) { + cuda::PSROIPoolBackwardAcc(in_grad, out_grad, bbox, spatial_scale, output_dim_, group_size_); +} + +} // namespace mshadow + + +namespace mxnet { +namespace op { + +template<> +Operator* CreateOp(PSROIPoolingParam param, int dtype) { + Operator* op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new PSROIPoolingOp(param); + }); + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 48e44133216b..4b884f523789 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1093,6 +1093,166 @@ def test_unfuse(): check_rnn_consistency(fused, stack) check_rnn_consistency(stack, fused) +def test_psroipooling_with_type(): + np.random.seed(1234) + arg_params = { + 'psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} + + # plain psroipooling + sym = mx.contrib.sym.PSROIPooling(spatial_scale=0.0625, output_dim=2, pooled_size=3, name='psroipool') + ctx_list = [{'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float64, 'psroipool_rois': np.float64}}, + {'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float32, 'psroipool_rois': np.float32}}, + {'ctx': mx.gpu(0), + 'psroipool_data': (1, 18, 14, 14), + 'psroipool_rois': (2, 5), + 'type_dict': {'psroipool_data': np.float16, 'psroipool_rois': np.float16}}, + ] + + check_consistency(sym, ctx_list, grad_req={'psroipool_data': 'write', + 'psroipool_rois': 'null'}, arg_params=arg_params) + +def test_deformable_psroipooling_with_type(): + np.random.seed(1234) + arg_params = { + 'deformable_psroipool_rois': np.array([[0, 10, 22, 161, 173], [0, 20, 15, 154, 160]])} + + # deformable psroipooling + sym = 
mx.contrib.sym.DeformablePSROIPooling(spatial_scale=0.0625, sample_per_part=4, group_size=3, pooled_size=3, + output_dim=2, trans_std=0.1, no_trans=False, name='deformable_psroipool') + + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float64, 'deformable_psroipool_rois': np.float64, + 'deformable_psroipool_trans': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float32, 'deformable_psroipool_rois': np.float32, + 'deformable_psroipool_trans': np.float32}}, + {'ctx': mx.gpu(0), + 'deformable_psroipool_data': (1, 18, 14, 14), + 'deformable_psroipool_rois': (2, 5), + 'deformable_psroipool_trans': (2, 4, 3, 3), + 'type_dict': {'deformable_psroipool_data': np.float16, 'deformable_psroipool_rois': np.float16, + 'deformable_psroipool_trans': np.float16}}, + ] + + check_consistency(sym, ctx_list, grad_req={'deformable_psroipool_data': 'write', + 'deformable_psroipool_rois': 'null', + 'deformable_psroipool_trans': 'write'}, arg_params=arg_params) + +def test_deformable_convolution_with_type(): + np.random.seed(1234) + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), name='deformable_conv') + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 10, 10), + 'deformable_conv_offset': (2, 18, 8, 8), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 10, 10), + 'deformable_conv_offset': (2, 18, 8, 8), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 10, 10), + # 'deformable_conv_offset': (2, 18, 8, 8), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_conv_offset': np.float16}}, + ] + # wider tolerance needed for true-fp16 NCHW test above + tol = {np.dtype(np.float16): 0.5, + np.dtype(np.float32): 1e-3, + np.dtype(np.float64): 1e-5, + np.dtype(np.uint8): 0, + np.dtype(np.int32): 0} + check_consistency(sym, ctx_list, tol=tol) + # test ability to turn off training on bias + check_consistency(sym, ctx_list, grad_req={'deformable_conv_data': 'write', + 'deformable_conv_offset': 'write', + 'deformable_conv_weight': 'write', + 'deformable_conv_bias': 'null'}, tol=tol) +def test_deformable_convolution_options(): + # 2D convolution + + # Pad > 0 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 7, 7), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 7, 7), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_offset': (2, 18, 7, 7), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), pad=(1,1), 
name='deformable_conv') + check_consistency(sym, ctx_list) + + # Stride > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 18, 3, 3), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), stride=(2,2), name='deformable_conv') + check_consistency(sym, ctx_list) + + # Dilate > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 18, 3, 3), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 18, 3, 3), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=3, kernel=(3,3), dilate=(2,2), name='deformable_conv') + check_consistency(sym, ctx_list) + + # Deformable group > 1 + # since atomicAdd does not support fp16 (which deformable conv uses in backward), we do not test fp16 here + ctx_list = [{'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 36, 5, 5), + 'type_dict': {'deformable_conv_data': np.float64, 'deformable_conv_offset': np.float64}}, + {'ctx': mx.gpu(0), + 'deformable_conv_data': (2, 2, 7, 7), + 'deformable_conv_offset': (2, 36, 5, 5), + 'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}}, + # {'ctx': mx.gpu(0), + # 'deformable_conv_data': (2, 2, 7, 7), + # 'deformable_conv_offset': (2, 36, 5, 5), + # 'type_dict': {'deformable_conv_data': np.float16, 'deformable_offset': np.float16}}, + ] + sym = mx.contrib.sym.DeformableConvolution(num_filter=4, kernel=(3,3), num_deformable_group=2, + name='deformable_conv') + check_consistency(sym, ctx_list) def test_residual_fused(): cell = mx.rnn.ResidualCell( mx.rnn.FusedRNNCell(50, num_layers=3, mode='lstm', @@ -1144,3 +1304,8 @@ def test_residual_fused(): test_take_with_type() test_bilinear_sampler_with_type() test_grid_generator_with_type() + test_psroipooling_with_type() + test_deformable_psroipooling_with_type() + test_deformable_convolution_options() + test_deformable_convolution_with_type() + diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 79795e9386b6..2cdac27c3edb 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -2,6 +2,7 @@ import numpy as np import mxnet as mx import random +import itertools from numpy.testing import assert_allclose from mxnet.test_utils import * @@ -3160,6 +3161,101 @@ def create_operator(self, ctx, shapes, dtypes):
check_numeric_gradient(op, [x]) +def test_psroipooling(): + for num_rois in [1, 2]: + for num_classes, num_group in itertools.product([2, 3], [2, 3]): + for image_height, image_width in itertools.product([168, 224], [168, 224]): + for grad_nodes in [['im_data']]: + spatial_scale = 0.0625 + feat_height = np.int(image_height * spatial_scale) + feat_width = np.int(image_width * spatial_scale) + im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) + rois_data = np.zeros([num_rois, 5]) + rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) + rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1)) + + im_data_var = mx.symbol.Variable(name="im_data") + rois_data_var = mx.symbol.Variable(name="rois_data") + op = mx.contrib.sym.PSROIPooling(data=im_data_var, rois=rois_data_var, spatial_scale=spatial_scale, + group_size=num_group, pooled_size=num_group, + output_dim=num_classes, name='test_op') + rtol, atol = 1e-2, 1e-4 + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, rois_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + +def test_deformable_convolution(): + for num_batch in [1, 2]: + for num_channel_data, num_deformable_group in itertools.product([4, 8], [1, 2]): + for input_height, input_width in itertools.product([5, 6], [5, 6]): + for dilate in [(1, 1), (2, 2)]: + for grad_nodes in [['im_data'], ['offset_data']]: + output_height = input_height + output_width = input_width + im_data = np.random.rand(num_batch, num_channel_data, input_height, input_width) + offset_data = \ + np.random.rand(num_batch, num_deformable_group * 3 * 3 * 2, output_height, output_width)\ + * 0.8 + 0.1 + + weight = np.random.normal(0, 0.001, (num_channel_data, num_channel_data, 3, 3)) + bias = np.zeros(num_channel_data) + + im_data_var = mx.symbol.Variable(name="im_data") + offset_data_var = mx.symbol.Variable(name="offset_data") + weight_var = mx.symbol.Variable(name="weight") + bias_var = mx.symbol.Variable(name="bias") + op = mx.contrib.sym.DeformableConvolution(name='test_op', data=im_data_var, + offset=offset_data_var, + weight=weight_var, bias=bias_var, + num_filter=num_channel_data, pad=dilate, + kernel=(3, 3), stride=(1, 1), dilate=dilate, + num_deformable_group=num_deformable_group) + if grad_nodes[0] == 'offset_data': + # wider tolerance needed for coordinate differential + rtol, atol = 1.0, 1e-2 + else: + rtol, atol = 0.05, 1e-4 + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + + +def test_deformable_psroipooling(): + for num_rois in [1, 2]: + for num_classes, num_group in itertools.product([2, 3], [2, 3]): + for image_height, image_width in itertools.product([168, 224], [168, 224]): + for grad_nodes in [['im_data'], ['offset_data']]: + spatial_scale = 0.0625 + feat_height = np.int(image_height * spatial_scale) + feat_width = np.int(image_width * spatial_scale) + im_data = np.random.rand(1, num_classes*num_group*num_group, feat_height, feat_width) + rois_data = np.zeros([num_rois, 5]) + rois_data[:, [1,3]] = np.sort(np.random.rand(num_rois, 2)*(image_width-1)) + rois_data[:, [2,4]] = np.sort(np.random.rand(num_rois, 2)*(image_height-1)) + offset_data = np.random.rand(num_rois, 2*num_classes, num_group, num_group) * 0.1 + + im_data_var = 
mx.symbol.Variable(name="im_data") + rois_data_var = mx.symbol.Variable(name="rois_data") + offset_data_var = mx.symbol.Variable(name="offset_data") + op = mx.contrib.sym.DeformablePSROIPooling(data=im_data_var, rois=rois_data_var, + trans=offset_data_var, spatial_scale=spatial_scale, + sample_per_part=4, group_size=num_group, + pooled_size=num_group, output_dim=num_classes, + trans_std=0.1, no_trans=False, name='test_op') + if grad_nodes[0] == 'offset_data': + # wider tolerance needed for coordinate differential + rtol, atol = 1.0, 1e-2 + else: + rtol, atol = 1e-2, 1e-4 + # By now we only have gpu implementation + if mx.Context.default_ctx.device_type == 'gpu': + check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, + grad_nodes=grad_nodes, ctx=mx.gpu(0)) + + + def test_laop(): # Temporarily disabled until lapack is enabled by default return @@ -3409,7 +3505,7 @@ def test_laop(): if grad_check == 1: check_numeric_gradient(test_sumlogdiag, [a]) - + if __name__ == '__main__': import nose nose.runmodule()
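For reviewers who want to try the patch by hand, the following minimal sketch (not part of the diff) runs the new PSROIPooling contrib operator forward once and checks the output shape implied by its InferShape rule, [num_rois, output_dim, pooled_size, pooled_size]. It assumes an MXNet build that contains these contrib operators and a GPU context, since only the GPU kernels are implemented here; the input names im_data/rois_data and the shapes simply mirror the tests above.

# Minimal forward-only sketch for the new PSROIPooling contrib operator (illustrative, not part of the patch).
# Assumes a GPU build of MXNet with the contrib operators from this change.
import numpy as np
import mxnet as mx

data = mx.symbol.Variable(name='im_data')
rois = mx.symbol.Variable(name='rois_data')
pool = mx.contrib.sym.PSROIPooling(data=data, rois=rois, spatial_scale=0.0625,
                                   group_size=3, pooled_size=3, output_dim=2, name='psroipool')

im_data = np.random.rand(1, 2 * 3 * 3, 14, 14)       # (N, output_dim * group_size^2, H, W)
rois_data = np.array([[0, 10, 22, 161, 173],          # [batch_index, x1, y1, x2, y2]
                      [0, 20, 15, 154, 160]])

exe = pool.simple_bind(ctx=mx.gpu(0), im_data=im_data.shape, rois_data=rois_data.shape)
exe.arg_dict['im_data'][:] = im_data
exe.arg_dict['rois_data'][:] = rois_data
out = exe.forward(is_train=False)[0].asnumpy()
assert out.shape == (2, 2, 3, 3)                      # [num_rois, output_dim, pooled_size, pooled_size]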
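A second sketch, under the same assumptions, shows where the offset shapes used in the tests come from: the offset input of DeformableConvolution carries 2 * kernel_h * kernel_w * num_deformable_group channels and has the spatial size of the convolution output, which is why a (2, 2, 10, 10) input with a 3x3 kernel and no padding pairs with a (2, 18, 8, 8) offset.

# Illustrative shape check for the new DeformableConvolution contrib operator (not part of the patch).
import mxnet as mx

data = mx.symbol.Variable(name='data')
offset = mx.symbol.Variable(name='offset')
conv = mx.contrib.sym.DeformableConvolution(data=data, offset=offset, num_filter=3,
                                            kernel=(3, 3), num_deformable_group=1,
                                            name='deformable_conv')
# offset channels = 2 * 3 * 3 * num_deformable_group = 18; spatial dims match the conv output (8, 8)
arg_shapes, out_shapes, _ = conv.infer_shape(data=(2, 2, 10, 10), offset=(2, 18, 8, 8))
print(out_shapes)   # [(2, 3, 8, 8)]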