diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index db6b9b8c885..38064c446aa 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -50,7 +50,7 @@ jobs: run: pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} - name: Install mmdet dependencies run: | - pip install mmcv + pip install mmcv-full==latest+torch${{matrix.torch}}+cu101 -f https://openmmlab.oss-accelerate.aliyuncs.com/mmcv/dist/index.html pip install -r requirements.txt pip install "git+https://github.com/open-mmlab/cocoapi.git#subdirectory=pycocotools" - name: Lint with flake8 @@ -60,7 +60,7 @@ jobs: - name: Format with yapf run: yapf -r -d mmdet/ tools/ configs/ tests/ - name: Check docstring - run: interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --exclude mmdet/ops --ignore-regex "__repr__" --fail-under 80 mmdet + run: interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 80 mmdet - name: Build and install env: CUDA_ARCH: ${{matrix.cuda_arch}} diff --git a/.isort.cfg b/.isort.cfg index 0fff944ee29..947555524bb 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -3,6 +3,6 @@ line_length = 79 multi_line_output = 0 known_standard_library = setuptools known_first_party = mmdet -known_third_party = PIL,asynctest,cityscapesscripts,cv2,matplotlib,mmcv,numpy,onnx,pycocotools,pytest,robustness_eval,roi_align,roi_pool,seaborn,six,terminaltables,torch,torchvision +known_third_party = PIL,asynctest,cityscapesscripts,cv2,matplotlib,mmcv,numpy,onnx,pycocotools,pytest,robustness_eval,seaborn,six,terminaltables,torch,torchvision no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY diff --git a/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py index 97bdedb55dc..f90b78cef38 100644 --- a/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py +++ b/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -38,7 +38,7 @@ stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ @@ -95,7 +95,7 @@ ], mask_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( @@ -195,6 +195,6 @@ min_bbox_size=0), rcnn=dict( score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5)) diff --git a/configs/_base_/models/cascade_rcnn_r50_fpn.py b/configs/_base_/models/cascade_rcnn_r50_fpn.py index dfe941500cb..303276b845f 100644 --- a/configs/_base_/models/cascade_rcnn_r50_fpn.py +++ b/configs/_base_/models/cascade_rcnn_r50_fpn.py @@ -38,7 +38,7 @@ stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ @@ -178,4 +178,6 @@ nms_thr=0.7, min_bbox_size=0), rcnn=dict( - score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git 
a/configs/_base_/models/fast_rcnn_r50_fpn.py b/configs/_base_/models/fast_rcnn_r50_fpn.py index 69bcc92dae5..b8d9570deea 100644 --- a/configs/_base_/models/fast_rcnn_r50_fpn.py +++ b/configs/_base_/models/fast_rcnn_r50_fpn.py @@ -20,7 +20,7 @@ type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( @@ -57,4 +57,6 @@ debug=False)) test_cfg = dict( rcnn=dict( - score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/configs/_base_/models/faster_rcnn_r50_caffe_c4.py b/configs/_base_/models/faster_rcnn_r50_caffe_c4.py index 2dd990f2b8c..5a381636382 100644 --- a/configs/_base_/models/faster_rcnn_r50_caffe_c4.py +++ b/configs/_base_/models/faster_rcnn_r50_caffe_c4.py @@ -43,7 +43,7 @@ norm_eval=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=1024, featmap_strides=[16]), bbox_head=dict( @@ -111,4 +111,6 @@ nms_thr=0.7, min_bbox_size=0), rcnn=dict( - score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100)) + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) diff --git a/configs/_base_/models/faster_rcnn_r50_fpn.py b/configs/_base_/models/faster_rcnn_r50_fpn.py index 92ed16359e6..338a5c6b604 100644 --- a/configs/_base_/models/faster_rcnn_r50_fpn.py +++ b/configs/_base_/models/faster_rcnn_r50_fpn.py @@ -35,7 +35,7 @@ type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( @@ -103,7 +103,9 @@ nms_thr=0.7, min_bbox_size=0), rcnn=dict( - score_thr=0.05, nms=dict(type='nms', iou_thr=0.5), max_per_img=100) + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) # soft-nms is also supported for rcnn testing - # e.g., nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.05) + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) ) diff --git a/configs/_base_/models/mask_rcnn_r50_caffe_c4.py b/configs/_base_/models/mask_rcnn_r50_caffe_c4.py index c77f292c18d..b9b29b0b99d 100644 --- a/configs/_base_/models/mask_rcnn_r50_caffe_c4.py +++ b/configs/_base_/models/mask_rcnn_r50_caffe_c4.py @@ -43,7 +43,7 @@ norm_eval=True), bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=1024, featmap_strides=[16]), bbox_head=dict( @@ -122,6 +122,6 @@ min_bbox_size=0), rcnn=dict( score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5)) diff --git a/configs/_base_/models/mask_rcnn_r50_fpn.py b/configs/_base_/models/mask_rcnn_r50_fpn.py index 470653bd149..4472bd0a80d 100644 --- a/configs/_base_/models/mask_rcnn_r50_fpn.py +++ b/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -36,7 +36,7 @@ type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + 
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( @@ -55,7 +55,7 @@ loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( @@ -119,6 +119,6 @@ min_bbox_size=0), rcnn=dict( score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5)) diff --git a/configs/_base_/models/retinanet_r50_fpn.py b/configs/_base_/models/retinanet_r50_fpn.py index f51f0863ced..a08b14f6099 100644 --- a/configs/_base_/models/retinanet_r50_fpn.py +++ b/configs/_base_/models/retinanet_r50_fpn.py @@ -56,5 +56,5 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100) diff --git a/configs/_base_/models/ssd300.py b/configs/_base_/models/ssd300.py index fde4df3e6d0..ee7cf3adc8a 100644 --- a/configs/_base_/models/ssd300.py +++ b/configs/_base_/models/ssd300.py @@ -43,7 +43,7 @@ neg_pos_ratio=3, debug=False) test_cfg = dict( - nms=dict(type='nms', iou_thr=0.45), + nms=dict(type='nms', iou_threshold=0.45), min_bbox_size=0, score_thr=0.02, max_per_img=200) diff --git a/configs/atss/atss_r50_fpn_1x_coco.py b/configs/atss/atss_r50_fpn_1x_coco.py index f359f0bb9b4..e787622c24b 100644 --- a/configs/atss/atss_r50_fpn_1x_coco.py +++ b/configs/atss/atss_r50_fpn_1x_coco.py @@ -56,7 +56,7 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.6), + nms=dict(type='nms', iou_threshold=0.6), max_per_img=100) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py index 84651e7e566..081b998f6f5 100644 --- a/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/cascade_mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../cascade_rcnn/cascade_mask_rcnn_r101_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py index a25627b88e1..3b3683af235 100644 --- a/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/cascade_mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py index 2a9a87e77c2..daaa4729c82 100644 --- a/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/cascade_mask_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../cascade_rcnn/cascade_mask_rcnn_x101_32x4d_fpn_1x_coco.py' model = dict( backbone=dict( - 
dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py index 0c1ebd7bbd0..a01df33c94e 100644 --- a/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/cascade_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../cascade_rcnn/cascade_rcnn_r101_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py index e24c9dcc920..aa664bd61c7 100644 --- a/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/cascade_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py index 6ad09e9bc38..f5fee7e13cd 100644 --- a/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/faster_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../faster_rcnn/faster_rcnn_r101_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py index 44259a4a04e..8787088f27a 100644 --- a/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/faster_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py b/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py index 40396b91c89..1b695f0e190 100644 --- a/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py +++ b/configs/dcn/faster_rcnn_r50_fpn_dpool_1x_coco.py @@ -5,11 +5,8 @@ type='SingleRoIExtractor', roi_layer=dict( _delete_=True, - type='DeformRoIPoolingPack', - out_size=7, - out_channels=256, - no_trans=False, - group_size=1, - trans_std=0.1), + type='DeformRoIPoolPack', + output_size=7, + output_channels=256), out_channels=256, featmap_strides=[4, 8, 16, 32]))) diff --git a/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py b/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py index 0452b7219a9..d1bcf3c102f 100644 --- a/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py +++ b/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCNv2', deform_groups=1, 
fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py b/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py index a18a5abe115..d0ab89c261f 100644 --- a/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py +++ b/configs/dcn/faster_rcnn_r50_fpn_mdconv_c3-c5_group4_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCNv2', deformable_groups=4, fallback_on_stride=False), + dcn=dict(type='DCNv2', deform_groups=4, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco.py b/configs/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco.py index cfeb6d92a2f..ad7b0346a63 100644 --- a/configs/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco.py +++ b/configs/dcn/faster_rcnn_r50_fpn_mdpool_1x_coco.py @@ -5,11 +5,8 @@ type='SingleRoIExtractor', roi_layer=dict( _delete_=True, - type='ModulatedDeformRoIPoolingPack', - out_size=7, - out_channels=256, - no_trans=False, - group_size=1, - trans_std=0.1), + type='ModulatedDeformRoIPoolPack', + output_size=7, + output_channels=256), out_channels=256, featmap_strides=[4, 8, 16, 32]))) diff --git a/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py index 936c034ff9f..8357766f50f 100644 --- a/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/faster_rcnn_x101_32x4d_fpn_dconv_c3-c5_1x_coco.py @@ -11,5 +11,5 @@ frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), style='pytorch', - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py index 16e19212966..cb340022ea2 100644 --- a/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/mask_rcnn_r101_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py b/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py index f942b5cde3d..ababe58dc3f 100644 --- a/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py +++ b/configs/dcn/mask_rcnn_r50_fpn_dconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py b/configs/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py index 7c8cc1e2109..5ca2a67cde6 100644 --- a/configs/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py +++ b/configs/dcn/mask_rcnn_r50_fpn_mdconv_c3-c5_1x_coco.py @@ -1,5 +1,5 @@ _base_ = '../mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py' model = dict( backbone=dict( - dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git 
a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py index a2a1e2a85c9..bbefd27aa02 100644 --- a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py +++ b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_0010_dcn_1x_coco.py @@ -12,5 +12,5 @@ stages=(False, False, True, True), position='after_conv2') ], - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py index 04ec96e7c12..b1f26c081da 100644 --- a/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py +++ b/configs/empirical_attention/faster_rcnn_r50_fpn_attention_1111_dcn_1x_coco.py @@ -12,5 +12,5 @@ stages=(False, False, True, True), position='after_conv2') ], - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py b/configs/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py index 05f61483e4b..8ba6b017ff6 100644 --- a/configs/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py +++ b/configs/faster_rcnn/faster_rcnn_r50_fpn_soft_nms_1x_coco.py @@ -7,5 +7,5 @@ test_cfg = dict( rcnn=dict( score_thr=0.05, - nms=dict(type='soft_nms', iou_thr=0.5), + nms=dict(type='soft_nms', iou_threshold=0.5), max_per_img=100)) diff --git a/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_4x4_1x_coco.py index 2a764e13dd1..d83fa17f173 100644 --- a/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_4x4_1x_coco.py +++ b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_4x4_1x_coco.py @@ -10,7 +10,7 @@ conv_bias=True, loss_bbox=dict(type='GIoULoss', loss_weight=1.0))) # training and testing settings -test_cfg = dict(nms=dict(type='nms', iou_thr=0.6)) +test_cfg = dict(nms=dict(type='nms', iou_threshold=0.6)) # dataset settings img_norm_cfg = dict( diff --git a/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_4x4_1x_coco.py b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_4x4_1x_coco.py index 31a89a8cc5c..67edb415c5f 100644 --- a/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_4x4_1x_coco.py +++ b/configs/fcos/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_dcn_4x4_1x_coco.py @@ -3,7 +3,7 @@ model = dict( pretrained='open-mmlab://detectron2/resnet50_caffe', backbone=dict( - dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True)), bbox_head=dict( norm_on_bbox=True, @@ -13,7 +13,7 @@ conv_bias=True, loss_bbox=dict(type='GIoULoss', loss_weight=1.0))) # training and testing settings -test_cfg = dict(nms=dict(type='nms', iou_thr=0.6)) +test_cfg = dict(nms=dict(type='nms', iou_threshold=0.6)) # dataset settings img_norm_cfg = dict( diff --git a/configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py b/configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py index 
b4a826fed36..4697e9e7efc 100644 --- a/configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py +++ b/configs/fcos/fcos_r50_caffe_fpn_4x4_1x_coco.py @@ -56,7 +56,7 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100) img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) diff --git a/configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py index 9c17c171e70..b0bcad9e101 100644 --- a/configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py +++ b/configs/fcos/fcos_r50_caffe_fpn_gn-head_4x4_1x_coco.py @@ -55,7 +55,7 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100) img_norm_cfg = dict( mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) diff --git a/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py b/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py index 9bafedd3d91..4b62c81212e 100644 --- a/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py +++ b/configs/foveabox/fovea_r50_fpn_4x4_1x_coco.py @@ -45,7 +45,7 @@ test_cfg = dict( nms_pre=1000, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100) data = dict(samples_per_gpu=4, workers_per_gpu=4) # optimizer diff --git a/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py b/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py index 235ea248506..eab622b2e8b 100644 --- a/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py +++ b/configs/gfl/gfl_r101_fpn_dconv_c3-c5_mstrain_2x_coco.py @@ -8,7 +8,7 @@ out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True), norm_eval=True, style='pytorch')) diff --git a/configs/gfl/gfl_r50_fpn_1x_coco.py b/configs/gfl/gfl_r50_fpn_1x_coco.py index 99ac7e43ef6..77a15ebce37 100644 --- a/configs/gfl/gfl_r50_fpn_1x_coco.py +++ b/configs/gfl/gfl_r50_fpn_1x_coco.py @@ -51,7 +51,7 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.6), + nms=dict(type='nms', iou_threshold=0.6), max_per_img=100) # optimizer optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) diff --git a/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py b/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py index 78948b65a13..a2370e234df 100644 --- a/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py +++ b/configs/gfl/gfl_x101_32x4d_fpn_dconv_c4-c5_mstrain_2x_coco.py @@ -11,7 +11,7 @@ out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, False, True, True), norm_eval=True, style='pytorch')) diff --git a/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py b/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py index c4bc3859dd3..1b40e039c1e 100644 --- a/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py +++ b/configs/grid_rcnn/grid_rcnn_r50_fpn_gn-head_2x_coco.py @@ -39,7 +39,7 @@ type='GridRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + 
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( @@ -56,7 +56,7 @@ reg_class_agnostic=False), grid_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), grid_head=dict( @@ -119,7 +119,9 @@ nms_thr=0.7, min_bbox_size=0), rcnn=dict( - score_thr=0.03, nms=dict(type='nms', iou_thr=0.3), max_per_img=100)) + score_thr=0.03, + nms=dict(type='nms', iou_threshold=0.3), + max_per_img=100)) # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) diff --git a/configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py b/configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py index 74c01e561b3..0fc528bfd49 100644 --- a/configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py +++ b/configs/groie/faster_rcnn_r50_fpn_groie_1x_coco.py @@ -5,7 +5,7 @@ bbox_roi_extractor=dict( type='GenericRoIExtractor', aggregation='sum', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( diff --git a/configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py b/configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py index afc9ca9de50..8e4b4ab2351 100644 --- a/configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py +++ b/configs/groie/grid_rcnn_r50_fpn_gn-head_groie_1x_coco.py @@ -5,7 +5,7 @@ bbox_roi_extractor=dict( type='GenericRoIExtractor', aggregation='sum', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( @@ -25,7 +25,7 @@ kv_stride=2)), grid_roi_extractor=dict( type='GenericRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( diff --git a/configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py b/configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py index d59e85f3b74..8b83722197c 100644 --- a/configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py +++ b/configs/groie/mask_rcnn_r101_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py @@ -5,7 +5,7 @@ bbox_roi_extractor=dict( type='GenericRoIExtractor', aggregation='sum', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( @@ -25,7 +25,7 @@ kv_stride=2)), mask_roi_extractor=dict( type='GenericRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( diff --git a/configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py b/configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py index 462fd5711dc..81dfb4873bd 100644 --- a/configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py +++ b/configs/groie/mask_rcnn_r50_fpn_groie_1x_coco.py @@ -5,7 +5,7 @@ bbox_roi_extractor=dict( type='GenericRoIExtractor', aggregation='sum', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + 
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( @@ -25,7 +25,7 @@ kv_stride=2)), mask_roi_extractor=dict( type='GenericRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( diff --git a/configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py b/configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py index d16d83493ef..852c5ca7c5c 100644 --- a/configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py +++ b/configs/groie/mask_rcnn_r50_fpn_syncbn-backbone_r4_gcb_c3-c5_groie_1x_coco.py @@ -5,7 +5,7 @@ bbox_roi_extractor=dict( type='GenericRoIExtractor', aggregation='sum', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( @@ -25,7 +25,7 @@ kv_stride=2)), mask_roi_extractor=dict( type='GenericRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( diff --git a/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py b/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py index 987f9a0f2f0..f6c487bf18f 100644 --- a/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py +++ b/configs/guided_anchoring/ga_retinanet_r101_caffe_fpn_mstrain_2x.py @@ -87,7 +87,7 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100) # dataset settings dataset_type = 'CocoDataset' diff --git a/configs/htc/htc_r50_fpn_1x_coco.py b/configs/htc/htc_r50_fpn_1x_coco.py index 6e1880fb9c4..929cf464f60 100644 --- a/configs/htc/htc_r50_fpn_1x_coco.py +++ b/configs/htc/htc_r50_fpn_1x_coco.py @@ -3,7 +3,7 @@ roi_head=dict( semantic_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[8]), semantic_head=dict( diff --git a/configs/htc/htc_without_semantic_r50_fpn_1x_coco.py b/configs/htc/htc_without_semantic_r50_fpn_1x_coco.py index a7bc763cd44..81ed3a8a03a 100644 --- a/configs/htc/htc_without_semantic_r50_fpn_1x_coco.py +++ b/configs/htc/htc_without_semantic_r50_fpn_1x_coco.py @@ -44,7 +44,7 @@ stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ @@ -101,7 +101,7 @@ ], mask_roi_extractor=dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=[ @@ -216,7 +216,7 @@ min_bbox_size=0), rcnn=dict( score_thr=0.001, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5)) img_norm_cfg = dict( diff --git a/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py 
b/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py index 31415eee747..4a98ff28588 100644 --- a/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py +++ b/configs/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco.py @@ -12,7 +12,7 @@ norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch', - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) # dataset settings img_norm_cfg = dict( diff --git a/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py b/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py index 3c4234cb2bc..5899444adf0 100644 --- a/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py +++ b/configs/legacy_1.x/cascade_mask_rcnn_r50_fpn_1x_coco_v1.py @@ -30,7 +30,10 @@ bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( - type='RoIAlign', out_size=7, sample_num=2, aligned=False)), + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), bbox_head=[ dict( type='Shared2FCBBoxHead', @@ -69,5 +72,8 @@ mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( - type='RoIAlign', out_size=14, sample_num=2, aligned=False)))) + type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)))) dist_params = dict(backend='nccl', port=29515) diff --git a/configs/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py b/configs/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py index 482e1112838..1cb833cfbcd 100644 --- a/configs/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py +++ b/configs/legacy_1.x/faster_rcnn_r50_fpn_1x_coco_v1.py @@ -22,7 +22,10 @@ bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( - type='RoIAlign', out_size=7, sample_num=2, aligned=False), + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( diff --git a/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py b/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py index 5f519002844..0b200610191 100644 --- a/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py +++ b/configs/legacy_1.x/mask_rcnn_r50_fpn_1x_coco_v1.py @@ -13,11 +13,17 @@ bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( - type='RoIAlign', out_size=7, sample_num=2, aligned=False)), + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict( - type='RoIAlign', out_size=14, sample_num=2, aligned=False)), + type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)), bbox_head=dict( bbox_coder=dict(type='LegacyDeltaXYWHBBoxCoder'), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))) diff --git a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py index d0aefcee236..431e5ab3367 100644 --- a/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py +++ b/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py @@ -7,12 +7,18 @@ roi_head=dict( bbox_roi_extractor=dict( roi_layer=dict( - type='RoIAlign', out_size=7, sample_num=2, aligned=False)), + type='RoIAlign', + output_size=7, + sampling_ratio=2, + aligned=False)), bbox_head=dict( loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), mask_roi_extractor=dict( roi_layer=dict( - type='RoIAlign', out_size=14, sample_num=2, aligned=False)))) + 
type='RoIAlign', + output_size=14, + sampling_ratio=2, + aligned=False)))) # use caffe img_norm img_norm_cfg = dict( mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) diff --git a/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py index 0938fed7d3c..76dde57d8a4 100644 --- a/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py +++ b/configs/nas_fcos/nas_fcos_fcoshead_r50_caffe_fpn_gn-head_4x4_1x_coco.py @@ -22,7 +22,7 @@ add_extra_convs=True, num_outs=5, norm_cfg=dict(type='BN'), - conv_cfg=dict(type='DCNv2', deformable_groups=2)), + conv_cfg=dict(type='DCNv2', deform_groups=2)), bbox_head=dict( type='FCOSHead', num_classes=80, @@ -55,7 +55,7 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.6), + nms=dict(type='nms', iou_threshold=0.6), max_per_img=100) img_norm_cfg = dict( diff --git a/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py b/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py index 6fc61b060c6..a22f8f1998c 100644 --- a/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py +++ b/configs/nas_fcos/nas_fcos_nashead_r50_caffe_fpn_gn-head_4x4_1x_coco.py @@ -22,7 +22,7 @@ add_extra_convs=True, num_outs=5, norm_cfg=dict(type='BN'), - conv_cfg=dict(type='DCNv2', deformable_groups=2)), + conv_cfg=dict(type='DCNv2', deform_groups=2)), bbox_head=dict( type='NASFCOSHead', num_classes=80, @@ -54,7 +54,7 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.6), + nms=dict(type='nms', iou_threshold=0.6), max_per_img=100) img_norm_cfg = dict( diff --git a/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py b/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py index a61f40b7751..dc7f97554b2 100644 --- a/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py +++ b/configs/point_rend/point_rend_r50_caffe_fpn_mstrain_1x_coco.py @@ -7,7 +7,8 @@ mask_roi_extractor=dict( type='GenericRoIExtractor', aggregation='concat', - roi_layer=dict(_delete_=True, type='SimpleRoIAlign', out_size=14), + roi_layer=dict( + _delete_=True, type='SimpleRoIAlign', output_size=14), out_channels=256, featmap_strides=[4]), mask_head=dict( diff --git a/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py index 39dd6f48f64..dd5153e6ef0 100644 --- a/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py +++ b/configs/regnet/mask_rcnn_regnetx-3.2GF_fpn_mdconv_c3-c5_1x_coco.py @@ -2,5 +2,5 @@ model = dict( pretrained='open-mmlab://regnetx_3.2gf', backbone=dict( - dcn=dict(type='DCNv2', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py b/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py index 41be70e23c1..241754cfb45 100644 --- a/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py +++ b/configs/reppoints/reppoints_moment_r101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py @@ -3,5 +3,5 @@ pretrained='torchvision://resnet101', backbone=dict( depth=101, - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, 
True, True, True))) diff --git a/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py b/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py index a1a0c23b1e2..6d1c89b2082 100644 --- a/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py +++ b/configs/reppoints/reppoints_moment_r50_fpn_1x_coco.py @@ -62,6 +62,6 @@ nms_pre=1000, min_bbox_size=0, score_thr=0.05, - nms=dict(type='nms', iou_thr=0.5), + nms=dict(type='nms', iou_threshold=0.5), max_per_img=100) optimizer = dict(lr=0.01) diff --git a/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py b/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py index 34d577e6040..c33019da0cc 100644 --- a/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py +++ b/configs/reppoints/reppoints_moment_x101_fpn_dconv_c3-c5_gn-neck+head_2x_coco.py @@ -11,5 +11,5 @@ frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), style='pytorch', - dcn=dict(type='DCN', deformable_groups=1, fallback_on_stride=False), + dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), stage_with_dcn=(False, True, True, True))) diff --git a/docs/config.md b/docs/config.md index d4dd1ab5e76..a4d48935563 100644 --- a/docs/config.md +++ b/docs/config.md @@ -96,8 +96,8 @@ model = dict( type='SingleRoIExtractor', # Type of the RoI feature extractor, most of methods uses SingleRoIExtractor. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/roi_heads/roi_extractors/single_level.py#L10 for details. roi_layer=dict( # Config of RoI Layer type='RoIAlign', # Type of RoI Layer, DeformRoIPoolingPack and ModulatedDeformRoIPoolingPack are also supported. Refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/roi_align/roi_align.py#L79 for details. - out_size=7, # The output size of feature maps. - sample_num=0), # Sampling ratio when extracting the RoI features. 0 means adaptive ratio. + output_size=7, # The output size of feature maps. + sampling_ratio=0), # Sampling ratio when extracting the RoI features. 0 means adaptive ratio. out_channels=256, # output channels of the extracted feature. featmap_strides=[4, 8, 16, 32]), # Strides of multi-scale feature maps. It should be consistent to the architecture of the backbone. bbox_head=dict( # Config of box head in the RoIHead. @@ -122,8 +122,8 @@ model = dict( type='SingleRoIExtractor', # Type of the RoI feature extractor, most of methods uses SingleRoIExtractor. roi_layer=dict( # Config of RoI Layer that extracts features for instance segmentation type='RoIAlign', # Type of RoI Layer, DeformRoIPoolingPack and ModulatedDeformRoIPoolingPack are also supported - out_size=14, # The output size of feature maps. - sample_num=0), # Sampling ratio when extracting the RoI features. + output_size=14, # The output size of feature maps. + sampling_ratio=0), # Sampling ratio when extracting the RoI features. out_channels=256, # Output channels of the extracted feature. featmap_strides=[4, 8, 16, 32]), # Strides of multi-scale feature maps. mask_head=dict( # Mask prediction head diff --git a/docs/install.md b/docs/install.md index ebe8cef9763..1681a9ea48d 100644 --- a/docs/install.md +++ b/docs/install.md @@ -45,14 +45,24 @@ conda install pytorch=1.3.1 cudatoolkit=9.2 torchvision=0.4.2 -c pytorch If you build PyTorch from source instead of installing the prebuilt pacakge, you can use more CUDA versions such as 9.0. -c. Clone the mmdetection repository. +c. 
Install mmcv, you can [install](https://github.com/open-mmlab/mmcv#install-with-pip) the pre-build mmcv. +Or you can choose either to compile mmcv from source by the following command + +``` +git clone https://github.com/open-mmlab/mmcv.git +cd mmcv +pip install -e . +cd .. +``` + +d. Clone the mmdetection repository. ```shell git clone https://github.com/open-mmlab/mmdetection.git cd mmdetection ``` -d. Install build requirements and then install mmdetection. +e. Install build requirements and then install mmdetection. (We install our forked version of pycocotools via the github repo instead of pypi for better compatibility with our repo.) diff --git a/mmdet/VERSION b/mmdet/VERSION index c043eea7767..bce4c60e36c 100644 --- a/mmdet/VERSION +++ b/mmdet/VERSION @@ -1 +1 @@ -2.2.1 +2.3.0rc0 diff --git a/mmdet/apis/inference.py b/mmdet/apis/inference.py index 400e487f146..bfc74a19423 100644 --- a/mmdet/apis/inference.py +++ b/mmdet/apis/inference.py @@ -3,13 +3,13 @@ import matplotlib.pyplot as plt import mmcv import torch +from mmcv.ops import RoIAlign, RoIPool from mmcv.parallel import collate, scatter from mmcv.runner import load_checkpoint from mmdet.core import get_classes from mmdet.datasets.pipelines import Compose from mmdet.models import build_detector -from mmdet.ops import RoIAlign, RoIPool def init_detector(config, checkpoint=None, device='cuda:0'): diff --git a/mmdet/apis/test.py b/mmdet/apis/test.py index 62997201fa5..a73c54f57e1 100644 --- a/mmdet/apis/test.py +++ b/mmdet/apis/test.py @@ -100,7 +100,9 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): results.append(result) if rank == 0: - batch_size = len(data['img_metas'][0].data) + batch_size = ( + len(data['img_meta'].data) + if 'img_meta' in data else len(data['img_metas'][0].data)) for _ in range(batch_size * world_size): prog_bar.update() diff --git a/mmdet/core/bbox/samplers/score_hlr_sampler.py b/mmdet/core/bbox/samplers/score_hlr_sampler.py index 8885df985b3..3089451acf5 100644 --- a/mmdet/core/bbox/samplers/score_hlr_sampler.py +++ b/mmdet/core/bbox/samplers/score_hlr_sampler.py @@ -1,6 +1,6 @@ import torch +from mmcv.ops import nms_match -from mmdet.ops import nms_match from ..builder import BBOX_SAMPLERS from ..transforms import bbox2roi from .base_sampler import BaseSampler diff --git a/mmdet/core/mask/structures.py b/mmdet/core/mask/structures.py index 095c7aa69a5..b7888c39bbc 100644 --- a/mmdet/core/mask/structures.py +++ b/mmdet/core/mask/structures.py @@ -4,8 +4,7 @@ import numpy as np import pycocotools.mask as maskUtils import torch - -from mmdet.ops.roi_align import roi_align +from mmcv.ops.roi_align import roi_align class BaseInstanceMasks(metaclass=ABCMeta): @@ -280,7 +279,7 @@ def crop_and_resize(self, gt_masks_th = torch.from_numpy(self.masks).to(device).index_select( 0, inds).to(dtype=rois.dtype) targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape, - 1.0, 0, True).squeeze(1) + 1.0, 0, 'avg', True).squeeze(1) resized_masks = (targets >= 0.5).cpu().numpy() else: resized_masks = [] diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py index a49e430e72f..b583dbffa82 100644 --- a/mmdet/core/post_processing/bbox_nms.py +++ b/mmdet/core/post_processing/bbox_nms.py @@ -1,6 +1,5 @@ import torch - -from mmdet.ops.nms import batched_nms +from mmcv.ops.nms import batched_nms def multiclass_nms(multi_bboxes, diff --git a/mmdet/core/post_processing/merge_augs.py b/mmdet/core/post_processing/merge_augs.py index b4e8d62732a..ed203d32daa 100644 
--- a/mmdet/core/post_processing/merge_augs.py +++ b/mmdet/core/post_processing/merge_augs.py @@ -1,7 +1,7 @@ import numpy as np import torch +from mmcv.ops import nms -from mmdet.ops import nms from ..bbox import bbox_mapping_back @@ -36,7 +36,9 @@ def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg): flip_direction) recovered_proposals.append(_proposals) aug_proposals = torch.cat(recovered_proposals, dim=0) - merged_proposals, _ = nms(aug_proposals, rpn_test_cfg.nms_thr) + merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(), + aug_proposals[:, -1].contiguous(), + rpn_test_cfg.nms_thr) scores = merged_proposals[:, 4] _, order = scores.sort(0, descending=True) num = min(rpn_test_cfg.max_num, merged_proposals.shape[0]) diff --git a/mmdet/models/backbones/resnet.py b/mmdet/models/backbones/resnet.py index 53c0e966c86..5f9dee7bbc2 100644 --- a/mmdet/models/backbones/resnet.py +++ b/mmdet/models/backbones/resnet.py @@ -1,11 +1,10 @@ import torch.nn as nn import torch.utils.checkpoint as cp -from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, - kaiming_init) +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, + constant_init, kaiming_init) from mmcv.runner import load_checkpoint from torch.nn.modules.batchnorm import _BatchNorm -from mmdet.ops import build_plugin_layer from mmdet.utils import get_root_logger from ..builder import BACKBONES from ..utils import ResLayer diff --git a/mmdet/models/dense_heads/fovea_head.py b/mmdet/models/dense_heads/fovea_head.py index 37047c7dfe0..afe70515410 100644 --- a/mmdet/models/dense_heads/fovea_head.py +++ b/mmdet/models/dense_heads/fovea_head.py @@ -1,9 +1,9 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule, normal_init +from mmcv.ops import DeformConv2d from mmdet.core import multi_apply, multiclass_nms -from mmdet.ops import DeformConv from ..builder import HEADS from .anchor_free_head import AnchorFreeHead @@ -16,17 +16,17 @@ def __init__(self, in_channels, out_channels, kernel_size=3, - deformable_groups=4): + deform_groups=4): super(FeatureAlign, self).__init__() offset_channels = kernel_size * kernel_size * 2 self.conv_offset = nn.Conv2d( - 4, deformable_groups * offset_channels, 1, bias=False) - self.conv_adaption = DeformConv( + 4, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( in_channels, out_channels, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, - deformable_groups=deformable_groups) + deform_groups=deform_groups) self.relu = nn.ReLU(inplace=True) def init_weights(self): @@ -53,13 +53,13 @@ def __init__(self, 512)), sigma=0.4, with_deform=False, - deformable_groups=4, + deform_groups=4, **kwargs): self.base_edge_list = base_edge_list self.scale_ranges = scale_ranges self.sigma = sigma self.with_deform = with_deform - self.deformable_groups = deformable_groups + self.deform_groups = deform_groups super().__init__(num_classes, in_channels, **kwargs) def _init_layers(self): @@ -95,7 +95,7 @@ def _init_layers(self): self.feat_channels, self.feat_channels, kernel_size=3, - deformable_groups=self.deformable_groups) + deform_groups=self.deform_groups) self.conv_cls = nn.Conv2d( int(self.feat_channels * 4), self.cls_out_channels, diff --git a/mmdet/models/dense_heads/ga_retina_head.py b/mmdet/models/dense_heads/ga_retina_head.py index e85c9ac474d..8822d1ca78e 100644 --- a/mmdet/models/dense_heads/ga_retina_head.py +++ b/mmdet/models/dense_heads/ga_retina_head.py @@ -1,7 +1,7 @@ import torch.nn as nn from mmcv.cnn 
import ConvModule, bias_init_with_prob, normal_init +from mmcv.ops import MaskedConv2d -from mmdet.ops import MaskedConv2d from ..builder import HEADS from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead @@ -55,12 +55,12 @@ def _init_layers(self): self.feat_channels, self.feat_channels, kernel_size=3, - deformable_groups=self.deformable_groups) + deform_groups=self.deform_groups) self.feature_adaption_reg = FeatureAdaption( self.feat_channels, self.feat_channels, kernel_size=3, - deformable_groups=self.deformable_groups) + deform_groups=self.deform_groups) self.retina_cls = MaskedConv2d( self.feat_channels, self.num_anchors * self.cls_out_channels, diff --git a/mmdet/models/dense_heads/ga_rpn_head.py b/mmdet/models/dense_heads/ga_rpn_head.py index 4e3a262c09d..d7a9d824803 100644 --- a/mmdet/models/dense_heads/ga_rpn_head.py +++ b/mmdet/models/dense_heads/ga_rpn_head.py @@ -2,8 +2,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import normal_init +from mmcv.ops import nms -from mmdet.ops import nms from ..builder import HEADS from .guided_anchor_head import GuidedAnchorHead from .rpn_test_mixin import RPNTestMixin @@ -117,15 +117,14 @@ def _get_bboxes_single(self, as_tuple=False).squeeze() proposals = proposals[valid_inds, :] scores = scores[valid_inds] - proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1) # NMS in current level - proposals, _ = nms(proposals, cfg.nms_thr) + proposals, _ = nms(proposals, scores, cfg.nms_thr) proposals = proposals[:cfg.nms_post, :] mlvl_proposals.append(proposals) proposals = torch.cat(mlvl_proposals, 0) if cfg.nms_across_levels: # NMS across multi levels - proposals, _ = nms(proposals, cfg.nms_thr) + proposals, _ = nms(proposals[:, :4], proposals[:, -1], cfg.nms_thr) proposals = proposals[:cfg.max_num, :] else: scores = proposals[:, 4] diff --git a/mmdet/models/dense_heads/guided_anchor_head.py b/mmdet/models/dense_heads/guided_anchor_head.py index 63689735fdb..4be75830ef5 100644 --- a/mmdet/models/dense_heads/guided_anchor_head.py +++ b/mmdet/models/dense_heads/guided_anchor_head.py @@ -1,12 +1,12 @@ import torch import torch.nn as nn from mmcv.cnn import bias_init_with_prob, normal_init +from mmcv.ops import DeformConv2d, MaskedConv2d from mmdet.core import (anchor_inside_flags, build_anchor_generator, build_assigner, build_bbox_coder, build_sampler, calc_region, force_fp32, images_to_levels, multi_apply, multiclass_nms, unmap) -from mmdet.ops import DeformConv, MaskedConv2d from ..builder import HEADS, build_loss from .anchor_head import AnchorHead @@ -16,30 +16,30 @@ class FeatureAdaption(nn.Module): Feature Adaption Module is implemented based on DCN v1. It uses anchor shape prediction rather than feature map to - predict offsets of deformable conv layer. + predict offsets of deform conv layer. Args: in_channels (int): Number of channels in the input feature map. out_channels (int): Number of channels in the output feature map. kernel_size (int): Deformable conv kernel size. - deformable_groups (int): Deformable conv group size. + deform_groups (int): Deformable conv group size. 
""" def __init__(self, in_channels, out_channels, kernel_size=3, - deformable_groups=4): + deform_groups=4): super(FeatureAdaption, self).__init__() offset_channels = kernel_size * kernel_size * 2 self.conv_offset = nn.Conv2d( - 2, deformable_groups * offset_channels, 1, bias=False) - self.conv_adaption = DeformConv( + 2, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( in_channels, out_channels, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, - deformable_groups=deformable_groups) + deform_groups=deform_groups) self.relu = nn.ReLU(inplace=True) def init_weights(self): @@ -74,7 +74,7 @@ class GuidedAnchorHead(AnchorHead): square_anchor_generator (dict): Config dict for square generator anchor_coder (dict): Config dict for anchor coder bbox_coder (dict): Config dict for bbox coder - deformable_groups: (int): Group number of DCN in + deform_groups: (int): Group number of DCN in FeatureAdaption module. loc_filter_thr (float): Threshold to filter out unconcerned regions. background_label (int | None): Label ID of background, set as 0 for @@ -113,7 +113,7 @@ def __init__( target_stds=[1.0, 1.0, 1.0, 1.0] ), reg_decoded_bbox=False, - deformable_groups=4, + deform_groups=4, loc_filter_thr=0.01, background_label=None, train_cfg=None, @@ -133,7 +133,7 @@ def __init__( self.in_channels = in_channels self.num_classes = num_classes self.feat_channels = feat_channels - self.deformable_groups = deformable_groups + self.deform_groups = deform_groups self.loc_filter_thr = loc_filter_thr # build approx_anchor_generator and square_anchor_generator @@ -209,7 +209,7 @@ def _init_layers(self): self.in_channels, self.feat_channels, kernel_size=3, - deformable_groups=self.deformable_groups) + deform_groups=self.deform_groups) self.conv_cls = MaskedConv2d(self.feat_channels, self.num_anchors * self.cls_out_channels, 1) @@ -636,8 +636,8 @@ def loss_loc_single(self, loc_pred, loc_target, loc_weight, loc_avg_factor): loss_loc = self.loss_loc( loc_pred.reshape(-1, 1), - loc_target.reshape(-1, 1).long(), - loc_weight.reshape(-1, 1), + loc_target.reshape(-1).long(), + loc_weight.reshape(-1), avg_factor=loc_avg_factor) return loss_loc diff --git a/mmdet/models/dense_heads/nasfcos_head.py b/mmdet/models/dense_heads/nasfcos_head.py index 472ec73d3c4..994ce0455e1 100644 --- a/mmdet/models/dense_heads/nasfcos_head.py +++ b/mmdet/models/dense_heads/nasfcos_head.py @@ -23,7 +23,7 @@ def _init_layers(self): type='DCNv2', kernel_size=3, use_bias=True, - deformable_groups=2, + deform_groups=2, padding=1) conv3x3_config = dict(type='Conv', kernel_size=3, padding=1) conv1x1_config = dict(type='Conv', kernel_size=1) diff --git a/mmdet/models/dense_heads/reppoints_head.py b/mmdet/models/dense_heads/reppoints_head.py index 72c20c8a875..447b648c96c 100644 --- a/mmdet/models/dense_heads/reppoints_head.py +++ b/mmdet/models/dense_heads/reppoints_head.py @@ -2,10 +2,10 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init +from mmcv.ops import DeformConv2d from mmdet.core import (PointGenerator, build_assigner, build_sampler, images_to_levels, multi_apply, multiclass_nms, unmap) -from mmdet.ops import DeformConv from ..builder import HEADS, build_loss from .anchor_free_head import AnchorFreeHead @@ -57,7 +57,7 @@ def __init__(self, self.use_grid_points = use_grid_points self.center_init = center_init - # we use deformable conv to extract points features + # we use deform conv to extract points features self.dcn_kernel = int(np.sqrt(num_points)) 
self.dcn_pad = int((self.dcn_kernel - 1) / 2) assert self.dcn_kernel * self.dcn_kernel == num_points, \ @@ -130,9 +130,10 @@ def _init_layers(self): conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg)) pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points - self.reppoints_cls_conv = DeformConv(self.feat_channels, - self.point_feat_channels, - self.dcn_kernel, 1, self.dcn_pad) + self.reppoints_cls_conv = DeformConv2d(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, + self.dcn_pad) self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels, self.cls_out_channels, 1, 1, 0) self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels, @@ -140,10 +141,10 @@ def _init_layers(self): 1, 1) self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels, pts_out_dim, 1, 1, 0) - self.reppoints_pts_refine_conv = DeformConv(self.feat_channels, - self.point_feat_channels, - self.dcn_kernel, 1, - self.dcn_pad) + self.reppoints_pts_refine_conv = DeformConv2d(self.feat_channels, + self.point_feat_channels, + self.dcn_kernel, 1, + self.dcn_pad) self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels, pts_out_dim, 1, 1, 0) diff --git a/mmdet/models/dense_heads/rpn_head.py b/mmdet/models/dense_heads/rpn_head.py index d118b743f5c..84a8cb10d77 100644 --- a/mmdet/models/dense_heads/rpn_head.py +++ b/mmdet/models/dense_heads/rpn_head.py @@ -2,8 +2,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import normal_init +from mmcv.ops import batched_nms -from mmdet.ops import batched_nms from ..builder import HEADS from .anchor_head import AnchorHead from .rpn_test_mixin import RPNTestMixin @@ -163,6 +163,6 @@ def _get_bboxes_single(self, ids = ids[valid_inds] # TODO: remove the hard coded nms type - nms_cfg = dict(type='nms', iou_thr=cfg.nms_thr) + nms_cfg = dict(type='nms', iou_threshold=cfg.nms_thr) dets, keep = batched_nms(proposals, scores, ids, nms_cfg) return dets[:cfg.nms_post] diff --git a/mmdet/models/losses/focal_loss.py b/mmdet/models/losses/focal_loss.py index 3f42102e950..7f657072619 100644 --- a/mmdet/models/losses/focal_loss.py +++ b/mmdet/models/losses/focal_loss.py @@ -1,7 +1,7 @@ import torch.nn as nn import torch.nn.functional as F +from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss -from mmdet.ops import sigmoid_focal_loss as _sigmoid_focal_loss from ..builder import LOSSES from .utils import weight_reduce_loss @@ -67,7 +67,7 @@ def sigmoid_focal_loss(pred, """ # Function.apply does not accept keyword arguments, so the decorator # "weighted_loss" is not applicable - loss = _sigmoid_focal_loss(pred, target, gamma, alpha) + loss = _sigmoid_focal_loss(pred, target, gamma, alpha, None, 'none') if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): diff --git a/mmdet/models/necks/bfp.py b/mmdet/models/necks/bfp.py index 2788f39bd99..863a55530ed 100644 --- a/mmdet/models/necks/bfp.py +++ b/mmdet/models/necks/bfp.py @@ -1,8 +1,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule, xavier_init +from mmcv.cnn.bricks import NonLocal2d -from mmdet.ops import NonLocal2D from ..builder import NECKS @@ -55,7 +55,7 @@ def __init__(self, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg) elif self.refine_type == 'non_local': - self.refine = NonLocal2D( + self.refine = NonLocal2d( self.in_channels, reduction=1, use_scale=False, diff --git a/mmdet/models/necks/fpn_carafe.py b/mmdet/models/necks/fpn_carafe.py index 1b17a40e10b..b97a6aa7343 100644 --- 
a/mmdet/models/necks/fpn_carafe.py +++ b/mmdet/models/necks/fpn_carafe.py @@ -1,7 +1,7 @@ import torch.nn as nn from mmcv.cnn import ConvModule, build_upsample_layer, xavier_init +from mmcv.ops.carafe import CARAFEPack -from mmdet.ops.carafe import CARAFEPack from ..builder import NECKS diff --git a/mmdet/models/necks/nas_fpn.py b/mmdet/models/necks/nas_fpn.py index ab34287e703..8e333ce65d4 100644 --- a/mmdet/models/necks/nas_fpn.py +++ b/mmdet/models/necks/nas_fpn.py @@ -1,7 +1,7 @@ import torch.nn as nn from mmcv.cnn import ConvModule, caffe2_xavier_init +from mmcv.ops.merge_cells import GlobalPoolingCell, SumCell -from mmdet.ops.merge_cells import GlobalPoolingCell, SumCell from ..builder import NECKS diff --git a/mmdet/models/necks/nasfcos_fpn.py b/mmdet/models/necks/nasfcos_fpn.py index 1110a98e097..2daf79ef591 100644 --- a/mmdet/models/necks/nasfcos_fpn.py +++ b/mmdet/models/necks/nasfcos_fpn.py @@ -1,8 +1,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule, caffe2_xavier_init +from mmcv.ops.merge_cells import ConcatCell -from mmdet.ops.merge_cells import ConcatCell from ..builder import NECKS diff --git a/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py b/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py index 7ac5a870951..98b741a089f 100644 --- a/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py +++ b/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py @@ -3,12 +3,12 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule, build_upsample_layer +from mmcv.ops import Conv2d +from mmcv.ops.carafe import CARAFEPack from torch.nn.modules.utils import _pair from mmdet.core import auto_fp16, force_fp32, mask_target from mmdet.models.builder import HEADS, build_loss -from mmdet.ops import Conv2d -from mmdet.ops.carafe import CARAFEPack BYTES_PER_FLOAT = 4 # TODO: This memory limit may be too much or too little. 
It would be better to diff --git a/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/mmdet/models/roi_heads/mask_heads/mask_point_head.py index 78e1d4a2a73..f38a5c9d759 100644 --- a/mmdet/models/roi_heads/mask_heads/mask_point_head.py +++ b/mmdet/models/roi_heads/mask_heads/mask_point_head.py @@ -3,9 +3,9 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule, normal_init +from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point from mmdet.models.builder import HEADS, build_loss -from mmdet.ops import point_sample, rel_roi_point_to_rel_img_point @HEADS.register_module() diff --git a/mmdet/models/roi_heads/mask_heads/maskiou_head.py b/mmdet/models/roi_heads/mask_heads/maskiou_head.py index f5722cd7ba3..1c09eae8f41 100644 --- a/mmdet/models/roi_heads/mask_heads/maskiou_head.py +++ b/mmdet/models/roi_heads/mask_heads/maskiou_head.py @@ -2,11 +2,11 @@ import torch import torch.nn as nn from mmcv.cnn import kaiming_init, normal_init +from mmcv.ops import Conv2d, Linear, MaxPool2d from torch.nn.modules.utils import _pair from mmdet.core import force_fp32 from mmdet.models.builder import HEADS, build_loss -from mmdet.ops import Conv2d, Linear, MaxPool2d @HEADS.register_module() diff --git a/mmdet/models/roi_heads/point_rend_roi_head.py b/mmdet/models/roi_heads/point_rend_roi_head.py index f23a2a81e06..309258f1f50 100644 --- a/mmdet/models/roi_heads/point_rend_roi_head.py +++ b/mmdet/models/roi_heads/point_rend_roi_head.py @@ -2,9 +2,9 @@ import torch import torch.nn.functional as F +from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point from mmdet.core import bbox2roi, bbox_mapping, merge_aug_masks -from mmdet.ops import point_sample, rel_roi_point_to_rel_img_point from .. import builder from ..builder import HEADS from .standard_roi_head import StandardRoIHead diff --git a/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py index ae1f705be59..0e42b52f361 100644 --- a/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py +++ b/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py @@ -2,8 +2,7 @@ import torch import torch.nn as nn - -from mmdet import ops +from mmcv import ops class BaseRoIExtractor(nn.Module, metaclass=ABCMeta): @@ -35,7 +34,7 @@ def build_roi_layers(self, layer_cfg, featmap_strides): Args: layer_cfg (dict): Dictionary to construct and config RoI layer - operation. Options are modules under ``mmdet/ops`` such as + operation. Options are modules under ``mmcv/ops`` such as ``RoIAlign``. 
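As the docstring above says, RoI layers are now built from modules in mmcv.ops, using the new keyword names output_size and sampling_ratio from the config hunks earlier in this diff. A minimal sketch of what such a layer looks like for one feature level; passing spatial_scale as the reciprocal of the feature stride is an assumption here, not shown in this hunk:

    import torch
    from mmcv import ops

    layer_cfg = dict(type='RoIAlign', output_size=7, sampling_ratio=0)
    cfg = layer_cfg.copy()
    layer_cls = getattr(ops, cfg.pop('type'))
    roi_layer = layer_cls(spatial_scale=1 / 16, **cfg)

    feat = torch.randn(1, 256, 50, 50)
    rois = torch.tensor([[0., 32., 32., 160., 160.]])   # (batch_idx, x1, y1, x2, y2)
    print(roi_layer(feat, rois).shape)   # torch.Size([1, 256, 7, 7])
    print(roi_layer.output_size)         # (7, 7), the attribute the extractors read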
featmap_strides (int): The stride of input feature map w.r.t to the original image size, which would be used to scale RoI diff --git a/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py b/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py index 0e7aff0bad3..5b1db799d82 100644 --- a/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py +++ b/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py @@ -1,6 +1,7 @@ +from mmcv.cnn.bricks import build_plugin_layer + from mmdet.core import force_fp32 from mmdet.models.builder import ROI_EXTRACTORS -from mmdet.ops.plugin import build_plugin_layer from .base_roi_extractor import BaseRoIExtractor @@ -44,7 +45,7 @@ def forward(self, feats, rois, roi_scale_factor=None): if len(feats) == 1: return self.roi_layers[0](feats[0], rois) - out_size = self.roi_layers[0].out_size + out_size = self.roi_layers[0].output_size num_levels = len(feats) roi_feats = feats[0].new_zeros( rois.size(0), self.out_channels, *out_size) diff --git a/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py index 94097eaa40a..a52857c7e34 100644 --- a/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py +++ b/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py @@ -53,7 +53,7 @@ def map_roi_levels(self, rois, num_levels): @force_fp32(apply_to=('feats', ), out_fp16=True) def forward(self, feats, rois, roi_scale_factor=None): """Forward function.""" - out_size = self.roi_layers[0].out_size + out_size = self.roi_layers[0].output_size num_levels = len(feats) roi_feats = feats[0].new_zeros( rois.size(0), self.out_channels, *out_size) diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py deleted file mode 100644 index e05334e76fc..00000000000 --- a/mmdet/ops/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -from .context_block import ContextBlock -from .conv_ws import ConvWS2d, conv_ws_2d -from .corner_pool import CornerPool -from .dcn import (DeformConv, DeformConvPack, DeformRoIPooling, - DeformRoIPoolingPack, ModulatedDeformConv, - ModulatedDeformConvPack, ModulatedDeformRoIPoolingPack, - deform_conv, deform_roi_pooling, modulated_deform_conv) -from .generalized_attention import GeneralizedAttention -from .masked_conv import MaskedConv2d -from .nms import batched_nms, nms, nms_match, soft_nms -from .non_local import NonLocal2D -from .plugin import build_plugin_layer -from .point_sample import (SimpleRoIAlign, point_sample, - rel_roi_point_to_rel_img_point) -from .roi_align import RoIAlign, roi_align -from .roi_pool import RoIPool, roi_pool -from .saconv import SAConv2d -from .sigmoid_focal_loss import SigmoidFocalLoss, sigmoid_focal_loss -from .utils import get_compiler_version, get_compiling_cuda_version -from .wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d - -__all__ = [ - 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', - 'DeformConv', 'DeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', - 'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv', - 'ModulatedDeformConvPack', 'deform_conv', 'modulated_deform_conv', - 'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss', - 'MaskedConv2d', 'ContextBlock', 'GeneralizedAttention', 'NonLocal2D', - 'get_compiler_version', 'get_compiling_cuda_version', 'ConvWS2d', - 'conv_ws_2d', 'build_plugin_layer', 'batched_nms', 'Conv2d', - 'ConvTranspose2d', 'MaxPool2d', 'Linear', 'nms_match', 'CornerPool', - 'point_sample', 
'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', - 'SAConv2d' -] diff --git a/mmdet/ops/carafe/__init__.py b/mmdet/ops/carafe/__init__.py deleted file mode 100644 index 029038f897f..00000000000 --- a/mmdet/ops/carafe/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive - -__all__ = ['carafe', 'carafe_naive', 'CARAFE', 'CARAFENaive', 'CARAFEPack'] diff --git a/mmdet/ops/carafe/carafe.py b/mmdet/ops/carafe/carafe.py deleted file mode 100644 index cd72f55e306..00000000000 --- a/mmdet/ops/carafe/carafe.py +++ /dev/null @@ -1,237 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import UPSAMPLE_LAYERS, normal_init, xavier_init -from torch.autograd import Function -from torch.nn.modules.module import Module - -from . import carafe_ext, carafe_naive_ext - - -class CARAFENaiveFunction(Function): - - @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): - assert scale_factor >= 1 - assert masks.size(1) == kernel_size * kernel_size * group_size - assert masks.size(-1) == features.size(-1) * scale_factor - assert masks.size(-2) == features.size(-2) * scale_factor - assert features.size(1) % group_size == 0 - assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 - ctx.kernel_size = kernel_size - ctx.group_size = group_size - ctx.scale_factor = scale_factor - ctx.feature_size = features.size() - ctx.mask_size = masks.size() - - n, c, h, w = features.size() - output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) - if features.is_cuda: - carafe_naive_ext.forward(features, masks, kernel_size, group_size, - scale_factor, output) - else: - raise NotImplementedError - - if features.requires_grad or masks.requires_grad: - ctx.save_for_backward(features, masks) - return output - - @staticmethod - def backward(ctx, grad_output): - assert grad_output.is_cuda - - features, masks = ctx.saved_tensors - kernel_size = ctx.kernel_size - group_size = ctx.group_size - scale_factor = ctx.scale_factor - - grad_input = torch.zeros_like(features) - grad_masks = torch.zeros_like(masks) - carafe_naive_ext.backward(grad_output.contiguous(), features, masks, - kernel_size, group_size, scale_factor, - grad_input, grad_masks) - - return grad_input, grad_masks, None, None, None - - -carafe_naive = CARAFENaiveFunction.apply - - -class CARAFENaive(Module): - - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFENaive, self).__init__() - - assert isinstance(kernel_size, int) and isinstance( - group_size, int) and isinstance(scale_factor, int) - self.kernel_size = kernel_size - self.group_size = group_size - self.scale_factor = scale_factor - - def forward(self, features, masks): - return CARAFENaiveFunction.apply(features, masks, self.kernel_size, - self.group_size, self.scale_factor) - - -class CARAFEFunction(Function): - - @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): - assert scale_factor >= 1 - assert masks.size(1) == kernel_size * kernel_size * group_size - assert masks.size(-1) == features.size(-1) * scale_factor - assert masks.size(-2) == features.size(-2) * scale_factor - assert features.size(1) % group_size == 0 - assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 - ctx.kernel_size = kernel_size - ctx.group_size = group_size - ctx.scale_factor = scale_factor - ctx.feature_size = features.size() - ctx.mask_size = masks.size() - - n, c, h, w = features.size() - output = features.new_zeros((n, c, h 
* scale_factor, w * scale_factor)) - routput = features.new_zeros(output.size(), requires_grad=False) - rfeatures = features.new_zeros(features.size(), requires_grad=False) - rmasks = masks.new_zeros(masks.size(), requires_grad=False) - if features.is_cuda: - carafe_ext.forward(features, rfeatures, masks, rmasks, kernel_size, - group_size, scale_factor, routput, output) - else: - raise NotImplementedError - - if features.requires_grad or masks.requires_grad: - ctx.save_for_backward(features, masks, rfeatures) - return output - - @staticmethod - def backward(ctx, grad_output): - assert grad_output.is_cuda - - features, masks, rfeatures = ctx.saved_tensors - kernel_size = ctx.kernel_size - group_size = ctx.group_size - scale_factor = ctx.scale_factor - - rgrad_output = torch.zeros_like(grad_output, requires_grad=False) - rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) - rgrad_input = torch.zeros_like(features, requires_grad=False) - rgrad_masks = torch.zeros_like(masks, requires_grad=False) - grad_input = torch.zeros_like(features, requires_grad=False) - grad_masks = torch.zeros_like(masks, requires_grad=False) - carafe_ext.backward(grad_output.contiguous(), rfeatures, masks, - kernel_size, group_size, scale_factor, - rgrad_output, rgrad_input_hs, rgrad_input, - rgrad_masks, grad_input, grad_masks) - return grad_input, grad_masks, None, None, None, None - - -carafe = CARAFEFunction.apply - - -class CARAFE(Module): - """ CARAFE: Content-Aware ReAssembly of FEatures - - Please refer to https://arxiv.org/abs/1905.02188 for more details. - - Args: - kernel_size (int): reassemble kernel size - group_size (int): reassemble group size - scale_factor (int): upsample ratio - - Returns: - upsampled feature map - """ - - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFE, self).__init__() - - assert isinstance(kernel_size, int) and isinstance( - group_size, int) and isinstance(scale_factor, int) - self.kernel_size = kernel_size - self.group_size = group_size - self.scale_factor = scale_factor - - def forward(self, features, masks): - return CARAFEFunction.apply(features, masks, self.kernel_size, - self.group_size, self.scale_factor) - - -@UPSAMPLE_LAYERS.register_module(name='carafe') -class CARAFEPack(nn.Module): - """A unified package of CARAFE upsampler that contains: 1) channel - compressor 2) content encoder 3) CARAFE op. - - Official implementation of ICCV 2019 paper - CARAFE: Content-Aware ReAssembly of FEatures - Please refer to https://arxiv.org/abs/1905.02188 for more details. 
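The CARAFE modules being deleted here are the ones re-imported from mmcv.ops.carafe earlier in this diff. A minimal sketch of the packed upsampler, assuming the mmcv version keeps the constructor arguments documented in this docstring and that a CUDA build of mmcv provides the carafe kernels:

    import torch
    from mmcv.ops.carafe import CARAFEPack

    up = CARAFEPack(channels=256, scale_factor=2, up_kernel=5, up_group=1).cuda()
    x = torch.randn(1, 256, 32, 32, device='cuda')
    print(up(x).shape)   # torch.Size([1, 256, 64, 64]), i.e. 2x upsampled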
- - Args: - channels (int): input feature channels - scale_factor (int): upsample ratio - up_kernel (int): kernel size of CARAFE op - up_group (int): group size of CARAFE op - encoder_kernel (int): kernel size of content encoder - encoder_dilation (int): dilation of content encoder - compressed_channels (int): output channels of channels compressor - - Returns: - upsampled feature map - """ - - def __init__(self, - channels, - scale_factor, - up_kernel=5, - up_group=1, - encoder_kernel=3, - encoder_dilation=1, - compressed_channels=64): - super(CARAFEPack, self).__init__() - self.channels = channels - self.scale_factor = scale_factor - self.up_kernel = up_kernel - self.up_group = up_group - self.encoder_kernel = encoder_kernel - self.encoder_dilation = encoder_dilation - self.compressed_channels = compressed_channels - self.channel_compressor = nn.Conv2d(channels, self.compressed_channels, - 1) - self.content_encoder = nn.Conv2d( - self.compressed_channels, - self.up_kernel * self.up_kernel * self.up_group * - self.scale_factor * self.scale_factor, - self.encoder_kernel, - padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), - dilation=self.encoder_dilation, - groups=1) - self.init_weights() - - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - xavier_init(m, distribution='uniform') - normal_init(self.content_encoder, std=0.001) - - def kernel_normalizer(self, mask): - mask = F.pixel_shuffle(mask, self.scale_factor) - n, mask_c, h, w = mask.size() - mask_channel = int(mask_c / (self.up_kernel * self.up_kernel)) - mask = mask.view(n, mask_channel, -1, h, w) - - mask = F.softmax(mask, dim=2) - mask = mask.view(n, mask_c, h, w).contiguous() - - return mask - - def feature_reassemble(self, x, mask): - x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) - return x - - def forward(self, x): - compressed_x = self.channel_compressor(x) - mask = self.content_encoder(compressed_x) - mask = self.kernel_normalizer(mask) - - x = self.feature_reassemble(x, mask) - return x diff --git a/mmdet/ops/carafe/grad_check.py b/mmdet/ops/carafe/grad_check.py deleted file mode 100644 index 9ddb2398342..00000000000 --- a/mmdet/ops/carafe/grad_check.py +++ /dev/null @@ -1,62 +0,0 @@ -import os.path as osp -import sys - -import mmcv -import torch -from torch.autograd import gradcheck - -sys.path.append(osp.abspath(osp.join(__file__, '../../'))) -from mmdet.ops.carafe import CARAFE, CARAFENaive # noqa: E402, isort:skip -from mmdet.ops.carafe import carafe, carafe_naive # noqa: E402, isort:skip - -feat = torch.randn(2, 64, 3, 3, requires_grad=True, device='cuda:0').double() -mask = torch.randn( - 2, 100, 6, 6, requires_grad=True, device='cuda:0').sigmoid().double() - -print('Gradcheck for carafe...') -test = gradcheck(CARAFE(5, 4, 2), (feat, mask), atol=1e-4, eps=1e-4) -print(test) - -print('Gradcheck for carafe naive...') -test = gradcheck(CARAFENaive(5, 4, 2), (feat, mask), atol=1e-4, eps=1e-4) -print(test) - -feat = torch.randn( - 2, 1024, 100, 100, requires_grad=True, device='cuda:0').float() -mask = torch.randn( - 2, 25, 200, 200, requires_grad=True, device='cuda:0').sigmoid().float() -loop_num = 500 - -time_forward = 0 -time_backward = 0 -bar = mmcv.ProgressBar(loop_num) -timer = mmcv.Timer() -for i in range(loop_num): - x = carafe(feat.clone(), mask.clone(), 5, 1, 2) - torch.cuda.synchronize() - time_forward += timer.since_last_check() - x.sum().backward(retain_graph=True) - torch.cuda.synchronize() - time_backward += timer.since_last_check() - 
bar.update() -forward_speed = (time_forward + 1e-3) * 1e3 / loop_num -backward_speed = (time_backward + 1e-3) * 1e3 / loop_num -print(f'\nCARAFE time forward: {forward_speed} ' - f'ms/iter | time backward: {backward_speed} ms/iter') - -time_naive_forward = 0 -time_naive_backward = 0 -bar = mmcv.ProgressBar(loop_num) -timer = mmcv.Timer() -for i in range(loop_num): - x = carafe_naive(feat.clone(), mask.clone(), 5, 1, 2) - torch.cuda.synchronize() - time_naive_forward += timer.since_last_check() - x.sum().backward(retain_graph=True) - torch.cuda.synchronize() - time_naive_backward += timer.since_last_check() - bar.update() -forward_speed = (time_naive_forward + 1e-3) * 1e3 / loop_num -backward_speed = (time_naive_backward + 1e-3) * 1e3 / loop_num -print('\nCARAFE naive time forward: ' - f'{forward_speed} ms/iter | time backward: {backward_speed} ms/iter') diff --git a/mmdet/ops/carafe/setup.py b/mmdet/ops/carafe/setup.py deleted file mode 100644 index 9b2a46d4e00..00000000000 --- a/mmdet/ops/carafe/setup.py +++ /dev/null @@ -1,36 +0,0 @@ -from setuptools import setup - -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -NVCC_ARGS = [ - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__', -] - -setup( - name='carafe', - ext_modules=[ - CUDAExtension( - 'carafe_ext', [ - 'src/cuda/carafe_cuda.cpp', 'src/cuda/carafe_cuda_kernel.cu', - 'src/carafe_ext.cpp' - ], - define_macros=[('WITH_CUDA', None)], - extra_compile_args={ - 'cxx': [], - 'nvcc': NVCC_ARGS - }), - CUDAExtension( - 'carafe_naive_ext', [ - 'src/cuda/carafe_naive_cuda.cpp', - 'src/cuda/carafe_naive_cuda_kernel.cu', - 'src/carafe_naive_ext.cpp' - ], - define_macros=[('WITH_CUDA', None)], - extra_compile_args={ - 'cxx': [], - 'nvcc': NVCC_ARGS - }) - ], - cmdclass={'build_ext': BuildExtension}) diff --git a/mmdet/ops/carafe/src/carafe_ext.cpp b/mmdet/ops/carafe/src/carafe_ext.cpp deleted file mode 100644 index 7998ac2cd9a..00000000000 --- a/mmdet/ops/carafe/src/carafe_ext.cpp +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include - -#include -#include - -#ifdef WITH_CUDA -int carafe_forward_cuda(at::Tensor features, at::Tensor rfeatures, - at::Tensor masks, at::Tensor rmasks, int kernel_size, - int group_size, int scale_factor, at::Tensor routput, - at::Tensor output); - -int carafe_backward_cuda(at::Tensor top_grad, at::Tensor rfeatures, - at::Tensor masks, int kernel_size, int group_size, - int scale_factor, at::Tensor rtop_grad, - at::Tensor rbottom_grad_hs, at::Tensor rbottom_grad, - at::Tensor rmask_grad, at::Tensor bottom_grad, - at::Tensor mask_grad); -#endif - -int carafe_forward(at::Tensor features, at::Tensor rfeatures, - at::Tensor masks, at::Tensor rmasks, int kernel_size, - int group_size, int scale_factor, at::Tensor routput, - at::Tensor output) { - if (features.device().is_cuda()) { -#ifdef WITH_CUDA - return carafe_forward_cuda(features, rfeatures, masks, rmasks, kernel_size, - group_size, scale_factor, routput, output); -#else - AT_ERROR("carafe is not compiled with GPU support"); -#endif - } - AT_ERROR("carafe is not implemented on CPU"); -} - -int carafe_backward(at::Tensor top_grad, at::Tensor rfeatures, - at::Tensor masks, int kernel_size, int group_size, - int scale_factor, at::Tensor rtop_grad, - at::Tensor rbottom_grad_hs, at::Tensor rbottom_grad, - at::Tensor rmask_grad, at::Tensor bottom_grad, - at::Tensor mask_grad) { - if (top_grad.device().is_cuda()) { -#ifdef WITH_CUDA - return carafe_backward_cuda(top_grad, rfeatures, masks, kernel_size, - 
group_size, scale_factor, rtop_grad, rbottom_grad_hs, rbottom_grad, - rmask_grad, bottom_grad, mask_grad); -#else - AT_ERROR("carafe is not compiled with GPU support"); -#endif - } - AT_ERROR("carafe is not implemented on CPU"); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &carafe_forward, "carafe forward"); - m.def("backward", &carafe_backward, "carafe backward"); -} diff --git a/mmdet/ops/carafe/src/carafe_naive_ext.cpp b/mmdet/ops/carafe/src/carafe_naive_ext.cpp deleted file mode 100644 index 357b8625df8..00000000000 --- a/mmdet/ops/carafe/src/carafe_naive_ext.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include -#include - -#include -#include - -#ifdef WITH_CUDA -int carafe_naive_forward_cuda(at::Tensor features, at::Tensor masks, - int kernel_size, int group_size, int scale_factor, - at::Tensor output); - -int carafe_naive_backward_cuda(at::Tensor top_grad, at::Tensor features, - at::Tensor masks, int kernel_size, - int group_size, int scale_factor, - at::Tensor bottom_grad, at::Tensor mask_grad); -#endif - -int carafe_naive_forward(at::Tensor features, at::Tensor masks, - int kernel_size, int group_size, int scale_factor, - at::Tensor output) { - if (features.device().is_cuda()) { -#ifdef WITH_CUDA - return carafe_naive_forward_cuda(features, masks, kernel_size, - group_size, scale_factor, output); -#else - AT_ERROR("carafe naive is not compiled with GPU support"); -#endif - } - AT_ERROR("carafe naive is not implemented on CPU"); -} - -int carafe_naive_backward(at::Tensor top_grad, at::Tensor features, - at::Tensor masks, int kernel_size, - int group_size, int scale_factor, - at::Tensor bottom_grad, at::Tensor mask_grad) { - if (top_grad.device().is_cuda()) { -#ifdef WITH_CUDA - return carafe_naive_backward_cuda(top_grad, features, masks, kernel_size, - group_size, scale_factor, bottom_grad, mask_grad); -#else - AT_ERROR("carafe naive is not compiled with GPU support"); -#endif - } - AT_ERROR("carafe naive is not implemented on CPU"); - -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &carafe_naive_forward, "carafe_naive forward"); - m.def("backward", &carafe_naive_backward, "carafe_naive backward"); -} diff --git a/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp b/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp deleted file mode 100644 index 59b536c027c..00000000000 --- a/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include -#include - -#include -#include - -int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks, - const int kernel_size, const int group_size, - const int scale_factor, const int batch_size, - const int channels, const int input_height, - const int input_width, const int output_height, - const int output_width, const int mask_channels, - at::Tensor rfeatures, at::Tensor routput, - at::Tensor rmasks, at::Tensor output); - -int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, - const at::Tensor masks, const int kernel_size, - const int group_size, const int scale_factor, - const int batch_size, const int channels, - const int input_height, const int input_width, - const int output_height, const int output_width, - const int mask_channels, at::Tensor rtop_grad, - at::Tensor rbottom_grad_hs, at::Tensor rbottom_grad, - at::Tensor rmask_grad, at::Tensor bottom_grad, - at::Tensor mask_grad); - -#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") 
-#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -int carafe_forward_cuda(at::Tensor features, at::Tensor rfeatures, - at::Tensor masks, at::Tensor rmasks, int kernel_size, - int group_size, int scale_factor, at::Tensor routput, - at::Tensor output) { - CHECK_INPUT(features); - CHECK_INPUT(rfeatures); - CHECK_INPUT(masks); - CHECK_INPUT(rmasks); - CHECK_INPUT(output); - CHECK_INPUT(routput); - at::DeviceGuard guard(features.device()); - - const int batch_size = output.size(0); - const int num_channels = output.size(1); - const int output_height = output.size(2); - const int output_width = output.size(3); - - const int input_height = features.size(2); - const int input_width = features.size(3); - - const int mask_channels = masks.size(1); - - rfeatures.resize_({batch_size, input_height, input_width, num_channels}); - routput.resize_({batch_size, output_height, output_width, num_channels}); - rmasks.resize_({batch_size, output_height, output_width, mask_channels}); - - CARAFEForwardLaucher(features, masks, kernel_size, group_size, scale_factor, - batch_size, num_channels, input_height, input_width, - output_height, output_width, mask_channels, rfeatures, - routput, rmasks, output); - - return 1; -} - -int carafe_backward_cuda(at::Tensor top_grad, at::Tensor rfeatures, - at::Tensor masks, int kernel_size, int group_size, - int scale_factor, at::Tensor rtop_grad, - at::Tensor rbottom_grad_hs, at::Tensor rbottom_grad, - at::Tensor rmask_grad, at::Tensor bottom_grad, - at::Tensor mask_grad) { - CHECK_INPUT(top_grad); - CHECK_INPUT(rfeatures); - CHECK_INPUT(masks); - CHECK_INPUT(rtop_grad); - CHECK_INPUT(rbottom_grad_hs); - CHECK_INPUT(rbottom_grad); - CHECK_INPUT(rmask_grad); - CHECK_INPUT(bottom_grad); - CHECK_INPUT(mask_grad); - at::DeviceGuard guard(top_grad.device()); - - const int batch_size = top_grad.size(0); - const int num_channels = top_grad.size(1); - const int output_height = top_grad.size(2); - const int output_width = top_grad.size(3); - - const int input_height = bottom_grad.size(2); - const int input_width = bottom_grad.size(3); - - const int mask_channels = masks.size(1); - - rtop_grad.resize_({batch_size, output_height, output_width, num_channels}); - rbottom_grad.resize_({batch_size, input_height, input_width, num_channels}); - rbottom_grad_hs.resize_( - {batch_size, output_height, output_width, num_channels}); - rmask_grad.resize_({batch_size, output_height, output_width, mask_channels}); - - CARAFEBackwardLaucher(top_grad, rfeatures, masks, kernel_size, group_size, - scale_factor, batch_size, num_channels, input_height, - input_width, output_height, output_width, mask_channels, - rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad, - bottom_grad, mask_grad); - - return 1; -} diff --git a/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu b/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu deleted file mode 100644 index a9566cf6e8c..00000000000 --- a/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu +++ /dev/null @@ -1,489 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -using namespace at; - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - -#define THREADS_PER_BLOCK 1024 // 32 * 32 -#define WARP_SIZE 32 -#define THREADS_PER_PIXEL 32 -#define MAX_SHARED_MEMORY 49152 -#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 -#define MAXIMIZE_KERNEL_SIZE true -#define kTileDim 32 -#define kBlockRows 8 -#define FULL_MASK 0xffffffff - -inline int 
divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } - -__device__ inline int Loc2Index(const int n, const int c, const int h, - const int w, const int channel_num, - const int height, const int width) { - int index = w + (h + (c + n * channel_num) * height) * width; - return index; -} -/* TODO: move this to a common place */ -template -__device__ inline scalar_t min(scalar_t a, scalar_t b) { - return a < b ? a : b; -} - -template -__device__ inline scalar_t max(scalar_t a, scalar_t b) { - return a > b ? a : b; -} - -template -__device__ __forceinline__ scalar_t WARP_SHFL_DOWN(scalar_t val, int offset) -{ - return __shfl_down_sync(FULL_MASK, val, offset); -} - -template<> -__device__ __forceinline__ c10::Half WARP_SHFL_DOWN(c10::Half val, int offset) -{ - return c10::Half(WARP_SHFL_DOWN(val.x, offset), c10::Half::from_bits_t{}); -} - - -template -__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { - for (int offset = 16; offset > 0; offset /= 2) - // val += __shfl_down_sync(FULL_MASK, val, offset); - val += WARP_SHFL_DOWN(val, offset); - return val; -} - -// Splits the original matrix into submatrices with size 32 * 32. -// Each block transposes one submatrix by loading it into shared memory. -// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ -template -__global__ void BatchTranspose2DCUDAKernel(const int N, const int H, - const int W, const int dh, - const int dw, - const scalar_t *__restrict__ X, - scalar_t *__restrict__ Y) { - __shared__ scalar_t tile[kTileDim][kTileDim + 1]; - const int n = blockIdx.x / (dh * dw); - const int k = blockIdx.x % (dh * dw); - const int r = k / dw; - const int c = k % dw; - const int offset = n * H * W; - int x = c * kTileDim + threadIdx.x; - int y = r * kTileDim + threadIdx.y; - if (x < W) { - for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { - tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x]; - } - } - __syncthreads(); - x = r * kTileDim + threadIdx.x; - y = c * kTileDim + threadIdx.y; - if (x < H) { - for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { - Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; - } - } -} -template -__global__ void CARAFEForward( - const int num_kernels, const scalar_t *__restrict__ bottom_data, - const scalar_t *__restrict__ bottom_masks, const int kernel_size, - const int group_size, const int scale_factor, const int channels, - const int down_height, const int down_width, const int height, - const int width, const int mask_channels, scalar_t *__restrict__ top_data) { -#if MAXIMIZE_KERNEL_SIZE - __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; -#else - __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; -#endif - - int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index > num_kernels - 1) { - return; - } - const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; - const int split_id = threadIdx.x % THREADS_PER_PIXEL; - index = index / THREADS_PER_PIXEL; - const int pw = index % width; - const int ph = (index / width) % height; - const int n = index / width / height; - - const int down_pw = pw / scale_factor; - const int down_ph = ph / scale_factor; - - const int start_w = down_pw - (kernel_size - 1) / 2; - const int end_w = down_pw + (kernel_size - 1) / 2 + 1; - const int start_h = down_ph - (kernel_size - 1) / 2; - const int end_h = down_ph + (kernel_size - 1) / 2 + 1; - for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { - int mask_index = Loc2Index(n, ph, pw, 
c, height, width, mask_channels); - shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; - } - __syncthreads(); - - const int channels_per_group = ceilf(channels / (float)group_size); -#pragma unroll - for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { - int mask_group = c / channels_per_group; - scalar_t output_val = 0; -#pragma unroll - for (int iy = start_h; iy < end_h; iy++) { -#pragma unroll - for (int ix = start_w; ix < end_w; ix++) { - if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { - continue; - } - int mask_iy = iy - down_ph + (kernel_size - 1) / 2; - int mask_ix = ix - down_pw + (kernel_size - 1) / 2; - int mask_c = - (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; - int feat_index = - Loc2Index(n, iy, ix, c, down_height, down_width, channels); - - output_val += bottom_data[feat_index] * - shared_mask[mask_c * WARP_SIZE + pixel_id]; - } - } - - int top_index = Loc2Index(n, ph, pw, c, height, width, channels); - top_data[top_index] = output_val; - } -} - -int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks, - const int kernel_size, const int group_size, - const int scale_factor, const int batch_size, - const int channels, const int input_height, - const int input_width, const int output_height, - const int output_width, const int mask_channels, - at::Tensor rfeatures, at::Tensor routput, - at::Tensor rmasks, at::Tensor output) { - // one warp per pixel - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.scalar_type(), "NCHW2NHWC_Feature", ([&] { - const scalar_t *bottom_data = features.data_ptr(); - scalar_t *top_data = rfeatures.data_ptr(); - const int dh = divideUP(channels, kTileDim); - const int dw = divideUP(input_height * input_width, kTileDim); - BatchTranspose2DCUDAKernel - <<>>( - batch_size, channels, input_height * input_width, dh, dw, - bottom_data, top_data); - })); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.scalar_type(), "NCHW2NHWC_Masks", ([&] { - const scalar_t *bottom_data = masks.data_ptr(); - scalar_t *top_data = rmasks.data_ptr(); - const int dh = divideUP(mask_channels, kTileDim); - const int dw = divideUP(output_height * output_width, kTileDim); - BatchTranspose2DCUDAKernel - <<>>( - batch_size, mask_channels, output_height * output_width, dh, dw, - bottom_data, top_data); - })); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.scalar_type(), "CARAFELaucherForward", ([&] { - const int num_kernels = - batch_size * output_height * output_width * THREADS_PER_PIXEL; - const scalar_t *bottom_data = rfeatures.data_ptr(); - const scalar_t *bottom_masks = rmasks.data_ptr(); - scalar_t *top_data = routput.data_ptr(); - - CARAFEForward - <<>>( - num_kernels, bottom_data, bottom_masks, kernel_size, group_size, - scale_factor, channels, input_height, input_width, - output_height, output_width, mask_channels, top_data); - })); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.scalar_type(), "NHWC2NCHW", ([&] { - const scalar_t *bottom_data = routput.data_ptr(); - scalar_t *top_data = output.data_ptr(); - const int dh = divideUP(output_height * output_width, kTileDim); - const int dw = divideUP(channels, kTileDim); - BatchTranspose2DCUDAKernel - <<>>( - batch_size, output_height * output_width, channels, dh, dw, - bottom_data, top_data); - })); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } - - return 1; -} - 
-template -__global__ void CARAFEBackward_Feature( - const int num_kernels, const scalar_t *__restrict__ top_diff, - const scalar_t *__restrict__ bottom_masks, const int kernel_size, - const int group_size, const int scale_factor, const int channels, - const int down_height, const int down_width, const int height, - const int width, const int mask_channels, - scalar_t *__restrict__ bottom_diff) { -#if MAXIMIZE_KERNEL_SIZE - __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; -#else - __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; -#endif - - int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index > num_kernels - 1) { - return; - } - - const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; - const int split_id = threadIdx.x % THREADS_PER_PIXEL; - // (n, c, ph, pw) is an element in the bottom_data - index = index / THREADS_PER_PIXEL; - const int pw = index % width; - const int ph = (index / width) % height; - const int n = index / width / height; - - const int start_w = pw - (kernel_size - 1) * scale_factor / 2; - const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1; - const int start_h = ph - (kernel_size - 1) * scale_factor / 2; - const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1; - for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { - const int mask_w = (c % kernel_size) * scale_factor; - const int mask_h = (c / kernel_size % kernel_size) * scale_factor; - const int mask_x = start_w + mask_w; - const int mask_y = start_h + mask_h; - if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) { - shared_mask[c * WARP_SIZE + pixel_id] = 0; - continue; - } - const int mask_group = c / (kernel_size * kernel_size); - const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1; - int mask_index = - Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width); - shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; - } - __syncthreads(); - const int channels_per_group = ceilf(channels / (float)group_size); -#pragma unroll - for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { - int mask_group = c / channels_per_group; - int top_index = Loc2Index(n, ph, pw, c, height, width, channels); - scalar_t output_val = 0; -#pragma unroll - for (int iy = start_h; iy < end_h; iy += scale_factor) { -#pragma unroll - for (int ix = start_w; ix < end_w; ix += scale_factor) { - if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) { - continue; - } - int mask_iy = - (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor; - int mask_ix = - (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor; - int mask_c = - (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; - int feat_index = Loc2Index(n, iy, ix, c, height, width, channels); - output_val += - shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index]; - } - } - bottom_diff[top_index] = output_val; - } -} - -template -__global__ void FeatureSum(const int num_kernels, - const scalar_t *__restrict__ input_data, - const int scale_factor, const int channels, - const int height, const int width, - scalar_t *__restrict__ output_data) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index > num_kernels - 1) { - return; - } - const int split_id = threadIdx.x % THREADS_PER_PIXEL; - index = index / THREADS_PER_PIXEL; - const int pw = index % width; - const int ph = (index / width) % height; - const int n = index / width / height; - for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { - scalar_t 
output_val = 0; - for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) { - for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) { - int input_id = Loc2Index(n, iy, ix, c, height * scale_factor, - width * scale_factor, channels); - output_val += input_data[input_id]; - } - } - const int output_id = Loc2Index(n, ph, pw, c, height, width, channels); - output_data[output_id] = output_val; - } -} - -template -__global__ void CARAFEBackward_Mask(const int num_kernels, - const scalar_t *__restrict__ top_diff, - const scalar_t *__restrict__ bottom_data, - const int kernel_size, const int group_size, - const int scale_factor, const int channels, - const int down_height, const int down_width, - const int height, const int width, - const int mask_channels, - scalar_t *__restrict__ mask_diff) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - if (index > num_kernels - 1) { - return; - } - - const int lane_id = index % WARP_SIZE; - index = index / WARP_SIZE; - const int mask_c = index % mask_channels; - // (n, c, ph, pw) is an element in the bottom_data - index = index / mask_channels; - const int pw = index % width; - const int ph = (index / width) % height; - const int n = index / width / height; - - const int down_pw = pw / scale_factor; - const int down_ph = ph / scale_factor; - - const int mask_group = mask_c / (kernel_size * kernel_size); - const int mask_loc = mask_c % (kernel_size * kernel_size); - - const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2; - const int offset_y = - mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2; - - const int down_x = down_pw + offset_x; - const int down_y = down_ph + offset_y; - - scalar_t output_val = 0; - - if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 && - down_x <= down_width - 1) { - const int channels_per_mask = ceilf(channels / (float)group_size); - const int start = channels_per_mask * mask_group; - const int end = min(channels_per_mask * (mask_group + 1), channels); - for (int c = start + lane_id; c < end; c += WARP_SIZE) { - int bottom_id = - Loc2Index(n, down_y, down_x, c, down_height, down_width, channels); - int top_id = Loc2Index(n, ph, pw, c, height, width, channels); - output_val += top_diff[top_id] * bottom_data[bottom_id]; - } - } - __syncwarp(); - output_val = warpReduceSum(output_val); - if (lane_id == 0) { - const int mask_id = - Loc2Index(n, ph, pw, mask_c, height, width, mask_channels); - mask_diff[mask_id] = output_val; - } -} - -int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, - const at::Tensor masks, const int kernel_size, - const int group_size, const int scale_factor, - const int batch_size, const int channels, - const int input_height, const int input_width, - const int output_height, const int output_width, - const int mask_channels, at::Tensor rtop_grad, - at::Tensor rbottom_grad_hs, at::Tensor rbottom_grad, - at::Tensor rmask_grad, at::Tensor bottom_grad, - at::Tensor mask_grad) { - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] { - const scalar_t *bottom_data = top_grad.data_ptr(); - scalar_t *top_data = rtop_grad.data_ptr(); - const int dh = divideUP(channels, kTileDim); - const int dw = divideUP(output_height * output_width, kTileDim); - BatchTranspose2DCUDAKernel - <<>>( - batch_size, channels, output_height * output_width, dh, dw, - bottom_data, top_data); - })); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - 
top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] { - const int num_kernels = - batch_size * output_height * output_width * THREADS_PER_PIXEL; - const scalar_t *top_diff = rtop_grad.data_ptr(); - const scalar_t *bottom_masks = masks.data_ptr(); - scalar_t *bottom_diff = rbottom_grad_hs.data_ptr(); - - CARAFEBackward_Feature - <<>>( - num_kernels, top_diff, bottom_masks, kernel_size, group_size, - scale_factor, channels, input_height, input_width, - output_height, output_width, mask_channels, bottom_diff); - })); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "FeatureSum", ([&] { - const int num_kernels = - batch_size * input_height * input_width * THREADS_PER_PIXEL; - const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr(); - scalar_t *bottom_diff = rbottom_grad.data_ptr(); - - FeatureSum - <<>>( - num_kernels, bottom_diff_hs, scale_factor, channels, - input_height, input_width, bottom_diff); - })); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] { - const scalar_t *bottom_data = rbottom_grad.data_ptr(); - scalar_t *top_data = bottom_grad.data_ptr(); - const int dh = divideUP(input_height * input_width, kTileDim); - const int dw = divideUP(channels, kTileDim); - BatchTranspose2DCUDAKernel - <<>>( - batch_size, input_height * input_width, channels, dh, dw, - bottom_data, top_data); - })); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] { - const int num_kernels = batch_size * output_height * output_width * - mask_channels * WARP_SIZE; - const scalar_t *top_diff = rtop_grad.data_ptr(); - const scalar_t *bottom_data = rfeatures.data_ptr(); - scalar_t *mask_diff = rmask_grad.data_ptr(); - - CARAFEBackward_Mask - <<>>( - num_kernels, top_diff, bottom_data, kernel_size, group_size, - scale_factor, channels, input_height, input_width, - output_height, output_width, mask_channels, mask_diff); - })); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] { - const scalar_t *bottom_data = rmask_grad.data_ptr(); - scalar_t *top_data = mask_grad.data_ptr(); - const int dh = divideUP(output_height * output_width, kTileDim); - const int dw = divideUP(mask_channels, kTileDim); - BatchTranspose2DCUDAKernel - <<>>( - batch_size, output_height * output_width, mask_channels, dh, dw, - bottom_data, top_data); - })); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } - - return 1; -} diff --git a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp deleted file mode 100644 index 394afd3ad06..00000000000 --- a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include -#include - -#include -#include - -int CARAFENAIVEForwardLaucher(const at::Tensor features, const at::Tensor masks, - const int kernel_size, const int group_size, - const int scale_factor, const int batch_size, - const int channels, const int height, - const int width, at::Tensor output); - -int CARAFENAIVEBackwardLaucher(const at::Tensor top_grad, - const at::Tensor features, - const at::Tensor masks, const int kernel_size, - const int group_size, const int scale_factor, - const int batch_size, const int channels, - const int height, const int width, - at::Tensor bottom_grad, at::Tensor mask_grad); - -#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") -#define 
CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -int carafe_naive_forward_cuda(at::Tensor features, at::Tensor masks, - int kernel_size, int group_size, int scale_factor, - at::Tensor output) { - CHECK_INPUT(features); - CHECK_INPUT(masks); - CHECK_INPUT(output); - at::DeviceGuard guard(features.device()); - - int batch_size = output.size(0); - int num_channels = output.size(1); - int data_height = output.size(2); - int data_width = output.size(3); - - CARAFENAIVEForwardLaucher(features, masks, kernel_size, group_size, - scale_factor, batch_size, num_channels, data_height, - data_width, output); - - return 1; -} - -int carafe_naive_backward_cuda(at::Tensor top_grad, at::Tensor features, - at::Tensor masks, int kernel_size, - int group_size, int scale_factor, - at::Tensor bottom_grad, at::Tensor mask_grad) { - CHECK_INPUT(top_grad); - CHECK_INPUT(features); - CHECK_INPUT(masks); - CHECK_INPUT(bottom_grad); - CHECK_INPUT(mask_grad); - at::DeviceGuard guard(top_grad.device()); - - int batch_size = top_grad.size(0); - int num_channels = top_grad.size(1); - int data_height = top_grad.size(2); - int data_width = top_grad.size(3); - - CARAFENAIVEBackwardLaucher(top_grad, features, masks, kernel_size, group_size, - scale_factor, batch_size, num_channels, - data_height, data_width, bottom_grad, mask_grad); - - return 1; -} diff --git a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu deleted file mode 100644 index 9cf9855a71c..00000000000 --- a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu +++ /dev/null @@ -1,176 +0,0 @@ -#include -#include - -using namespace at; // temporal fix for pytorch<=0.4.1 (see #9848) - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - -#define THREADS_PER_BLOCK 1024 - -inline int GET_BLOCKS(const int N) { - int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - int max_block_num = 65536; - return min(optimal_block_num, max_block_num); -} - -__device__ inline int Loc2Index(const int n, const int c, const int h, - const int w, const int channel_num, - const int height, const int width) { - int index = w + (h + (c + n * channel_num) * height) * width; - return index; -} -template -__global__ void CARAFENAIVEForward(const int nthreads, - const scalar_t *bottom_data, - const scalar_t *bottom_masks, - const int kernel_size, const int group_size, - const int scale_factor, const int channels, - const int height, const int width, - scalar_t *top_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the bottom_data - int pw = index % width; - int ph = (index / width) % height; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - - int mask_channels = kernel_size * kernel_size * group_size; - int mask_group = c / (channels / group_size); - - int down_pw = pw / scale_factor; - int down_ph = ph / scale_factor; - int down_width = width / scale_factor; - int down_height = height / scale_factor; - int start_w = down_pw - (kernel_size - 1) / 2; - int end_w = down_pw + (kernel_size - 1) / 2 + 1; - int start_h = down_ph - (kernel_size - 1) / 2; - int end_h = down_ph + (kernel_size - 1) / 2 + 1; - - scalar_t output_val = 0; - for (int iy = start_h; iy < end_h; iy++) { - for (int ix = start_w; ix < end_w; ix++) { - if (iy < 0 || iy > down_height - 1 
|| ix < 0 || ix > down_width - 1) { - continue; - } - int mask_iy = iy - down_ph + (kernel_size - 1) / 2; - int mask_ix = ix - down_pw + (kernel_size - 1) / 2; - int mask_c = - (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; - int feat_index = - Loc2Index(n, c, iy, ix, channels, down_height, down_width); - int mask_index = - Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); - output_val += bottom_data[feat_index] * bottom_masks[mask_index]; - } - } - top_data[index] = output_val; - } -} - -int CARAFENAIVEForwardLaucher(const at::Tensor features, const at::Tensor masks, - const int kernel_size, const int group_size, - const int scale_factor, const int batch_size, - const int channels, const int height, - const int width, at::Tensor output) { - const int output_size = batch_size * channels * height * width; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.scalar_type(), "CARAFENAIVELaucherForward", ([&] { - const scalar_t *bottom_data = features.data_ptr(); - const scalar_t *bottom_masks = masks.data_ptr(); - scalar_t *top_data = output.data_ptr(); - - CARAFENAIVEForward - <<>>( - output_size, bottom_data, bottom_masks, kernel_size, group_size, - scale_factor, channels, height, width, top_data); - })); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } - - return 1; -} - -template -__global__ void CARAFENAIVEBackward( - const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data, - const scalar_t *bottom_masks, const int kernel_size, const int group_size, - const int scale_factor, const int channels, const int height, - const int width, scalar_t *bottom_diff, scalar_t *mask_diff) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the bottom_data - int pw = index % width; - int ph = (index / width) % height; - int c = (index / width / height) % channels; - int n = index / width / height / channels; - - int mask_channels = kernel_size * kernel_size * group_size; - int mask_group = c / (channels / group_size); - - int down_pw = pw / scale_factor; - int down_ph = ph / scale_factor; - int down_width = width / scale_factor; - int down_height = height / scale_factor; - int start_w = down_pw - (kernel_size - 1) / 2; - int end_w = down_pw + (kernel_size - 1) / 2 + 1; - int start_h = down_ph - (kernel_size - 1) / 2; - int end_h = down_ph + (kernel_size - 1) / 2 + 1; - - for (int iy = start_h; iy < end_h; iy++) { - for (int ix = start_w; ix < end_w; ix++) { - if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { - continue; - } - int mask_iy = iy - down_ph + (kernel_size - 1) / 2; - int mask_ix = ix - down_pw + (kernel_size - 1) / 2; - int mask_c = - (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; - int feat_index = - Loc2Index(n, c, iy, ix, channels, down_height, down_width); - int mask_index = - Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); - atomicAdd(bottom_diff + feat_index, - bottom_masks[mask_index] * top_diff[index]); - atomicAdd(mask_diff + mask_index, - bottom_data[feat_index] * top_diff[index]); - } - } - } -} - -int CARAFENAIVEBackwardLaucher(const at::Tensor top_grad, - const at::Tensor features, - const at::Tensor masks, const int kernel_size, - const int group_size, const int scale_factor, - const int batch_size, const int channels, - const int height, const int width, - at::Tensor bottom_grad, at::Tensor mask_grad) { - const int output_size = batch_size * channels * 
height * width; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "CARAFENAIVELaucherBackward", ([&] { - const scalar_t *top_diff = top_grad.data_ptr(); - const scalar_t *bottom_data = features.data_ptr(); - const scalar_t *bottom_masks = masks.data_ptr(); - scalar_t *bottom_diff = bottom_grad.data_ptr(); - scalar_t *mask_diff = mask_grad.data_ptr(); - - CARAFENAIVEBackward - <<>>( - output_size, top_diff, bottom_data, bottom_masks, kernel_size, - group_size, scale_factor, channels, height, width, bottom_diff, - mask_diff); - })); - - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err)); - exit(-1); - } - - return 1; -} diff --git a/mmdet/ops/context_block.py b/mmdet/ops/context_block.py deleted file mode 100644 index 52b6f91c9b8..00000000000 --- a/mmdet/ops/context_block.py +++ /dev/null @@ -1,116 +0,0 @@ -import torch -from mmcv.cnn import constant_init, kaiming_init -from torch import nn - - -def last_zero_init(m): - if isinstance(m, nn.Sequential): - constant_init(m[-1], val=0) - else: - constant_init(m, val=0) - - -class ContextBlock(nn.Module): - """ContextBlock module in GCNet. - - See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' - (https://arxiv.org/abs/1904.11492) for details. - - Args: - in_channels (int): Channels of the input feature map. - ratio (float): Ratio of channels of transform bottleneck - pooling_type (str): Pooling method for context modeling - fusion_types (list[str]|tuple[str]): Fusion method for feature fusion, - options: 'channels_add', 'channel_mul' - """ - - def __init__(self, - in_channels, - ratio, - pooling_type='att', - fusion_types=('channel_add', )): - super(ContextBlock, self).__init__() - assert pooling_type in ['avg', 'att'] - assert isinstance(fusion_types, (list, tuple)) - valid_fusion_types = ['channel_add', 'channel_mul'] - assert all([f in valid_fusion_types for f in fusion_types]) - assert len(fusion_types) > 0, 'at least one fusion should be used' - self.in_channels = in_channels - self.ratio = ratio - self.planes = int(in_channels * ratio) - self.pooling_type = pooling_type - self.fusion_types = fusion_types - if pooling_type == 'att': - self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) - self.softmax = nn.Softmax(dim=2) - else: - self.avg_pool = nn.AdaptiveAvgPool2d(1) - if 'channel_add' in fusion_types: - self.channel_add_conv = nn.Sequential( - nn.Conv2d(self.in_channels, self.planes, kernel_size=1), - nn.LayerNorm([self.planes, 1, 1]), - nn.ReLU(inplace=True), # yapf: disable - nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) - else: - self.channel_add_conv = None - if 'channel_mul' in fusion_types: - self.channel_mul_conv = nn.Sequential( - nn.Conv2d(self.in_channels, self.planes, kernel_size=1), - nn.LayerNorm([self.planes, 1, 1]), - nn.ReLU(inplace=True), # yapf: disable - nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) - else: - self.channel_mul_conv = None - self.reset_parameters() - - def reset_parameters(self): - if self.pooling_type == 'att': - kaiming_init(self.conv_mask, mode='fan_in') - self.conv_mask.inited = True - - if self.channel_add_conv is not None: - last_zero_init(self.channel_add_conv) - if self.channel_mul_conv is not None: - last_zero_init(self.channel_mul_conv) - - def spatial_pool(self, x): - batch, channel, height, width = x.size() - if self.pooling_type == 'att': - input_x = x - # [N, C, H * W] - input_x = input_x.view(batch, channel, height * width) - # [N, 
1, C, H * W] - input_x = input_x.unsqueeze(1) - # [N, 1, H, W] - context_mask = self.conv_mask(x) - # [N, 1, H * W] - context_mask = context_mask.view(batch, 1, height * width) - # [N, 1, H * W] - context_mask = self.softmax(context_mask) - # [N, 1, H * W, 1] - context_mask = context_mask.unsqueeze(-1) - # [N, 1, C, 1] - context = torch.matmul(input_x, context_mask) - # [N, C, 1, 1] - context = context.view(batch, channel, 1, 1) - else: - # [N, C, 1, 1] - context = self.avg_pool(x) - - return context - - def forward(self, x): - # [N, C, 1, 1] - context = self.spatial_pool(x) - - out = x - if self.channel_mul_conv is not None: - # [N, C, 1, 1] - channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) - out = out * channel_mul_term - if self.channel_add_conv is not None: - # [N, C, 1, 1] - channel_add_term = self.channel_add_conv(context) - out = out + channel_add_term - - return out diff --git a/mmdet/ops/conv_ws.py b/mmdet/ops/conv_ws.py deleted file mode 100644 index 43d496ef062..00000000000 --- a/mmdet/ops/conv_ws.py +++ /dev/null @@ -1,146 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import CONV_LAYERS - - -def conv_ws_2d(input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - eps=1e-5): - c_in = weight.size(0) - weight_flat = weight.view(c_in, -1) - mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) - std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) - weight = (weight - mean) / (std + eps) - return F.conv2d(input, weight, bias, stride, padding, dilation, groups) - - -@CONV_LAYERS.register_module(name='ConvWS') -class ConvWS2d(nn.Conv2d): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - eps=1e-5): - super(ConvWS2d, self).__init__( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias) - self.eps = eps - - def forward(self, x): - return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, - self.dilation, self.groups, self.eps) - - -@CONV_LAYERS.register_module(name='ConvAWS') -class ConvAWS2d(nn.Conv2d): - """AWS (Adaptive Weight Standardization) - - This is a variant of Weight Standardization - (https://arxiv.org/pdf/1903.10520.pdf) - It is used in DetectoRS to avoid NaN - (https://arxiv.org/pdf/2006.02334.pdf) - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the conv kernel - stride (int or tuple, optional): Stride of the convolution. Default: 1 - padding (int or tuple, optional): Zero-padding added to both sides of - the input. Default: 0 - dilation (int or tuple, optional): Spacing between kernel elements. - Default: 1 - groups (int, optional): Number of blocked connections from input - channels to output channels. Default: 1 - bias (bool, optional): If set True, adds a learnable bias to the - output. 
Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): - super().__init__( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias) - self.register_buffer('weight_gamma', - torch.ones(self.out_channels, 1, 1, 1)) - self.register_buffer('weight_beta', - torch.zeros(self.out_channels, 1, 1, 1)) - - def _get_weight(self, weight): - weight_flat = weight.view(weight.size(0), -1) - mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) - std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) - weight = (weight - mean) / std - weight = self.weight_gamma * weight + self.weight_beta - return weight - - def forward(self, x): - weight = self._get_weight(self.weight) - return F.conv2d(x, weight, self.bias, self.stride, self.padding, - self.dilation, self.groups) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - """Override default load function. - - AWS overrides the function _load_from_state_dict to recover - weight_gamma and weight_beta if they are missing. If weight_gamma and - weight_beta are found in the checkpoint, this function will return - after super()._load_from_state_dict. Otherwise, it will compute the - mean and std of the pretrained weights and store them in weight_beta - and weight_gamma. - """ - - self.weight_gamma.data.fill_(-1) - local_missing_keys = [] - super()._load_from_state_dict(state_dict, prefix, local_metadata, - strict, local_missing_keys, - unexpected_keys, error_msgs) - if self.weight_gamma.data.mean() > 0: - for k in local_missing_keys: - missing_keys.append(k) - return - weight = self.weight.data - weight_flat = weight.view(weight.size(0), -1) - mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) - std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) - self.weight_beta.data.copy_(mean) - self.weight_gamma.data.copy_(std) - missing_gamma_beta = [ - k for k in local_missing_keys - if k.endswith('weight_gamma') or k.endswith('weight_beta') - ] - for k in missing_gamma_beta: - local_missing_keys.remove(k) - for k in local_missing_keys: - missing_keys.append(k) diff --git a/mmdet/ops/corner_pool/__init__.py b/mmdet/ops/corner_pool/__init__.py deleted file mode 100644 index a5457db99f0..00000000000 --- a/mmdet/ops/corner_pool/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .corner_pool import CornerPool - -__all__ = ['CornerPool'] diff --git a/mmdet/ops/corner_pool/corner_pool.py b/mmdet/ops/corner_pool/corner_pool.py deleted file mode 100644 index 00b6b774a4b..00000000000 --- a/mmdet/ops/corner_pool/corner_pool.py +++ /dev/null @@ -1,101 +0,0 @@ -from torch import nn -from torch.autograd import Function - -from . 
import corner_pool_ext - - -class TopPoolFunction(Function): - - @staticmethod - def forward(ctx, input): - output = corner_pool_ext.top_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input = ctx.saved_variables[0] - output = corner_pool_ext.top_pool_backward(input, grad_output) - return output - - -class BottomPoolFunction(Function): - - @staticmethod - def forward(ctx, input): - output = corner_pool_ext.bottom_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input = ctx.saved_variables[0] - output = corner_pool_ext.bottom_pool_backward(input, grad_output) - return output - - -class LeftPoolFunction(Function): - - @staticmethod - def forward(ctx, input): - output = corner_pool_ext.left_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input = ctx.saved_variables[0] - output = corner_pool_ext.left_pool_backward(input, grad_output) - return output - - -class RightPoolFunction(Function): - - @staticmethod - def forward(ctx, input): - output = corner_pool_ext.right_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input = ctx.saved_variables[0] - output = corner_pool_ext.right_pool_backward(input, grad_output) - return output - - -class CornerPool(nn.Module): - """Corner Pooling. - - Corner Pooling is a new type of pooling layer that helps a - convolutional network better localize corners of bounding boxes. - - Please refer to https://arxiv.org/abs/1808.01244 for more details. - Code is modified from https://github.com/princeton-vl/CornerNet-Lite. - - Args: - mode(str): Pooling orientation for the pooling layer - - - 'bottom': Bottom Pooling - - 'left': Left Pooling - - 'right': Right Pooling - - 'top': Top Pooling - - Returns: - Feature map after pooling. 
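    Example:
        A minimal usage sketch (assumes the compiled corner_pool_ext CUDA
        extension is importable and the input sits on a GPU):

        >>> import torch
        >>> pool = CornerPool('bottom')
        >>> x = torch.rand(2, 16, 24, 24).cuda()
        >>> y = pool(x)  # same shape as x; maxima are propagated along the pooled axis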
- """ - - pool_functions = { - 'bottom': BottomPoolFunction, - 'left': LeftPoolFunction, - 'right': RightPoolFunction, - 'top': TopPoolFunction, - } - - def __init__(self, mode): - super(CornerPool, self).__init__() - assert mode in self.pool_functions - self.corner_pool = self.pool_functions[mode] - - def forward(self, x): - return self.corner_pool.apply(x) diff --git a/mmdet/ops/corner_pool/src/corner_pool.cpp b/mmdet/ops/corner_pool/src/corner_pool.cpp deleted file mode 100644 index a1fde8078a8..00000000000 --- a/mmdet/ops/corner_pool/src/corner_pool.cpp +++ /dev/null @@ -1,268 +0,0 @@ -// Modified from -// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src -#include - -#include - -at::Tensor bottom_pool_forward(at::Tensor input) { - // Initialize output - at::Tensor output = at::zeros_like(input); - - // Get height - int64_t height = input.size(2); - - output.copy_(input); - - for (int64_t ind = 1; ind < height; ind <<= 1) { - at::Tensor max_temp = at::slice(output, 2, ind, height); - at::Tensor cur_temp = at::slice(output, 2, ind, height).clone(); - at::Tensor next_temp = at::slice(output, 2, 0, height - ind).clone(); - at::max_out(max_temp, cur_temp, next_temp); - } - - return output; -} - -at::Tensor bottom_pool_backward(at::Tensor input, at::Tensor grad_output) { - auto output = at::zeros_like(input); - - int32_t batch = input.size(0); - int32_t channel = input.size(1); - int32_t height = input.size(2); - int32_t width = input.size(3); - - auto max_val = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kFloat)); - auto max_ind = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kLong)); - - auto input_temp = input.select(2, 0); - max_val.copy_(input_temp); - - max_ind.fill_(0); - - auto output_temp = output.select(2, 0); - auto grad_output_temp = grad_output.select(2, 0); - output_temp.copy_(grad_output_temp); - - auto un_max_ind = max_ind.unsqueeze(2); - auto gt_mask = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kBool)); - auto max_temp = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kFloat)); - for (int32_t ind = 0; ind < height - 1; ++ind) { - input_temp = input.select(2, ind + 1); - at::gt_out(gt_mask, input_temp, max_val); - - at::masked_select_out(max_temp, input_temp, gt_mask); - max_val.masked_scatter_(gt_mask, max_temp); - max_ind.masked_fill_(gt_mask, ind + 1); - - grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2); - output.scatter_add_(2, un_max_ind, grad_output_temp); - } - - return output; -} - -at::Tensor left_pool_forward(at::Tensor input) { - // Initialize output - at::Tensor output = at::zeros_like(input); - - // Get width - int64_t width = input.size(3); - - output.copy_(input); - - for (int64_t ind = 1; ind < width; ind <<= 1) { - at::Tensor max_temp = at::slice(output, 3, 0, width - ind); - at::Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone(); - at::Tensor next_temp = at::slice(output, 3, ind, width).clone(); - at::max_out(max_temp, cur_temp, next_temp); - } - - return output; -} - -at::Tensor left_pool_backward(at::Tensor input, at::Tensor grad_output) { - auto output = at::zeros_like(input); - - int32_t batch = input.size(0); - int32_t channel = input.size(1); - int32_t height = input.size(2); - int32_t width = input.size(3); - - auto max_val = torch::zeros({batch, channel, height}, - at::device(at::kCUDA).dtype(at::kFloat)); - auto max_ind = torch::zeros({batch, channel, height}, - 
at::device(at::kCUDA).dtype(at::kLong)); - - auto input_temp = input.select(3, width - 1); - max_val.copy_(input_temp); - - max_ind.fill_(width - 1); - - auto output_temp = output.select(3, width - 1); - auto grad_output_temp = grad_output.select(3, width - 1); - output_temp.copy_(grad_output_temp); - - auto un_max_ind = max_ind.unsqueeze(3); - auto gt_mask = torch::zeros({batch, channel, height}, - at::device(at::kCUDA).dtype(at::kBool)); - auto max_temp = torch::zeros({batch, channel, height}, - at::device(at::kCUDA).dtype(at::kFloat)); - for (int32_t ind = 1; ind < width; ++ind) { - input_temp = input.select(3, width - ind - 1); - at::gt_out(gt_mask, input_temp, max_val); - - at::masked_select_out(max_temp, input_temp, gt_mask); - max_val.masked_scatter_(gt_mask, max_temp); - max_ind.masked_fill_(gt_mask, width - ind - 1); - - grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3); - output.scatter_add_(3, un_max_ind, grad_output_temp); - } - - return output; -} - -at::Tensor right_pool_forward(at::Tensor input) { - // Initialize output - at::Tensor output = at::zeros_like(input); - - // Get width - int64_t width = input.size(3); - - output.copy_(input); - - for (int64_t ind = 1; ind < width; ind <<= 1) { - at::Tensor max_temp = at::slice(output, 3, ind, width); - at::Tensor cur_temp = at::slice(output, 3, ind, width).clone(); - at::Tensor next_temp = at::slice(output, 3, 0, width - ind).clone(); - at::max_out(max_temp, cur_temp, next_temp); - } - - return output; -} - -at::Tensor right_pool_backward(at::Tensor input, at::Tensor grad_output) { - at::Tensor output = at::zeros_like(input); - - int32_t batch = input.size(0); - int32_t channel = input.size(1); - int32_t height = input.size(2); - int32_t width = input.size(3); - - auto max_val = torch::zeros({batch, channel, height}, - at::device(at::kCUDA).dtype(at::kFloat)); - auto max_ind = torch::zeros({batch, channel, height}, - at::device(at::kCUDA).dtype(at::kLong)); - - auto input_temp = input.select(3, 0); - max_val.copy_(input_temp); - - max_ind.fill_(0); - - auto output_temp = output.select(3, 0); - auto grad_output_temp = grad_output.select(3, 0); - output_temp.copy_(grad_output_temp); - - auto un_max_ind = max_ind.unsqueeze(3); - auto gt_mask = torch::zeros({batch, channel, height}, - at::device(at::kCUDA).dtype(at::kBool)); - auto max_temp = torch::zeros({batch, channel, height}, - at::device(at::kCUDA).dtype(at::kFloat)); - for (int32_t ind = 0; ind < width - 1; ++ind) { - input_temp = input.select(3, ind + 1); - at::gt_out(gt_mask, input_temp, max_val); - - at::masked_select_out(max_temp, input_temp, gt_mask); - max_val.masked_scatter_(gt_mask, max_temp); - max_ind.masked_fill_(gt_mask, ind + 1); - - grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3); - output.scatter_add_(3, un_max_ind, grad_output_temp); - } - - return output; -} - -at::Tensor top_pool_forward(at::Tensor input) { - // Initialize output - at::Tensor output = at::zeros_like(input); - - // Get height - int64_t height = input.size(2); - - output.copy_(input); - - for (int64_t ind = 1; ind < height; ind <<= 1) { - at::Tensor max_temp = at::slice(output, 2, 0, height - ind); - at::Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone(); - at::Tensor next_temp = at::slice(output, 2, ind, height).clone(); - at::max_out(max_temp, cur_temp, next_temp); - } - - return output; -} - -at::Tensor top_pool_backward(at::Tensor input, at::Tensor grad_output) { - auto output = at::zeros_like(input); - - int32_t batch = input.size(0); - 
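  // Backward bookkeeping: max_val holds the running maximum along the pooled
  // (height) dimension, max_ind the row index that produced it, and each
  // incoming gradient slice is scatter-added onto that winning row, so the
  // gradient flows only to the inputs selected by the forward running max.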
int32_t channel = input.size(1); - int32_t height = input.size(2); - int32_t width = input.size(3); - - auto max_val = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kFloat)); - auto max_ind = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kLong)); - - auto input_temp = input.select(2, height - 1); - max_val.copy_(input_temp); - - max_ind.fill_(height - 1); - - auto output_temp = output.select(2, height - 1); - auto grad_output_temp = grad_output.select(2, height - 1); - output_temp.copy_(grad_output_temp); - - auto un_max_ind = max_ind.unsqueeze(2); - auto gt_mask = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kBool)); - auto max_temp = torch::zeros({batch, channel, width}, - at::device(at::kCUDA).dtype(at::kFloat)); - for (int32_t ind = 1; ind < height; ++ind) { - input_temp = input.select(2, height - ind - 1); - at::gt_out(gt_mask, input_temp, max_val); - - at::masked_select_out(max_temp, input_temp, gt_mask); - max_val.masked_scatter_(gt_mask, max_temp); - max_ind.masked_fill_(gt_mask, height - ind - 1); - - grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2); - output.scatter_add_(2, un_max_ind, grad_output_temp); - } - - return output; -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("bottom_pool_forward", &bottom_pool_forward, "Bottom Pool Forward", - py::call_guard()); - m.def("bottom_pool_backward", &bottom_pool_backward, "Bottom Pool Backward", - py::call_guard()); - m.def("left_pool_forward", &left_pool_forward, "Left Pool Forward", - py::call_guard()); - m.def("left_pool_backward", &left_pool_backward, "Left Pool Backward", - py::call_guard()); - m.def("right_pool_forward", &right_pool_forward, "Right Pool Forward", - py::call_guard()); - m.def("right_pool_backward", &right_pool_backward, "Right Pool Backward", - py::call_guard()); - m.def("top_pool_forward", &top_pool_forward, "Top Pool Forward", - py::call_guard()); - m.def("top_pool_backward", &top_pool_backward, "Top Pool Backward", - py::call_guard()); -} diff --git a/mmdet/ops/dcn/__init__.py b/mmdet/ops/dcn/__init__.py deleted file mode 100644 index 79594c90b28..00000000000 --- a/mmdet/ops/dcn/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .deform_conv import (DeformConv, DeformConvPack, ModulatedDeformConv, - ModulatedDeformConvPack, deform_conv, - modulated_deform_conv) -from .deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, - ModulatedDeformRoIPoolingPack, deform_roi_pooling) - -__all__ = [ - 'DeformConv', 'DeformConvPack', 'ModulatedDeformConv', - 'ModulatedDeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', - 'ModulatedDeformRoIPoolingPack', 'deform_conv', 'modulated_deform_conv', - 'deform_roi_pooling' -] diff --git a/mmdet/ops/dcn/deform_conv.py b/mmdet/ops/dcn/deform_conv.py deleted file mode 100644 index 00766988e4e..00000000000 --- a/mmdet/ops/dcn/deform_conv.py +++ /dev/null @@ -1,460 +0,0 @@ -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import CONV_LAYERS -from mmcv.utils import print_log -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair, _single - -from . 
import deform_conv_ext - - -class DeformConvFunction(Function): - - @staticmethod - def forward(ctx, - input, - offset, - weight, - stride=1, - padding=0, - dilation=1, - groups=1, - deformable_groups=1, - im2col_step=64): - if input is not None and input.dim() != 4: - raise ValueError(f'Expected 4D tensor as input, got {input.dim()}' - 'D tensor instead.') - ctx.stride = _pair(stride) - ctx.padding = _pair(padding) - ctx.dilation = _pair(dilation) - ctx.groups = groups - ctx.deformable_groups = deformable_groups - ctx.im2col_step = im2col_step - - ctx.save_for_backward(input, offset, weight) - - output = input.new_empty( - DeformConvFunction._output_size(input, weight, ctx.padding, - ctx.dilation, ctx.stride)) - - ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones - - if not input.is_cuda: - raise NotImplementedError - else: - cur_im2col_step = min(ctx.im2col_step, input.shape[0]) - assert (input.shape[0] % - cur_im2col_step) == 0, 'im2col step must divide batchsize' - deform_conv_ext.deform_conv_forward( - input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], - weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], - ctx.padding[1], ctx.padding[0], ctx.dilation[1], - ctx.dilation[0], ctx.groups, ctx.deformable_groups, - cur_im2col_step) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - input, offset, weight = ctx.saved_tensors - - grad_input = grad_offset = grad_weight = None - - if not grad_output.is_cuda: - raise NotImplementedError - else: - cur_im2col_step = min(ctx.im2col_step, input.shape[0]) - assert (input.shape[0] % - cur_im2col_step) == 0, 'im2col step must divide batchsize' - - if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: - grad_input = torch.zeros_like(input) - grad_offset = torch.zeros_like(offset) - deform_conv_ext.deform_conv_backward_input( - input, offset, grad_output, grad_input, - grad_offset, weight, ctx.bufs_[0], weight.size(3), - weight.size(2), ctx.stride[1], ctx.stride[0], - ctx.padding[1], ctx.padding[0], ctx.dilation[1], - ctx.dilation[0], ctx.groups, ctx.deformable_groups, - cur_im2col_step) - - if ctx.needs_input_grad[2]: - grad_weight = torch.zeros_like(weight) - deform_conv_ext.deform_conv_backward_parameters( - input, offset, grad_output, - grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), - weight.size(2), ctx.stride[1], ctx.stride[0], - ctx.padding[1], ctx.padding[0], ctx.dilation[1], - ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, - cur_im2col_step) - - return (grad_input, grad_offset, grad_weight, None, None, None, None, - None) - - @staticmethod - def _output_size(input, weight, padding, dilation, stride): - channels = weight.size(0) - output_size = (input.size(0), channels) - for d in range(input.dim() - 2): - in_size = input.size(d + 2) - pad = padding[d] - kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 - stride_ = stride[d] - output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) - if not all(map(lambda s: s > 0, output_size)): - raise ValueError('convolution input is too small (output would be ' - f'{"x".join(map(str, output_size))})') - return output_size - - -class ModulatedDeformConvFunction(Function): - - @staticmethod - def forward(ctx, - input, - offset, - mask, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - deformable_groups=1): - ctx.stride = stride - ctx.padding = padding - ctx.dilation = dilation - ctx.groups = groups - ctx.deformable_groups = deformable_groups - ctx.with_bias = bias is not None - 
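        # The CUDA extension expects a tensor for every argument, so when bias
        # is disabled a one-element placeholder is created below and with_bias
        # tells the kernel to ignore it.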
if not ctx.with_bias: - bias = input.new_empty(1) # fake tensor - if not input.is_cuda: - raise NotImplementedError - if weight.requires_grad or mask.requires_grad or offset.requires_grad \ - or input.requires_grad: - ctx.save_for_backward(input, offset, mask, weight, bias) - output = input.new_empty( - ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) - ctx._bufs = [input.new_empty(0), input.new_empty(0)] - deform_conv_ext.modulated_deform_conv_forward( - input, weight, bias, ctx._bufs[0], offset, mask, output, - ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride, - ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, - ctx.groups, ctx.deformable_groups, ctx.with_bias) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - if not grad_output.is_cuda: - raise NotImplementedError - input, offset, mask, weight, bias = ctx.saved_tensors - grad_input = torch.zeros_like(input) - grad_offset = torch.zeros_like(offset) - grad_mask = torch.zeros_like(mask) - grad_weight = torch.zeros_like(weight) - grad_bias = torch.zeros_like(bias) - deform_conv_ext.modulated_deform_conv_backward( - input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], - grad_input, grad_weight, grad_bias, grad_offset, grad_mask, - grad_output, weight.shape[2], weight.shape[3], ctx.stride, - ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, - ctx.groups, ctx.deformable_groups, ctx.with_bias) - if not ctx.with_bias: - grad_bias = None - - return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, - None, None, None, None, None) - - @staticmethod - def _infer_shape(ctx, input, weight): - n = input.size(0) - channels_out = weight.size(0) - height, width = input.shape[2:4] - kernel_h, kernel_w = weight.shape[2:4] - # TODO: support different padding/stride/dilation in height and width - height_out = (height + 2 * ctx.padding - - (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 - width_out = (width + 2 * ctx.padding - - (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 - return n, channels_out, height_out, width_out - - -deform_conv = DeformConvFunction.apply -modulated_deform_conv = ModulatedDeformConvFunction.apply - - -class DeformConv(nn.Module): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - deformable_groups=1, - bias=False): - super(DeformConv, self).__init__() - - assert not bias - assert in_channels % groups == 0, \ - f'in_channels {in_channels} is not divisible by groups {groups}' - assert out_channels % groups == 0, \ - f'out_channels {out_channels} is not divisible ' \ - f'by groups {groups}' - - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = _pair(kernel_size) - self.stride = _pair(stride) - self.padding = _pair(padding) - self.dilation = _pair(dilation) - self.groups = groups - self.deformable_groups = deformable_groups - # enable compatibility with nn.Conv2d - self.transposed = False - self.output_padding = _single(0) - - self.weight = nn.Parameter( - torch.Tensor(out_channels, in_channels // self.groups, - *self.kernel_size)) - - self.reset_parameters() - - def reset_parameters(self): - n = self.in_channels - for k in self.kernel_size: - n *= k - stdv = 1. 
/ math.sqrt(n) - self.weight.data.uniform_(-stdv, stdv) - - def forward(self, x, offset): - # To fix an assert error in deform_conv_cuda.cpp:128 - # input image is smaller than kernel - input_pad = ( - x.size(2) < self.kernel_size[0] or x.size(3) < self.kernel_size[1]) - if input_pad: - pad_h = max(self.kernel_size[0] - x.size(2), 0) - pad_w = max(self.kernel_size[1] - x.size(3), 0) - x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous() - offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', - 0).contiguous() - out = deform_conv(x, offset, self.weight, self.stride, self.padding, - self.dilation, self.groups, self.deformable_groups) - if input_pad: - out = out[:, :, :out.size(2) - pad_h, :out.size(3) - - pad_w].contiguous() - return out - - -@CONV_LAYERS.register_module(name='DCN') -class DeformConvPack(DeformConv): - """A Deformable Conv Encapsulation that acts as normal Conv layers. - - The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. - The spatial arrangement is like: - ``` - (x0, y0) (x1, y1) (x2, y2) - (x3, y3) (x4, y4) (x5, y5) - (x6, y6) (x7, y7) (x8, y8) - ``` - - Args: - in_channels (int): Same as nn.Conv2d. - out_channels (int): Same as nn.Conv2d. - kernel_size (int or tuple[int]): Same as nn.Conv2d. - stride (int or tuple[int]): Same as nn.Conv2d. - padding (int or tuple[int]): Same as nn.Conv2d. - dilation (int or tuple[int]): Same as nn.Conv2d. - groups (int): Same as nn.Conv2d. - bias (bool or str): If specified as `auto`, it will be decided by the - norm_cfg. Bias will be set as True if norm_cfg is None, otherwise - False. - """ - - _version = 2 - - def __init__(self, *args, **kwargs): - super(DeformConvPack, self).__init__(*args, **kwargs) - - self.conv_offset = nn.Conv2d( - self.in_channels, - self.deformable_groups * 2 * self.kernel_size[0] * - self.kernel_size[1], - kernel_size=self.kernel_size, - stride=_pair(self.stride), - padding=_pair(self.padding), - dilation=_pair(self.dilation), - bias=True) - self.init_offset() - - def init_offset(self): - self.conv_offset.weight.data.zero_() - self.conv_offset.bias.data.zero_() - - def forward(self, x): - offset = self.conv_offset(x) - return deform_conv(x, offset, self.weight, self.stride, self.padding, - self.dilation, self.groups, self.deformable_groups) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) - - if version is None or version < 2: - # the key is different in early versions - # In version < 2, DeformConvPack loads previous benchmark models. 
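            # Those checkpoints keep the offset branch under '<module>_offset.*';
            # rename such keys to the current 'conv_offset.*' layout before
            # delegating to the parent loader.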
- if (prefix + 'conv_offset.weight' not in state_dict - and prefix[:-1] + '_offset.weight' in state_dict): - state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( - prefix[:-1] + '_offset.weight') - if (prefix + 'conv_offset.bias' not in state_dict - and prefix[:-1] + '_offset.bias' in state_dict): - state_dict[prefix + - 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + - '_offset.bias') - - if version is not None and version > 1: - print_log( - f'DeformConvPack {prefix.rstrip(".")} is upgraded to ' - 'version 2.', - logger='root') - - super()._load_from_state_dict(state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, - error_msgs) - - -class ModulatedDeformConv(nn.Module): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - deformable_groups=1, - bias=True): - super(ModulatedDeformConv, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = _pair(kernel_size) - self.stride = stride - self.padding = padding - self.dilation = dilation - self.groups = groups - self.deformable_groups = deformable_groups - self.with_bias = bias - # enable compatibility with nn.Conv2d - self.transposed = False - self.output_padding = _single(0) - - self.weight = nn.Parameter( - torch.Tensor(out_channels, in_channels // groups, - *self.kernel_size)) - if bias: - self.bias = nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter('bias', None) - self.init_weights() - - def init_weights(self): - n = self.in_channels - for k in self.kernel_size: - n *= k - stdv = 1. / math.sqrt(n) - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.zero_() - - def forward(self, x, offset, mask): - return modulated_deform_conv(x, offset, mask, self.weight, self.bias, - self.stride, self.padding, self.dilation, - self.groups, self.deformable_groups) - - -@CONV_LAYERS.register_module(name='DCNv2') -class ModulatedDeformConvPack(ModulatedDeformConv): - """A ModulatedDeformable Conv Encapsulation that acts as normal Conv - layers. - - Args: - in_channels (int): Same as nn.Conv2d. - out_channels (int): Same as nn.Conv2d. - kernel_size (int or tuple[int]): Same as nn.Conv2d. - stride (int): Same as nn.Conv2d, while tuple is not supported. - padding (int): Same as nn.Conv2d, while tuple is not supported. - dilation (int): Same as nn.Conv2d, while tuple is not supported. - groups (int): Same as nn.Conv2d. - bias (bool or str): If specified as `auto`, it will be decided by the - norm_cfg. Bias will be set as True if norm_cfg is None, otherwise - False. 
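    Example:
        A minimal usage sketch (assumes a CUDA build of deform_conv_ext and a
        GPU-resident input):

        >>> import torch
        >>> dcn = ModulatedDeformConvPack(16, 32, kernel_size=3, padding=1).cuda()
        >>> x = torch.rand(2, 16, 28, 28).cuda()
        >>> y = dcn(x)  # offsets and masks come from the internal conv_offset; y: (2, 32, 28, 28)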
- """ - - _version = 2 - - def __init__(self, *args, **kwargs): - super(ModulatedDeformConvPack, self).__init__(*args, **kwargs) - - self.conv_offset = nn.Conv2d( - self.in_channels, - self.deformable_groups * 3 * self.kernel_size[0] * - self.kernel_size[1], - kernel_size=self.kernel_size, - stride=_pair(self.stride), - padding=_pair(self.padding), - dilation=_pair(self.dilation), - bias=True) - self.init_weights() - - def init_weights(self): - super(ModulatedDeformConvPack, self).init_weights() - if hasattr(self, 'conv_offset'): - self.conv_offset.weight.data.zero_() - self.conv_offset.bias.data.zero_() - - def forward(self, x): - out = self.conv_offset(x) - o1, o2, mask = torch.chunk(out, 3, dim=1) - offset = torch.cat((o1, o2), dim=1) - mask = torch.sigmoid(mask) - return modulated_deform_conv(x, offset, mask, self.weight, self.bias, - self.stride, self.padding, self.dilation, - self.groups, self.deformable_groups) - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) - - if version is None or version < 2: - # the key is different in early versions - # In version < 2, ModulatedDeformConvPack - # loads previous benchmark models. - if (prefix + 'conv_offset.weight' not in state_dict - and prefix[:-1] + '_offset.weight' in state_dict): - state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( - prefix[:-1] + '_offset.weight') - if (prefix + 'conv_offset.bias' not in state_dict - and prefix[:-1] + '_offset.bias' in state_dict): - state_dict[prefix + - 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + - '_offset.bias') - - if version is not None and version > 1: - print_log( - f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to ' - 'version 2.', - logger='root') - - super()._load_from_state_dict(state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, - error_msgs) diff --git a/mmdet/ops/dcn/deform_pool.py b/mmdet/ops/dcn/deform_pool.py deleted file mode 100644 index a0ccd60734d..00000000000 --- a/mmdet/ops/dcn/deform_pool.py +++ /dev/null @@ -1,258 +0,0 @@ -import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from . 
import deform_pool_ext - - -class DeformRoIPoolingFunction(Function): - - @staticmethod - def forward(ctx, - data, - rois, - offset, - spatial_scale, - out_size, - out_channels, - no_trans, - group_size=1, - part_size=None, - sample_per_part=4, - trans_std=.0): - # TODO: support unsquare RoIs - out_h, out_w = _pair(out_size) - assert isinstance(out_h, int) and isinstance(out_w, int) - assert out_h == out_w - out_size = out_h # out_h and out_w must be equal - - ctx.spatial_scale = spatial_scale - ctx.out_size = out_size - ctx.out_channels = out_channels - ctx.no_trans = no_trans - ctx.group_size = group_size - ctx.part_size = out_size if part_size is None else part_size - ctx.sample_per_part = sample_per_part - ctx.trans_std = trans_std - - assert 0.0 <= ctx.trans_std <= 1.0 - if not data.is_cuda: - raise NotImplementedError - - n = rois.shape[0] - output = data.new_empty(n, out_channels, out_size, out_size) - output_count = data.new_empty(n, out_channels, out_size, out_size) - deform_pool_ext.deform_psroi_pooling_forward( - data, rois, offset, output, output_count, ctx.no_trans, - ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, - ctx.part_size, ctx.sample_per_part, ctx.trans_std) - - if data.requires_grad or rois.requires_grad or offset.requires_grad: - ctx.save_for_backward(data, rois, offset) - ctx.output_count = output_count - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - if not grad_output.is_cuda: - raise NotImplementedError - - data, rois, offset = ctx.saved_tensors - output_count = ctx.output_count - grad_input = torch.zeros_like(data) - grad_rois = None - grad_offset = torch.zeros_like(offset) - - deform_pool_ext.deform_psroi_pooling_backward( - grad_output, data, rois, offset, output_count, grad_input, - grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, - ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, - ctx.trans_std) - return (grad_input, grad_rois, grad_offset, None, None, None, None, - None, None, None, None) - - -deform_roi_pooling = DeformRoIPoolingFunction.apply - - -class DeformRoIPooling(nn.Module): - - def __init__(self, - spatial_scale, - out_size, - out_channels, - no_trans, - group_size=1, - part_size=None, - sample_per_part=4, - trans_std=.0): - super(DeformRoIPooling, self).__init__() - self.spatial_scale = spatial_scale - self.out_size = _pair(out_size) - self.out_channels = out_channels - self.no_trans = no_trans - self.group_size = group_size - self.part_size = out_size if part_size is None else part_size - self.sample_per_part = sample_per_part - self.trans_std = trans_std - - def forward(self, data, rois, offset): - if self.no_trans: - offset = data.new_empty(0) - return deform_roi_pooling(data, rois, offset, self.spatial_scale, - self.out_size, self.out_channels, - self.no_trans, self.group_size, - self.part_size, self.sample_per_part, - self.trans_std) - - -class DeformRoIPoolingPack(DeformRoIPooling): - - def __init__(self, - spatial_scale, - out_size, - out_channels, - no_trans, - group_size=1, - part_size=None, - sample_per_part=4, - trans_std=.0, - num_offset_fcs=3, - deform_fc_channels=1024): - super(DeformRoIPoolingPack, - self).__init__(spatial_scale, out_size, out_channels, no_trans, - group_size, part_size, sample_per_part, trans_std) - - self.num_offset_fcs = num_offset_fcs - self.deform_fc_channels = deform_fc_channels - - if not no_trans: - seq = [] - ic = self.out_size[0] * self.out_size[1] * self.out_channels - for i in range(self.num_offset_fcs): - 
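                # Hidden layers of the offset branch use deform_fc_channels
                # units; the final layer emits a flattened 2 * out_h * out_w
                # offset map that forward() reshapes to (n, 2, out_h, out_w).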
if i < self.num_offset_fcs - 1: - oc = self.deform_fc_channels - else: - oc = self.out_size[0] * self.out_size[1] * 2 - seq.append(nn.Linear(ic, oc)) - ic = oc - if i < self.num_offset_fcs - 1: - seq.append(nn.ReLU(inplace=True)) - self.offset_fc = nn.Sequential(*seq) - self.offset_fc[-1].weight.data.zero_() - self.offset_fc[-1].bias.data.zero_() - - def forward(self, data, rois): - assert data.size(1) == self.out_channels - n = rois.shape[0] - if n == 0: - return data.new_empty(n, self.out_channels, self.out_size[0], - self.out_size[1]) - if self.no_trans: - offset = data.new_empty(0) - return deform_roi_pooling(data, rois, offset, self.spatial_scale, - self.out_size, self.out_channels, - self.no_trans, self.group_size, - self.part_size, self.sample_per_part, - self.trans_std) - else: - offset = data.new_empty(0) - x = deform_roi_pooling(data, rois, offset, self.spatial_scale, - self.out_size, self.out_channels, True, - self.group_size, self.part_size, - self.sample_per_part, self.trans_std) - offset = self.offset_fc(x.view(n, -1)) - offset = offset.view(n, 2, self.out_size[0], self.out_size[1]) - return deform_roi_pooling(data, rois, offset, self.spatial_scale, - self.out_size, self.out_channels, - self.no_trans, self.group_size, - self.part_size, self.sample_per_part, - self.trans_std) - - -class ModulatedDeformRoIPoolingPack(DeformRoIPooling): - - def __init__(self, - spatial_scale, - out_size, - out_channels, - no_trans, - group_size=1, - part_size=None, - sample_per_part=4, - trans_std=.0, - num_offset_fcs=3, - num_mask_fcs=2, - deform_fc_channels=1024): - super(ModulatedDeformRoIPoolingPack, - self).__init__(spatial_scale, out_size, out_channels, no_trans, - group_size, part_size, sample_per_part, trans_std) - - self.num_offset_fcs = num_offset_fcs - self.num_mask_fcs = num_mask_fcs - self.deform_fc_channels = deform_fc_channels - - if not no_trans: - offset_fc_seq = [] - ic = self.out_size[0] * self.out_size[1] * self.out_channels - for i in range(self.num_offset_fcs): - if i < self.num_offset_fcs - 1: - oc = self.deform_fc_channels - else: - oc = self.out_size[0] * self.out_size[1] * 2 - offset_fc_seq.append(nn.Linear(ic, oc)) - ic = oc - if i < self.num_offset_fcs - 1: - offset_fc_seq.append(nn.ReLU(inplace=True)) - self.offset_fc = nn.Sequential(*offset_fc_seq) - self.offset_fc[-1].weight.data.zero_() - self.offset_fc[-1].bias.data.zero_() - - mask_fc_seq = [] - ic = self.out_size[0] * self.out_size[1] * self.out_channels - for i in range(self.num_mask_fcs): - if i < self.num_mask_fcs - 1: - oc = self.deform_fc_channels - else: - oc = self.out_size[0] * self.out_size[1] - mask_fc_seq.append(nn.Linear(ic, oc)) - ic = oc - if i < self.num_mask_fcs - 1: - mask_fc_seq.append(nn.ReLU(inplace=True)) - else: - mask_fc_seq.append(nn.Sigmoid()) - self.mask_fc = nn.Sequential(*mask_fc_seq) - self.mask_fc[-2].weight.data.zero_() - self.mask_fc[-2].bias.data.zero_() - - def forward(self, data, rois): - assert data.size(1) == self.out_channels - n = rois.shape[0] - if n == 0: - return data.new_empty(n, self.out_channels, self.out_size[0], - self.out_size[1]) - if self.no_trans: - offset = data.new_empty(0) - return deform_roi_pooling(data, rois, offset, self.spatial_scale, - self.out_size, self.out_channels, - self.no_trans, self.group_size, - self.part_size, self.sample_per_part, - self.trans_std) - else: - offset = data.new_empty(0) - x = deform_roi_pooling(data, rois, offset, self.spatial_scale, - self.out_size, self.out_channels, True, - self.group_size, self.part_size, - 
self.sample_per_part, self.trans_std) - offset = self.offset_fc(x.view(n, -1)) - offset = offset.view(n, 2, self.out_size[0], self.out_size[1]) - mask = self.mask_fc(x.view(n, -1)) - mask = mask.view(n, 1, self.out_size[0], self.out_size[1]) - return deform_roi_pooling( - data, rois, offset, self.spatial_scale, self.out_size, - self.out_channels, self.no_trans, self.group_size, - self.part_size, self.sample_per_part, self.trans_std) * mask diff --git a/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp b/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp deleted file mode 100644 index a81045e18d7..00000000000 --- a/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp +++ /dev/null @@ -1,686 +0,0 @@ -// modify from -// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c - -#include -#include - -#include -#include - -void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, - const int channels, const int height, const int width, - const int ksize_h, const int ksize_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int parallel_imgs, const int deformable_group, - at::Tensor data_col); - -void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, - const int channels, const int height, const int width, - const int ksize_h, const int ksize_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int parallel_imgs, const int deformable_group, - at::Tensor grad_im); - -void deformable_col2im_coord( - const at::Tensor data_col, const at::Tensor data_im, - const at::Tensor data_offset, const int channels, const int height, - const int width, const int ksize_h, const int ksize_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, const int parallel_imgs, - const int deformable_group, at::Tensor grad_offset); - -void modulated_deformable_im2col_cuda( - const at::Tensor data_im, const at::Tensor data_offset, - const at::Tensor data_mask, const int batch_size, const int channels, - const int height_im, const int width_im, const int height_col, - const int width_col, const int kernel_h, const int kenerl_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, const int deformable_group, - at::Tensor data_col); - -void modulated_deformable_col2im_cuda( - const at::Tensor data_col, const at::Tensor data_offset, - const at::Tensor data_mask, const int batch_size, const int channels, - const int height_im, const int width_im, const int height_col, - const int width_col, const int kernel_h, const int kenerl_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, const int deformable_group, - at::Tensor grad_im); - -void modulated_deformable_col2im_coord_cuda( - const at::Tensor data_col, const at::Tensor data_im, - const at::Tensor data_offset, const at::Tensor data_mask, - const int batch_size, const int channels, const int height_im, - const int width_im, const int height_col, const int width_col, - const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, const int dilation_h, - const int dilation_w, const int deformable_group, at::Tensor grad_offset, - at::Tensor grad_mask); - -void 
shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, - at::Tensor weight, int kH, int kW, int dH, int dW, int padH, - int padW, int dilationH, int dilationW, int group, - int deformable_group) { - TORCH_CHECK(weight.ndimension() == 4, - "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " - "but got: %s", - weight.ndimension()); - - TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); - - TORCH_CHECK(kW > 0 && kH > 0, - "kernel size should be greater than zero, but got kH: %d kW: %d", kH, - kW); - - TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), - "kernel size should be consistent with weight, ", - "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, - kW, weight.size(2), weight.size(3)); - - TORCH_CHECK(dW > 0 && dH > 0, - "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - - TORCH_CHECK( - dilationW > 0 && dilationH > 0, - "dilation should be greater than 0, but got dilationH: %d dilationW: %d", - dilationH, dilationW); - - int ndim = input.ndimension(); - int dimf = 0; - int dimh = 1; - int dimw = 2; - - if (ndim == 4) { - dimf++; - dimh++; - dimw++; - } - - TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", - ndim); - - long nInputPlane = weight.size(1) * group; - long inputHeight = input.size(dimh); - long inputWidth = input.size(dimw); - long nOutputPlane = weight.size(0); - long outputHeight = - (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - long outputWidth = - (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - - TORCH_CHECK(nInputPlane % deformable_group == 0, - "input channels must divide deformable group size"); - - if (outputWidth < 1 || outputHeight < 1) - AT_ERROR( - "Given input size: (%ld x %ld x %ld). " - "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", - nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, - outputWidth); - - TORCH_CHECK(input.size(1) == nInputPlane, - "invalid number of input planes, expected: %d, but got: %d", - nInputPlane, input.size(1)); - - TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), - "input image is smaller than kernel"); - - TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), - "invalid spatial size of offset, expected height: %d width: %d, but " - "got height: %d width: %d", - outputHeight, outputWidth, offset.size(2), offset.size(3)); - - TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), - "invalid number of channels of offset"); - - if (gradOutput != NULL) { - TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane, - "invalid number of gradOutput planes, expected: %d, but got: %d", - nOutputPlane, gradOutput->size(dimf)); - - TORCH_CHECK((gradOutput->size(dimh) == outputHeight && - gradOutput->size(dimw) == outputWidth), - "invalid size of gradOutput, expected height: %d width: %d , but " - "got height: %d width: %d", - outputHeight, outputWidth, gradOutput->size(dimh), - gradOutput->size(dimw)); - } -} - -int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, - at::Tensor offset, at::Tensor output, - at::Tensor columns, at::Tensor ones, int kW, - int kH, int dW, int dH, int padW, int padH, - int dilationW, int dilationH, int group, - int deformable_group, int im2col_step) { - // todo: resize columns to include im2col: done - // todo: add im2col_step as input - // todo: add new output buffer and transpose it to output (or directly - // transpose output) todo: possibly change data indexing because of - // parallel_imgs - - shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, - dilationH, dilationW, group, deformable_group); - at::DeviceGuard guard(input.device()); - - input = input.contiguous(); - offset = offset.contiguous(); - weight = weight.contiguous(); - - int batch = 1; - if (input.ndimension() == 3) { - // Force batch - batch = 0; - input.unsqueeze_(0); - offset.unsqueeze_(0); - } - - // todo: assert batchsize dividable by im2col_step - - long batchSize = input.size(0); - long nInputPlane = input.size(1); - long inputHeight = input.size(2); - long inputWidth = input.size(3); - - long nOutputPlane = weight.size(0); - - long outputWidth = - (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight = - (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); - - output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, - outputHeight, outputWidth}); - columns = at::zeros( - {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, - input.options()); - - if (ones.ndimension() != 2 || - ones.size(0) * ones.size(1) < outputHeight * outputWidth) { - ones = at::ones({outputHeight, outputWidth}, input.options()); - } - - input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, - inputHeight, inputWidth}); - offset = - offset.view({batchSize / im2col_step, im2col_step, - deformable_group * 2 * kH * kW, outputHeight, outputWidth}); - - at::Tensor output_buffer = - at::zeros({batchSize / im2col_step, nOutputPlane, - im2col_step * outputHeight, outputWidth}, - output.options()); - - output_buffer = output_buffer.view( - {output_buffer.size(0), group, output_buffer.size(1) / group, - output_buffer.size(2), output_buffer.size(3)}); - - for (int elt = 0; elt 
< batchSize / im2col_step; elt++) { - deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, - inputWidth, kH, kW, padH, padW, dH, dW, dilationH, - dilationW, im2col_step, deformable_group, columns); - - columns = columns.view({group, columns.size(0) / group, columns.size(1)}); - weight = weight.view({group, weight.size(0) / group, weight.size(1), - weight.size(2), weight.size(3)}); - - for (int g = 0; g < group; g++) { - output_buffer[elt][g] = output_buffer[elt][g] - .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) - .view_as(output_buffer[elt][g]); - } - } - - output_buffer = output_buffer.view( - {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), - output_buffer.size(3), output_buffer.size(4)}); - - output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, - im2col_step, outputHeight, outputWidth}); - output_buffer.transpose_(1, 2); - output.copy_(output_buffer); - output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); - - input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); - offset = offset.view( - {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); - - if (batch == 0) { - output = output.view({nOutputPlane, outputHeight, outputWidth}); - input = input.view({nInputPlane, inputHeight, inputWidth}); - offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); - } - - return 1; -} - -int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, - at::Tensor gradOutput, at::Tensor gradInput, - at::Tensor gradOffset, at::Tensor weight, - at::Tensor columns, int kW, int kH, int dW, - int dH, int padW, int padH, int dilationW, - int dilationH, int group, - int deformable_group, int im2col_step) { - shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, - dilationH, dilationW, group, deformable_group); - at::DeviceGuard guard(input.device()); - - input = input.contiguous(); - offset = offset.contiguous(); - gradOutput = gradOutput.contiguous(); - weight = weight.contiguous(); - - int batch = 1; - - if (input.ndimension() == 3) { - // Force batch - batch = 0; - input = input.view({1, input.size(0), input.size(1), input.size(2)}); - offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); - gradOutput = gradOutput.view( - {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); - } - - long batchSize = input.size(0); - long nInputPlane = input.size(1); - long inputHeight = input.size(2); - long inputWidth = input.size(3); - - long nOutputPlane = weight.size(0); - - long outputWidth = - (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight = - (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); - gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); - columns = at::zeros( - {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, - input.options()); - - // change order of grad output - gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, - nOutputPlane, outputHeight, outputWidth}); - gradOutput.transpose_(1, 2); - - gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, - inputHeight, inputWidth}); - input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, - inputHeight, inputWidth}); - gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, - deformable_group * 2 * kH * kW, outputHeight, 
- outputWidth}); - offset = - offset.view({batchSize / im2col_step, im2col_step, - deformable_group * 2 * kH * kW, outputHeight, outputWidth}); - - for (int elt = 0; elt < batchSize / im2col_step; elt++) { - // divide into groups - columns = columns.view({group, columns.size(0) / group, columns.size(1)}); - weight = weight.view({group, weight.size(0) / group, weight.size(1), - weight.size(2), weight.size(3)}); - gradOutput = gradOutput.view( - {gradOutput.size(0), group, gradOutput.size(1) / group, - gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); - - for (int g = 0; g < group; g++) { - columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), - gradOutput[elt][g].flatten(1), 0.0f, 1.0f); - } - - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - gradOutput = gradOutput.view( - {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), - gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); - - deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, - inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - dilationH, dilationW, im2col_step, deformable_group, - gradOffset[elt]); - - deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, - inputWidth, kH, kW, padH, padW, dH, dW, dilationH, - dilationW, im2col_step, deformable_group, gradInput[elt]); - } - - gradOutput.transpose_(1, 2); - gradOutput = - gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); - - gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); - input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); - gradOffset = gradOffset.view( - {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); - offset = offset.view( - {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); - - if (batch == 0) { - gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); - input = input.view({nInputPlane, inputHeight, inputWidth}); - gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); - offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); - gradOffset = - gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); - } - - return 1; -} - -int deform_conv_backward_parameters_cuda( - at::Tensor input, at::Tensor offset, at::Tensor gradOutput, - at::Tensor gradWeight, // at::Tensor gradBias, - at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, - int padW, int padH, int dilationW, int dilationH, int group, - int deformable_group, float scale, int im2col_step) { - // todo: transpose and reshape outGrad - // todo: reshape columns - // todo: add im2col_step as input - - shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, - padW, dilationH, dilationW, group, deformable_group); - at::DeviceGuard guard(input.device()); - - input = input.contiguous(); - offset = offset.contiguous(); - gradOutput = gradOutput.contiguous(); - - int batch = 1; - - if (input.ndimension() == 3) { - // Force batch - batch = 0; - input = input.view( - at::IntList({1, input.size(0), input.size(1), input.size(2)})); - gradOutput = gradOutput.view( - {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); - } - - long batchSize = input.size(0); - long nInputPlane = input.size(1); - long inputHeight = input.size(2); - long inputWidth = input.size(3); - - long nOutputPlane = gradWeight.size(0); - - long outputWidth = - (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight 
= - (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); - - columns = at::zeros( - {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, - input.options()); - - gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, - nOutputPlane, outputHeight, outputWidth}); - gradOutput.transpose_(1, 2); - - at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); - gradOutputBuffer = - gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, - outputHeight, outputWidth}); - gradOutputBuffer = gradOutputBuffer.contiguous(); - gradOutputBuffer.copy_(gradOutput); - gradOutputBuffer = - gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, - im2col_step * outputHeight, outputWidth}); - - gradOutput.transpose_(1, 2); - gradOutput = - gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); - - input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, - inputHeight, inputWidth}); - offset = - offset.view({batchSize / im2col_step, im2col_step, - deformable_group * 2 * kH * kW, outputHeight, outputWidth}); - - for (int elt = 0; elt < batchSize / im2col_step; elt++) { - deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, - inputWidth, kH, kW, padH, padW, dH, dW, dilationH, - dilationW, im2col_step, deformable_group, columns); - - // divide into group - gradOutputBuffer = gradOutputBuffer.view( - {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, - gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); - columns = columns.view({group, columns.size(0) / group, columns.size(1)}); - gradWeight = - gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), - gradWeight.size(2), gradWeight.size(3)}); - - for (int g = 0; g < group; g++) { - gradWeight[g] = gradWeight[g] - .flatten(1) - .addmm_(gradOutputBuffer[elt][g].flatten(1), - columns[g].transpose(1, 0), 1.0, scale) - .view_as(gradWeight[g]); - } - gradOutputBuffer = gradOutputBuffer.view( - {gradOutputBuffer.size(0), - gradOutputBuffer.size(1) * gradOutputBuffer.size(2), - gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), - gradWeight.size(2), gradWeight.size(3), - gradWeight.size(4)}); - } - - input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); - offset = offset.view( - {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); - - if (batch == 0) { - gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); - input = input.view({nInputPlane, inputHeight, inputWidth}); - } - - return 1; -} - -void modulated_deform_conv_cuda_forward( - at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, - at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, - int kernel_h, int kernel_w, const int stride_h, const int stride_w, - const int pad_h, const int pad_w, const int dilation_h, - const int dilation_w, const int group, const int deformable_group, - const bool with_bias) { - TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); - at::DeviceGuard guard(input.device()); - - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); - - const int 
channels_out = weight.size(0); - const int channels_kernel = weight.size(1); - const int kernel_h_ = weight.size(2); - const int kernel_w_ = weight.size(3); - - if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) - AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", - kernel_h_, kernel_w, kernel_h_, kernel_w_); - if (channels != channels_kernel * group) - AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", - channels, channels_kernel * group); - - const int height_out = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int width_out = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - - if (ones.ndimension() != 2 || - ones.size(0) * ones.size(1) < height_out * width_out) { - // Resize plane and fill with ones... - ones = at::ones({height_out, width_out}, input.options()); - } - - // resize output - output = output.view({batch, channels_out, height_out, width_out}).zero_(); - // resize temporary columns - columns = - at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, - input.options()); - - output = output.view({output.size(0), group, output.size(1) / group, - output.size(2), output.size(3)}); - - for (int b = 0; b < batch; b++) { - modulated_deformable_im2col_cuda( - input[b], offset[b], mask[b], 1, channels, height, width, height_out, - width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, deformable_group, columns); - - // divide into group - weight = weight.view({group, weight.size(0) / group, weight.size(1), - weight.size(2), weight.size(3)}); - columns = columns.view({group, columns.size(0) / group, columns.size(1)}); - - for (int g = 0; g < group; g++) { - output[b][g] = output[b][g] - .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) - .view_as(output[b][g]); - } - - weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), - weight.size(3), weight.size(4)}); - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - } - - output = output.view({output.size(0), output.size(1) * output.size(2), - output.size(3), output.size(4)}); - - if (with_bias) { - output += bias.view({1, bias.size(0), 1, 1}); - } -} - -void modulated_deform_conv_cuda_backward( - at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, - at::Tensor offset, at::Tensor mask, at::Tensor columns, - at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, - at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, - int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, - int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, - const bool with_bias) { - TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); - at::DeviceGuard guard(input.device()); - - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); - - const int channels_kernel = weight.size(1); - const int kernel_h_ = weight.size(2); - const int kernel_w_ = weight.size(3); - if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) - AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", - kernel_h_, kernel_w, kernel_h_, kernel_w_); - if (channels != channels_kernel * group) - AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", - channels, channels_kernel * group); - - const int 
height_out = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int width_out = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - - if (ones.ndimension() != 2 || - ones.size(0) * ones.size(1) < height_out * width_out) { - // Resize plane and fill with ones... - ones = at::ones({height_out, width_out}, input.options()); - } - - grad_input = grad_input.view({batch, channels, height, width}); - columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, - input.options()); - - grad_output = - grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, - grad_output.size(2), grad_output.size(3)}); - - for (int b = 0; b < batch; b++) { - // divide int group - columns = columns.view({group, columns.size(0) / group, columns.size(1)}); - weight = weight.view({group, weight.size(0) / group, weight.size(1), - weight.size(2), weight.size(3)}); - - for (int g = 0; g < group; g++) { - columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), - grad_output[b][g].flatten(1), 0.0f, 1.0f); - } - - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), - weight.size(3), weight.size(4)}); - - // gradient w.r.t. input coordinate data - modulated_deformable_col2im_coord_cuda( - columns, input[b], offset[b], mask[b], 1, channels, height, width, - height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, - stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], - grad_mask[b]); - // gradient w.r.t. input data - modulated_deformable_col2im_cuda( - columns, offset[b], mask[b], 1, channels, height, width, height_out, - width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, deformable_group, grad_input[b]); - - // gradient w.r.t. weight, dWeight should accumulate across the batch and - // group - modulated_deformable_im2col_cuda( - input[b], offset[b], mask[b], 1, channels, height, width, height_out, - width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, deformable_group, columns); - - columns = columns.view({group, columns.size(0) / group, columns.size(1)}); - grad_weight = grad_weight.view({group, grad_weight.size(0) / group, - grad_weight.size(1), grad_weight.size(2), - grad_weight.size(3)}); - if (with_bias) - grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); - - for (int g = 0; g < group; g++) { - grad_weight[g] = - grad_weight[g] - .flatten(1) - .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) - .view_as(grad_weight[g]); - if (with_bias) { - grad_bias[g] = - grad_bias[g] - .view({-1, 1}) - .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) - .view(-1); - } - } - - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), - grad_weight.size(2), grad_weight.size(3), - grad_weight.size(4)}); - if (with_bias) - grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); - } - grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), - grad_output.size(2), grad_output.size(3), - grad_output.size(4)}); -} diff --git a/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu b/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu deleted file mode 100644 index 98752dccf8c..00000000000 --- a/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu +++ /dev/null @@ -1,867 +0,0 @@ -/*! 
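The backward above makes three passes per sample: `modulated_deformable_col2im_coord_cuda` for the offset and mask gradients, `modulated_deformable_col2im_cuda` for the input gradient, and a fresh `modulated_deformable_im2col_cuda` whose columns feed the weight and bias gradients. The bias branch, written as an `addmm_` against a ones vector, is simply a spatial sum of the output gradient; a one-line sketch with illustrative names:

```python
def grad_bias_per_group(grad_out_g):
    # grad_out_g: (C_out / groups, H_out * W_out) for one sample and one group.
    # grad_bias[g].view(-1, 1).addmm_(grad_output[b][g].flatten(1), ones.view(-1, 1))
    # multiplies by a ones vector, i.e. it sums over output locations.
    return grad_out_g.sum(dim=1)
```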
- ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** - * - * COPYRIGHT - * - * All contributions by the University of California: - * Copyright (c) 2014-2017 The Regents of the University of California (Regents) - * All rights reserved. - * - * All other contributions: - * Copyright (c) 2014-2017, the respective contributors - * All rights reserved. - * - * Caffe uses a shared copyright model: each contributor holds copyright over - * their contributions to Caffe. The project versioning records all such - * contribution and copyright details. If a contributor wants to further mark - * their specific copyright on a particular contribution, they should indicate - * their copyright solely in the commit message of the change when it is - * committed. - * - * LICENSE - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * CONTRIBUTION AGREEMENT - * - * By contributing to the BVLC/caffe repository through pull-request, comment, - * or otherwise, the contributor releases their content to the - * license and copyright terms herein. - * - ***************** END Caffe Copyright Notice and Disclaimer ******************** - * - * Copyright (c) 2018 Microsoft - * Licensed under The MIT License [see LICENSE for details] - * \file modulated_deformable_im2col.cuh - * \brief Function definitions of converting an image to - * column matrix based on kernel, padding, dilation, and offset. - * These functions are mainly used in deformable convolution operators. 
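The header above describes what the deleted kernels implement: the im2col transform of deformable convolution (arXiv:1703.06211), where every kernel tap samples the input at a learned fractional offset via bilinear interpolation. Below is a single-channel pure-Python reference of that transform, written from the kernel code that follows; it is illustrative only (no batching, no deformable groups) and every name is chosen here:

```python
import math
import torch

def bilinear(im, h, w):
    # Zero outside the image, as in deformable_im2col_bilinear.
    H, W = im.shape
    h0, w0 = math.floor(h), math.floor(w)
    lh, lw = h - h0, w - w0

    def at(y, x):
        return im[y, x].item() if 0 <= y < H and 0 <= x < W else 0.0

    return ((1 - lh) * (1 - lw) * at(h0, w0) + (1 - lh) * lw * at(h0, w0 + 1)
            + lh * (1 - lw) * at(h0 + 1, w0) + lh * lw * at(h0 + 1, w0 + 1))

def deform_im2col_1ch(im, offsets, k, stride=1, pad=0, dil=1):
    """im: (H, W) tensor; offsets: (2 * k * k, H_out, W_out) tensor holding
    a (dh, dw) pair per kernel tap and output location."""
    H, W = im.shape
    H_out = (H + 2 * pad - (dil * (k - 1) + 1)) // stride + 1
    W_out = (W + 2 * pad - (dil * (k - 1) + 1)) // stride + 1
    cols = torch.zeros(k * k, H_out * W_out)
    for hc in range(H_out):
        for wc in range(W_out):
            h_in, w_in = hc * stride - pad, wc * stride - pad
            for i in range(k):
                for j in range(k):
                    dh = offsets[2 * (i * k + j), hc, wc].item()
                    dw = offsets[2 * (i * k + j) + 1, hc, wc].item()
                    h, w = h_in + i * dil + dh, w_in + j * dil + dw
                    val = bilinear(im, h, w) if (-1 < h < H and -1 < w < W) else 0.0
                    cols[i * k + j, hc * W_out + wc] = val
    return cols
```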
- * \ref: https://arxiv.org/abs/1703.06211 - * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng - */ - -// modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu - -#include -#include -#include -#include -#include -#include - -using namespace at; - -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - i += blockDim.x * gridDim.x) - -const int CUDA_NUM_THREADS = 1024; -const int kMaxGridNum = 65535; - -inline int GET_BLOCKS(const int N) -{ - return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); -} - -template -__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, - const int height, const int width, scalar_t h, scalar_t w) -{ - - int h_low = floor(h); - int w_low = floor(w); - int h_high = h_low + 1; - int w_high = w_low + 1; - - scalar_t lh = h - h_low; - scalar_t lw = w - w_low; - scalar_t hh = 1 - lh, hw = 1 - lw; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - v1 = bottom_data[h_low * data_width + w_low]; - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - v2 = bottom_data[h_low * data_width + w_high]; - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - v3 = bottom_data[h_high * data_width + w_low]; - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - v4 = bottom_data[h_high * data_width + w_high]; - - scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - -template -__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, - const int h, const int w, const int height, const int width) -{ - - if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) - { - //empty - return 0; - } - - int argmax_h_low = floor(argmax_h); - int argmax_w_low = floor(argmax_w); - int argmax_h_high = argmax_h_low + 1; - int argmax_w_high = argmax_w_low + 1; - - scalar_t weight = 0; - if (h == argmax_h_low && w == argmax_w_low) - weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); - if (h == argmax_h_low && w == argmax_w_high) - weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); - if (h == argmax_h_high && w == argmax_w_low) - weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); - if (h == argmax_h_high && w == argmax_w_high) - weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); - return weight; -} - -template -__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, - const int height, const int width, const scalar_t *im_data, - const int data_width, const int bp_dir) -{ - - if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) - { - //empty - return 0; - } - - int argmax_h_low = floor(argmax_h); - int argmax_w_low = floor(argmax_w); - int argmax_h_high = argmax_h_low + 1; - int argmax_w_high = argmax_w_low + 1; - - scalar_t weight = 0; - - if (bp_dir == 0) - { - if (argmax_h_low >= 0 && argmax_w_low >= 0) - weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; - if (argmax_h_low >= 0 && argmax_w_high <= width - 1) - weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; - if (argmax_h_high <= height - 1 && argmax_w_low >= 0) - weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; - if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) - 
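`get_coordinate_weight` above (continued in the next chunk) is the derivative of a bilinear sample with respect to the sampling coordinate, which is what propagates the loss into the learned offsets. In closed form it is a difference of the interpolated rows or columns; a sketch with names chosen here:

```python
def bilinear_coordinate_grad(v00, v01, v10, v11, lh, lw):
    """d(value)/dh and d(value)/dw for value = bilinear sample of the four
    corner pixels v00 (top-left), v01 (top-right), v10 (bottom-left),
    v11 (bottom-right), with fractional offsets lh, lw from the top-left.
    Matches what get_coordinate_weight accumulates for bp_dir 0 and 1."""
    d_dh = (1 - lw) * (v10 - v00) + lw * (v11 - v01)
    d_dw = (1 - lh) * (v01 - v00) + lh * (v11 - v10)
    return d_dh, d_dw
```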
weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; - } - else if (bp_dir == 1) - { - if (argmax_h_low >= 0 && argmax_w_low >= 0) - weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; - if (argmax_h_low >= 0 && argmax_w_high <= width - 1) - weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; - if (argmax_h_high <= height - 1 && argmax_w_low >= 0) - weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; - if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) - weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; - } - - return weight; -} - -template -__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, const int channel_per_deformable_group, - const int batch_size, const int num_channels, const int deformable_group, - const int height_col, const int width_col, - scalar_t *data_col) -{ - CUDA_KERNEL_LOOP(index, n) - { - // index index of output matrix - const int w_col = index % width_col; - const int h_col = (index / width_col) % height_col; - const int b_col = (index / width_col / height_col) % batch_size; - const int c_im = (index / width_col / height_col) / batch_size; - const int c_col = c_im * kernel_h * kernel_w; - - // compute deformable group index - const int deformable_group_index = c_im / channel_per_deformable_group; - - const int h_in = h_col * stride_h - pad_h; - const int w_in = w_col * stride_w - pad_w; - scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; - //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; - const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; - const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; - - for (int i = 0; i < kernel_h; ++i) - { - for (int j = 0; j < kernel_w; ++j) - { - const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; - const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; - const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; - scalar_t val = static_cast(0); - const scalar_t h_im = h_in + i * dilation_h + offset_h; - const scalar_t w_im = w_in + j * dilation_w + offset_w; - if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) - { - //const scalar_t map_h = i * dilation_h + offset_h; - //const scalar_t map_w = j * dilation_w + offset_w; - //const int cur_height = height - h_in; - //const int cur_width = width - w_in; - //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); - val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); - } - *data_col_ptr = val; - data_col_ptr += batch_size * height_col * width_col; - } - } - } -} - -void deformable_im2col( - const at::Tensor data_im, const at::Tensor data_offset, const int channels, - const int height, const int width, const int ksize_h, const int ksize_w, 
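`deformable_im2col_gpu_kernel` above assigns one thread per (input channel, batch element, output location) tuple and recovers those indices from a single flat loop index; the sampled value for kernel tap (i, j) then lands in column row `c_im * kH * kW + i * kW + j`. A small sketch of that bookkeeping (pure Python, illustrative names):

```python
def decode_thread_index(index, width_col, height_col, batch_size):
    """Invert the flat index layout used by deformable_im2col_gpu_kernel."""
    w_col = index % width_col
    h_col = (index // width_col) % height_col
    b_col = (index // (width_col * height_col)) % batch_size
    c_im = index // (width_col * height_col * batch_size)
    return c_im, b_col, h_col, w_col

def column_position(c_im, i, j, b_col, h_col, w_col, k_h, k_w,
                    batch_size, height_col, width_col):
    """Flat offset written for kernel tap (i, j); folds together the kernel's
    initial data_col_ptr and its per-tap stride of batch_size * height_col * width_col."""
    c_col = c_im * k_h * k_w + i * k_w + j
    return ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col
```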
- const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, const int parallel_imgs, - const int deformable_group, at::Tensor data_col) -{ - // num_axes should be smaller than block size - // todo: check parallel_imgs is correctly passed in - int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; - int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; - int num_kernels = channels * height_col * width_col * parallel_imgs; - int channel_per_deformable_group = channels / deformable_group; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_im.scalar_type(), "deformable_im2col_gpu", ([&] { - const scalar_t *data_im_ = data_im.data_ptr(); - const scalar_t *data_offset_ = data_offset.data_ptr(); - scalar_t *data_col_ = data_col.data_ptr(); - - deformable_im2col_gpu_kernel<<>>( - num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, - pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, - channel_per_deformable_group, parallel_imgs, channels, deformable_group, - height_col, width_col, data_col_); - })); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); - } -} - -template -__global__ void deformable_col2im_gpu_kernel( - const int n, const scalar_t *data_col, const scalar_t *data_offset, - const int channels, const int height, const int width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, - const int batch_size, const int deformable_group, - const int height_col, const int width_col, - scalar_t *grad_im) -{ - CUDA_KERNEL_LOOP(index, n) - { - const int j = (index / width_col / height_col / batch_size) % kernel_w; - const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; - const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; - // compute the start and end of the output - - const int deformable_group_index = c / channel_per_deformable_group; - - int w_out = index % width_col; - int h_out = (index / width_col) % height_col; - int b = (index / width_col / height_col) % batch_size; - int w_in = w_out * stride_w - pad_w; - int h_in = h_out * stride_h - pad_h; - - const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * - 2 * kernel_h * kernel_w * height_col * width_col; - const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; - const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; - const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; - const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; - const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; - const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; - - const scalar_t cur_top_grad = data_col[index]; - const int cur_h = (int)cur_inv_h_data; - const int cur_w = (int)cur_inv_w_data; - for (int dy = -2; dy <= 2; dy++) - { - for (int dx = -2; dx <= 2; dx++) - { - if (cur_h + dy >= 0 && cur_h + dy < height && - cur_w + dx >= 0 && cur_w + dx < width && - abs(cur_inv_h_data - (cur_h + dy)) < 1 && - abs(cur_inv_w_data - (cur_w + dx)) < 1) - { - int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - scalar_t 
weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); - atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); - } - } - } - } -} - -void deformable_col2im( - const at::Tensor data_col, const at::Tensor data_offset, const int channels, - const int height, const int width, const int ksize_h, - const int ksize_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int parallel_imgs, const int deformable_group, - at::Tensor grad_im) -{ - - // todo: make sure parallel_imgs is passed in correctly - int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; - int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; - int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; - int channel_per_deformable_group = channels / deformable_group; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_col.scalar_type(), "deformable_col2im_gpu", ([&] { - const scalar_t *data_col_ = data_col.data_ptr(); - const scalar_t *data_offset_ = data_offset.data_ptr(); - scalar_t *grad_im_ = grad_im.data_ptr(); - - deformable_col2im_gpu_kernel<<>>( - num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, - ksize_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, channel_per_deformable_group, - parallel_imgs, deformable_group, height_col, width_col, grad_im_); - })); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); - } -} - -template -__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, - const scalar_t *data_im, const scalar_t *data_offset, - const int channels, const int height, const int width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, - const int batch_size, const int offset_channels, const int deformable_group, - const int height_col, const int width_col, scalar_t *grad_offset) -{ - CUDA_KERNEL_LOOP(index, n) - { - scalar_t val = 0; - int w = index % width_col; - int h = (index / width_col) % height_col; - int c = (index / width_col / height_col) % offset_channels; - int b = (index / width_col / height_col) / offset_channels; - // compute the start and end of the output - - const int deformable_group_index = c / (2 * kernel_h * kernel_w); - const int col_step = kernel_h * kernel_w; - int cnt = 0; - const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * - batch_size * width_col * height_col; - const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * - channel_per_deformable_group / kernel_h / kernel_w * height * width; - const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * - kernel_h * kernel_w * height_col * width_col; - - const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; - - for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) - { - const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; - const int bp_dir = offset_c % 2; - - int j = (col_pos / width_col / height_col / batch_size) % kernel_w; - int i = (col_pos / width_col / height_col / batch_size / kernel_w) % 
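`deformable_col2im_gpu_kernel` above routes each column gradient back to the input plane: the fractional sampling location is recomputed from the stored offset, and the gradient is split over the integer neighbours with the bilinear weights, accumulated via `atomicAdd`. A CPU analogue of that scatter for a single channel (illustrative names):

```python
def scatter_bilinear_grad(grad_im, h, w, g):
    """grad_im: (H, W) tensor; (h, w): fractional sampling location; g: incoming gradient."""
    H, W = grad_im.shape
    ch, cw = int(h), int(w)                      # truncation, as in the kernel
    for dy in range(-2, 3):
        for dx in range(-2, 3):
            y, x = ch + dy, cw + dx
            if 0 <= y < H and 0 <= x < W and abs(h - y) < 1 and abs(w - x) < 1:
                grad_im[y, x] += (1 - abs(h - y)) * (1 - abs(w - x)) * g
```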
kernel_h; - int w_out = col_pos % width_col; - int h_out = (col_pos / width_col) % height_col; - int w_in = w_out * stride_w - pad_w; - int h_in = h_out * stride_h - pad_h; - const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); - const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; - const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; - scalar_t inv_h = h_in + i * dilation_h + offset_h; - scalar_t inv_w = w_in + j * dilation_w + offset_w; - if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) - { - inv_h = inv_w = -2; - } - const scalar_t weight = get_coordinate_weight( - inv_h, inv_w, - height, width, data_im_ptr + cnt * height * width, width, bp_dir); - val += weight * data_col_ptr[col_pos]; - cnt += 1; - } - - grad_offset[index] = val; - } -} - -void deformable_col2im_coord( - const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, - const int channels, const int height, const int width, const int ksize_h, - const int ksize_w, const int pad_h, const int pad_w, const int stride_h, - const int stride_w, const int dilation_h, const int dilation_w, - const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) -{ - - int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; - int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; - int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; - int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { - const scalar_t *data_col_ = data_col.data_ptr(); - const scalar_t *data_im_ = data_im.data_ptr(); - const scalar_t *data_offset_ = data_offset.data_ptr(); - scalar_t *grad_offset_ = grad_offset.data_ptr(); - - deformable_col2im_coord_gpu_kernel<<>>( - num_kernels, data_col_, data_im_, data_offset_, channels, height, width, - ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, channel_per_deformable_group, - parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, - height_col, width_col, grad_offset_); - })); -} - -template -__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, - const int height, const int width, scalar_t h, scalar_t w) -{ - int h_low = floor(h); - int w_low = floor(w); - int h_high = h_low + 1; - int w_high = w_low + 1; - - scalar_t lh = h - h_low; - scalar_t lw = w - w_low; - scalar_t hh = 1 - lh, hw = 1 - lw; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - v1 = bottom_data[h_low * data_width + w_low]; - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - v2 = bottom_data[h_low * data_width + w_high]; - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - v3 = bottom_data[h_high * data_width + w_low]; - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - v4 = bottom_data[h_high * data_width + w_high]; - - scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - -template -__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, - const int h, const int w, const int height, const int width) -{ - if (argmax_h <= -1 || argmax_h >= 
height || argmax_w <= -1 || argmax_w >= width) - { - //empty - return 0; - } - - int argmax_h_low = floor(argmax_h); - int argmax_w_low = floor(argmax_w); - int argmax_h_high = argmax_h_low + 1; - int argmax_w_high = argmax_w_low + 1; - - scalar_t weight = 0; - if (h == argmax_h_low && w == argmax_w_low) - weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); - if (h == argmax_h_low && w == argmax_w_high) - weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); - if (h == argmax_h_high && w == argmax_w_low) - weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); - if (h == argmax_h_high && w == argmax_w_high) - weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); - return weight; -} - -template -__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, - const int height, const int width, const scalar_t *im_data, - const int data_width, const int bp_dir) -{ - if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) - { - //empty - return 0; - } - - int argmax_h_low = floor(argmax_h); - int argmax_w_low = floor(argmax_w); - int argmax_h_high = argmax_h_low + 1; - int argmax_w_high = argmax_w_low + 1; - - scalar_t weight = 0; - - if (bp_dir == 0) - { - if (argmax_h_low >= 0 && argmax_w_low >= 0) - weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; - if (argmax_h_low >= 0 && argmax_w_high <= width - 1) - weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; - if (argmax_h_high <= height - 1 && argmax_w_low >= 0) - weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; - if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) - weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; - } - else if (bp_dir == 1) - { - if (argmax_h_low >= 0 && argmax_w_low >= 0) - weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; - if (argmax_h_low >= 0 && argmax_w_high <= width - 1) - weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; - if (argmax_h_high <= height - 1 && argmax_w_low >= 0) - weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; - if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) - weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; - } - - return weight; -} - -template -__global__ void modulated_deformable_im2col_gpu_kernel(const int n, - const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, - const int height, const int width, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, - const int batch_size, const int num_channels, const int deformable_group, - const int height_col, const int width_col, - scalar_t *data_col) -{ - CUDA_KERNEL_LOOP(index, n) - { - // index index of output matrix - const int w_col = index % width_col; - const int h_col = (index / width_col) % height_col; - const int b_col = (index / width_col / height_col) % batch_size; - const int c_im = (index / width_col / height_col) / batch_size; - const int c_col = c_im * kernel_h * kernel_w; - - // compute deformable group index - const int deformable_group_index = c_im / channel_per_deformable_group; - - const int h_in = h_col * stride_h - pad_h; - const int w_in = w_col 
* stride_w - pad_w; - - scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; - //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; - const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; - const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; - - const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - - for (int i = 0; i < kernel_h; ++i) - { - for (int j = 0; j < kernel_w; ++j) - { - const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; - const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; - const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; - const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; - scalar_t val = static_cast(0); - const scalar_t h_im = h_in + i * dilation_h + offset_h; - const scalar_t w_im = w_in + j * dilation_w + offset_w; - //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { - if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) - { - //const float map_h = i * dilation_h + offset_h; - //const float map_w = j * dilation_w + offset_w; - //const int cur_height = height - h_in; - //const int cur_width = width - w_in; - //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); - val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); - } - *data_col_ptr = val * mask; - data_col_ptr += batch_size * height_col * width_col; - //data_col_ptr += height_col * width_col; - } - } - } -} - -template -__global__ void modulated_deformable_col2im_gpu_kernel(const int n, - const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, - const int channels, const int height, const int width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, - const int batch_size, const int deformable_group, - const int height_col, const int width_col, - scalar_t *grad_im) -{ - CUDA_KERNEL_LOOP(index, n) - { - const int j = (index / width_col / height_col / batch_size) % kernel_w; - const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; - const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; - // compute the start and end of the output - - const int deformable_group_index = c / channel_per_deformable_group; - - int w_out = index % width_col; - int h_out = (index / width_col) % height_col; - int b = (index / width_col / height_col) % batch_size; - int w_in = w_out * stride_w - pad_w; - int h_in = h_out * stride_h - pad_h; - - const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; - const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; - const int 
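The modulated (DCNv2) im2col kernel above differs from the plain deformable one in a single place: each bilinearly sampled value is scaled by the learned, per-tap modulation scalar before it is written to the column buffer. Reduced to one line (illustrative names):

```python
def modulated_column_entry(bilinear_sample, mask_value):
    # modulated_deformable_im2col_gpu_kernel stores val * mask where the
    # unmodulated kernel stores val; everything else is identical.
    return bilinear_sample * mask_value
```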
data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; - const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; - const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; - const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; - const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; - const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; - const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; - - const scalar_t cur_top_grad = data_col[index] * mask; - const int cur_h = (int)cur_inv_h_data; - const int cur_w = (int)cur_inv_w_data; - for (int dy = -2; dy <= 2; dy++) - { - for (int dx = -2; dx <= 2; dx++) - { - if (cur_h + dy >= 0 && cur_h + dy < height && - cur_w + dx >= 0 && cur_w + dx < width && - abs(cur_inv_h_data - (cur_h + dy)) < 1 && - abs(cur_inv_w_data - (cur_w + dx)) < 1) - { - int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); - atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); - } - } - } - } -} - -template -__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, - const scalar_t *data_col, const scalar_t *data_im, - const scalar_t *data_offset, const scalar_t *data_mask, - const int channels, const int height, const int width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, - const int batch_size, const int offset_channels, const int deformable_group, - const int height_col, const int width_col, - scalar_t *grad_offset, scalar_t *grad_mask) -{ - CUDA_KERNEL_LOOP(index, n) - { - scalar_t val = 0, mval = 0; - int w = index % width_col; - int h = (index / width_col) % height_col; - int c = (index / width_col / height_col) % offset_channels; - int b = (index / width_col / height_col) / offset_channels; - // compute the start and end of the output - - const int deformable_group_index = c / (2 * kernel_h * kernel_w); - const int col_step = kernel_h * kernel_w; - int cnt = 0; - const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; - const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; - const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; - const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - - const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; - - for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) - { - const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; - const int bp_dir = offset_c % 2; - - int j = (col_pos / width_col / height_col / batch_size) % kernel_w; - int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; - int w_out = col_pos % width_col; - int h_out = (col_pos / width_col) % height_col; - int w_in = w_out * stride_w - pad_w; - int h_in = h_out * stride_h - pad_h; - const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + 
h_out) * width_col + w_out); - const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); - const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; - const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; - const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; - scalar_t inv_h = h_in + i * dilation_h + offset_h; - scalar_t inv_w = w_in + j * dilation_w + offset_w; - if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) - { - inv_h = inv_w = -2; - } - else - { - mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); - } - const scalar_t weight = dmcn_get_coordinate_weight( - inv_h, inv_w, - height, width, data_im_ptr + cnt * height * width, width, bp_dir); - val += weight * data_col_ptr[col_pos] * mask; - cnt += 1; - } - // KERNEL_ASSIGN(grad_offset[index], offset_req, val); - grad_offset[index] = val; - if (offset_c % 2 == 0) - // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); - grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; - } -} - -void modulated_deformable_im2col_cuda( - const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, - const int batch_size, const int channels, const int height_im, const int width_im, - const int height_col, const int width_col, const int kernel_h, const int kenerl_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int deformable_group, at::Tensor data_col) -{ - // num_axes should be smaller than block size - const int channel_per_deformable_group = channels / deformable_group; - const int num_kernels = channels * batch_size * height_col * width_col; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { - const scalar_t *data_im_ = data_im.data_ptr(); - const scalar_t *data_offset_ = data_offset.data_ptr(); - const scalar_t *data_mask_ = data_mask.data_ptr(); - scalar_t *data_col_ = data_col.data_ptr(); - - modulated_deformable_im2col_gpu_kernel<<>>( - num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, - pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, - batch_size, channels, deformable_group, height_col, width_col, data_col_); - })); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); - } -} - -void modulated_deformable_col2im_cuda( - const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, - const int batch_size, const int channels, const int height_im, const int width_im, - const int height_col, const int width_col, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int deformable_group, at::Tensor grad_im) -{ - - const int channel_per_deformable_group = channels / deformable_group; - const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - 
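The coordinate kernel above also produces the mask gradient: for every tap, `grad_mask` collects the unmodulated bilinear sample times the incoming column gradient, while `grad_offset` additionally carries the mask and the coordinate derivative of the bilinear sample. A per-tap sketch (names chosen here):

```python
def dcnv2_tap_grads(col_grad, sampled_value, d_dh, d_dw, mask):
    """col_grad: gradient flowing into this column entry; sampled_value: the
    bilinear sample at the offset location; d_dh, d_dw: its coordinate
    derivatives; mask: the modulation scalar for this tap."""
    grad_offset_h = col_grad * mask * d_dh
    grad_offset_w = col_grad * mask * d_dw
    grad_mask = col_grad * sampled_value
    return grad_offset_h, grad_offset_w, grad_mask
```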
data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] { - const scalar_t *data_col_ = data_col.data_ptr(); - const scalar_t *data_offset_ = data_offset.data_ptr(); - const scalar_t *data_mask_ = data_mask.data_ptr(); - scalar_t *grad_im_ = grad_im.data_ptr(); - - modulated_deformable_col2im_gpu_kernel<<>>( - num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, - kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, channel_per_deformable_group, - batch_size, deformable_group, height_col, width_col, grad_im_); - })); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); - } -} - -void modulated_deformable_col2im_coord_cuda( - const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, - const int batch_size, const int channels, const int height_im, const int width_im, - const int height_col, const int width_col, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int deformable_group, - at::Tensor grad_offset, at::Tensor grad_mask) -{ - const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; - const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { - const scalar_t *data_col_ = data_col.data_ptr(); - const scalar_t *data_im_ = data_im.data_ptr(); - const scalar_t *data_offset_ = data_offset.data_ptr(); - const scalar_t *data_mask_ = data_mask.data_ptr(); - scalar_t *grad_offset_ = grad_offset.data_ptr(); - scalar_t *grad_mask_ = grad_mask.data_ptr(); - - modulated_deformable_col2im_coord_gpu_kernel<<>>( - num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, - kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, channel_per_deformable_group, - batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, - grad_offset_, grad_mask_); - })); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); - } -} diff --git a/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp b/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp deleted file mode 100644 index 3c09f998029..00000000000 --- a/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// modify from -// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c - -// based on -// author: Charles Shang -// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu - -#include -#include - -#include -#include - -void DeformablePSROIPoolForward( - const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, - at::Tensor out, at::Tensor top_count, const int batch, const int channels, - const int height, const int width, const int num_bbox, - const int channels_trans, const int no_trans, const float spatial_scale, - const int output_dim, const int group_size, const int pooled_size, - const int part_size, const int sample_per_part, const float trans_std); - -void DeformablePSROIPoolBackwardAcc( - const at::Tensor 
out_grad, const at::Tensor data, const at::Tensor bbox, - const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, - at::Tensor trans_grad, const int batch, const int channels, - const int height, const int width, const int num_bbox, - const int channels_trans, const int no_trans, const float spatial_scale, - const int output_dim, const int group_size, const int pooled_size, - const int part_size, const int sample_per_part, const float trans_std); - -void deform_psroi_pooling_cuda_forward( - at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, - at::Tensor top_count, const int no_trans, const float spatial_scale, - const int output_dim, const int group_size, const int pooled_size, - const int part_size, const int sample_per_part, const float trans_std) { - TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - at::DeviceGuard guard(input.device()); - - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); - const int channels_trans = no_trans ? 2 : trans.size(1); - - const int num_bbox = bbox.size(0); - if (num_bbox != out.size(0)) - AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", - out.size(0), num_bbox); - - DeformablePSROIPoolForward( - input, bbox, trans, out, top_count, batch, channels, height, width, - num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, - pooled_size, part_size, sample_per_part, trans_std); -} - -void deform_psroi_pooling_cuda_backward( - at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, - at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, - const int no_trans, const float spatial_scale, const int output_dim, - const int group_size, const int pooled_size, const int part_size, - const int sample_per_part, const float trans_std) { - TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); - TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - at::DeviceGuard guard(input.device()); - - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); - const int channels_trans = no_trans ? 2 : trans.size(1); - - const int num_bbox = bbox.size(0); - if (num_bbox != out_grad.size(0)) - AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", - out_grad.size(0), num_bbox); - - DeformablePSROIPoolBackwardAcc( - out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, - channels, height, width, num_bbox, channels_trans, no_trans, - spatial_scale, output_dim, group_size, pooled_size, part_size, - sample_per_part, trans_std); -} diff --git a/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu b/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu deleted file mode 100644 index 18e3a048d3f..00000000000 --- a/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu +++ /dev/null @@ -1,364 +0,0 @@ -/*! 
- * Copyright (c) 2017 Microsoft - * Licensed under The MIT License [see LICENSE for details] - * \file deformable_psroi_pooling.cu - * \brief - * \author Yi Li, Guodong Zhang, Jifeng Dai -*/ -/***************** Adapted by Charles Shang *********************/ -// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu - -#include -#include -#include -#include -#include - -using namespace at; - -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -const int CUDA_NUM_THREADS = 1024; -inline int GET_BLOCKS(const int N) -{ - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -__device__ scalar_t bilinear_interp( - const scalar_t *data, - const scalar_t x, - const scalar_t y, - const int width, - const int height) -{ - int x1 = floor(x); - int x2 = ceil(x); - int y1 = floor(y); - int y2 = ceil(y); - scalar_t dist_x = (scalar_t)(x - x1); - scalar_t dist_y = (scalar_t)(y - y1); - scalar_t value11 = data[y1 * width + x1]; - scalar_t value12 = data[y2 * width + x1]; - scalar_t value21 = data[y1 * width + x2]; - scalar_t value22 = data[y2 * width + x2]; - scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; - return value; -} - -template -__global__ void DeformablePSROIPoolForwardKernel( - const int count, - const scalar_t *bottom_data, - const scalar_t spatial_scale, - const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const scalar_t *bottom_rois, const scalar_t *bottom_trans, - const int no_trans, - const scalar_t trans_std, - const int sample_per_part, - const int output_dim, - const int group_size, - const int part_size, - const int num_classes, - const int channels_each_class, - scalar_t *top_data, - scalar_t *top_count) -{ - CUDA_KERNEL_LOOP(index, count) - { - // The output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int ctop = (index / pooled_width / pooled_height) % output_dim; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const scalar_t *offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; - scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; - scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; - scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; - - // Force too small ROIs to be 1x1 - scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 - scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); - - // Compute w and h at bottom - scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); - scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); - - scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); - scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); - - int part_h = floor((scalar_t)(ph) / pooled_height * part_size); - int part_w = floor((scalar_t)(pw) / pooled_width * part_size); - int class_id = ctop / channels_each_class; - scalar_t trans_x = no_trans ? 
(scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; - scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; - - scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; - wstart += trans_x * roi_width; - scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; - hstart += trans_y * roi_height; - - scalar_t sum = 0; - int count = 0; - int gw = floor((scalar_t)(pw)*group_size / pooled_width); - int gh = floor((scalar_t)(ph)*group_size / pooled_height); - gw = min(max(gw, 0), group_size - 1); - gh = min(max(gh, 0), group_size - 1); - - const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; - for (int ih = 0; ih < sample_per_part; ih++) - { - for (int iw = 0; iw < sample_per_part; iw++) - { - scalar_t w = wstart + iw * sub_bin_size_w; - scalar_t h = hstart + ih * sub_bin_size_h; - // bilinear interpolation - if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) - { - continue; - } - w = min(max(w, 0.), width - 1.); - h = min(max(h, 0.), height - 1.); - int c = (ctop * group_size + gh) * group_size + gw; - scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); - sum += val; - count++; - } - } - top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; - top_count[index] = count; - } -} - -template -__global__ void DeformablePSROIPoolBackwardAccKernel( - const int count, - const scalar_t *top_diff, - const scalar_t *top_count, - const int num_rois, - const scalar_t spatial_scale, - const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const int output_dim, - scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, - const scalar_t *bottom_data, - const scalar_t *bottom_rois, - const scalar_t *bottom_trans, - const int no_trans, - const scalar_t trans_std, - const int sample_per_part, - const int group_size, - const int part_size, - const int num_classes, - const int channels_each_class) -{ - CUDA_KERNEL_LOOP(index, count) - { - // The output is in order (n, ctop, ph, pw) - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int ctop = (index / pooled_width / pooled_height) % output_dim; - int n = index / pooled_width / pooled_height / output_dim; - - // [start, end) interval for spatial sampling - const scalar_t *offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; - scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; - scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; - scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) 
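`DeformablePSROIPoolForwardKernel` above scales the RoI into feature-map coordinates, shifts each pooling bin by a learned normalized offset (`trans`, scaled by `trans_std`), and averages a `sample_per_part x sample_per_part` grid of bilinear samples inside the shifted bin. A pure-Python sketch of one output bin, simplified to a single pre-selected channel (the real kernel also picks the position-sensitive channel via `group_size` and a per-class `trans` slice); all names here are illustrative:

```python
import math

def bilin(feat, y, x):
    # feat: (H, W) torch.Tensor; (y, x) already clamped inside the map.
    H, W = feat.shape
    y0, x0 = math.floor(y), math.floor(x)
    y1, x1 = min(y0 + 1, H - 1), min(x0 + 1, W - 1)
    dy, dx = y - y0, x - x0
    return ((1 - dy) * (1 - dx) * feat[y0, x0] + (1 - dy) * dx * feat[y0, x1]
            + dy * (1 - dx) * feat[y1, x0] + dy * dx * feat[y1, x1]).item()

def deform_psroi_bin(feat, roi, ph, pw, pooled_size, part_size,
                     sample_per_part, trans, trans_std, spatial_scale):
    """feat: (H, W) map for this bin's channel; roi: (x1, y1, x2, y2) in image
    coordinates; trans: (2, part_size, part_size) learned normalized offsets."""
    H, W = feat.shape
    x1, y1, x2, y2 = [round(float(v)) for v in roi]
    roi_start_w = x1 * spatial_scale - 0.5
    roi_start_h = y1 * spatial_scale - 0.5
    roi_w = max((x2 + 1.0) * spatial_scale - 0.5 - roi_start_w, 0.1)   # avoid 0-sized RoIs
    roi_h = max((y2 + 1.0) * spatial_scale - 0.5 - roi_start_h, 0.1)
    bin_w, bin_h = roi_w / pooled_size, roi_h / pooled_size
    part_h = math.floor(ph / pooled_size * part_size)
    part_w = math.floor(pw / pooled_size * part_size)
    dx = float(trans[0, part_h, part_w]) * trans_std
    dy = float(trans[1, part_h, part_w]) * trans_std
    wstart = pw * bin_w + roi_start_w + dx * roi_w
    hstart = ph * bin_h + roi_start_h + dy * roi_h
    total, count = 0.0, 0
    for ih in range(sample_per_part):
        for iw in range(sample_per_part):
            w = wstart + iw * bin_w / sample_per_part
            h = hstart + ih * bin_h / sample_per_part
            if -0.5 <= w <= W - 0.5 and -0.5 <= h <= H - 0.5:
                total += bilin(feat, min(max(h, 0.0), H - 1.0), min(max(w, 0.0), W - 1.0))
                count += 1
    return total / count if count else 0.0
```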
* spatial_scale - 0.5; - - // Force too small ROIs to be 1x1 - scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 - scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); - - // Compute w and h at bottom - scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); - scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); - - scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); - scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); - - int part_h = floor((scalar_t)(ph) / pooled_height * part_size); - int part_w = floor((scalar_t)(pw) / pooled_width * part_size); - int class_id = ctop / channels_each_class; - scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; - scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; - - scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; - wstart += trans_x * roi_width; - scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; - hstart += trans_y * roi_height; - - if (top_count[index] <= 0) - { - continue; - } - scalar_t diff_val = top_diff[index] / top_count[index]; - const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; - scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; - int gw = floor((scalar_t)(pw)*group_size / pooled_width); - int gh = floor((scalar_t)(ph)*group_size / pooled_height); - gw = min(max(gw, 0), group_size - 1); - gh = min(max(gh, 0), group_size - 1); - - for (int ih = 0; ih < sample_per_part; ih++) - { - for (int iw = 0; iw < sample_per_part; iw++) - { - scalar_t w = wstart + iw * sub_bin_size_w; - scalar_t h = hstart + ih * sub_bin_size_h; - // bilinear interpolation - if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) - { - continue; - } - w = min(max(w, 0.), width - 1.); - h = min(max(h, 0.), height - 1.); - int c = (ctop * group_size + gh) * group_size + gw; - // backward on feature - int x0 = floor(w); - int x1 = ceil(w); - int y0 = floor(h); - int y1 = ceil(h); - scalar_t dist_x = w - x0, dist_y = h - y0; - scalar_t q00 = (1 - dist_x) * (1 - dist_y); - scalar_t q01 = (1 - dist_x) * dist_y; - scalar_t q10 = dist_x * (1 - dist_y); - scalar_t q11 = dist_x * dist_y; - int bottom_index_base = c * height * width; - atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); - atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); - atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); - atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); - - if (no_trans) - { - continue; - } - scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; - scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; - scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; - scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; - scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; - diff_x *= roi_width; - scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; - diff_y *= roi_height; - - atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + 
part_h) * part_size + part_w, diff_x); - atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); - } - } - } -} - -void DeformablePSROIPoolForward(const at::Tensor data, - const at::Tensor bbox, - const at::Tensor trans, - at::Tensor out, - at::Tensor top_count, - const int batch, - const int channels, - const int height, - const int width, - const int num_bbox, - const int channels_trans, - const int no_trans, - const float spatial_scale, - const int output_dim, - const int group_size, - const int pooled_size, - const int part_size, - const int sample_per_part, - const float trans_std) -{ - const int pooled_height = pooled_size; - const int pooled_width = pooled_size; - const int count = num_bbox * output_dim * pooled_height * pooled_width; - const int num_classes = no_trans ? 1 : channels_trans / 2; - const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data.scalar_type(), "deformable_psroi_pool_forward", ([&] { - const scalar_t *bottom_data = data.data_ptr(); - const scalar_t *bottom_rois = bbox.data_ptr(); - const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr(); - scalar_t *top_data = out.data_ptr(); - scalar_t *top_count_data = top_count.data_ptr(); - - DeformablePSROIPoolForwardKernel<<>>( - count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, - bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim, - group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); - })); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); - } -} - -void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, - const at::Tensor data, - const at::Tensor bbox, - const at::Tensor trans, - const at::Tensor top_count, - at::Tensor in_grad, - at::Tensor trans_grad, - const int batch, - const int channels, - const int height, - const int width, - const int num_bbox, - const int channels_trans, - const int no_trans, - const float spatial_scale, - const int output_dim, - const int group_size, - const int pooled_size, - const int part_size, - const int sample_per_part, - const float trans_std) -{ - // LOG(INFO) << "DeformablePSROIPoolBackward"; - const int num_rois = num_bbox; - const int pooled_height = pooled_size; - const int pooled_width = pooled_size; - const int count = num_bbox * output_dim * pooled_height * pooled_width; - const int num_classes = no_trans ? 1 : channels_trans / 2; - const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] { - const scalar_t *top_diff = out_grad.data_ptr(); - const scalar_t *bottom_data = data.data_ptr(); - const scalar_t *bottom_rois = bbox.data_ptr(); - const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr(); - scalar_t *bottom_data_diff = in_grad.data_ptr(); - scalar_t *bottom_trans_diff = no_trans ? 
NULL : trans_grad.data_ptr(); - const scalar_t *top_count_data = top_count.data_ptr(); - - DeformablePSROIPoolBackwardAccKernel<<>>( - count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, - pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, - bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, - group_size, part_size, num_classes, channels_each_class); - })); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); - } -} diff --git a/mmdet/ops/dcn/src/deform_conv_ext.cpp b/mmdet/ops/dcn/src/deform_conv_ext.cpp deleted file mode 100644 index fac60162b69..00000000000 --- a/mmdet/ops/dcn/src/deform_conv_ext.cpp +++ /dev/null @@ -1,163 +0,0 @@ -// modify from -// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c - -#include -#include - -#include -#include - -#ifdef WITH_CUDA -int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, - at::Tensor offset, at::Tensor output, - at::Tensor columns, at::Tensor ones, int kW, - int kH, int dW, int dH, int padW, int padH, - int dilationW, int dilationH, int group, - int deformable_group, int im2col_step); - -int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, - at::Tensor gradOutput, at::Tensor gradInput, - at::Tensor gradOffset, at::Tensor weight, - at::Tensor columns, int kW, int kH, int dW, - int dH, int padW, int padH, int dilationW, - int dilationH, int group, - int deformable_group, int im2col_step); - -int deform_conv_backward_parameters_cuda( - at::Tensor input, at::Tensor offset, at::Tensor gradOutput, - at::Tensor gradWeight, // at::Tensor gradBias, - at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, - int padW, int padH, int dilationW, int dilationH, int group, - int deformable_group, float scale, int im2col_step); - -void modulated_deform_conv_cuda_forward( - at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, - at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, - int kernel_h, int kernel_w, const int stride_h, const int stride_w, - const int pad_h, const int pad_w, const int dilation_h, - const int dilation_w, const int group, const int deformable_group, - const bool with_bias); - -void modulated_deform_conv_cuda_backward( - at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, - at::Tensor offset, at::Tensor mask, at::Tensor columns, - at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, - at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, - int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, - int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, - const bool with_bias); -#endif - -int deform_conv_forward(at::Tensor input, at::Tensor weight, - at::Tensor offset, at::Tensor output, - at::Tensor columns, at::Tensor ones, int kW, - int kH, int dW, int dH, int padW, int padH, - int dilationW, int dilationH, int group, - int deformable_group, int im2col_step) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return deform_conv_forward_cuda(input, weight, offset, output, columns, - ones, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, - deformable_group, im2col_step); -#else - AT_ERROR("deform conv is not compiled with GPU support"); -#endif - } - AT_ERROR("deform conv is not implemented on 
CPU"); -} - -int deform_conv_backward_input(at::Tensor input, at::Tensor offset, - at::Tensor gradOutput, at::Tensor gradInput, - at::Tensor gradOffset, at::Tensor weight, - at::Tensor columns, int kW, int kH, int dW, - int dH, int padW, int padH, int dilationW, - int dilationH, int group, - int deformable_group, int im2col_step) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return deform_conv_backward_input_cuda(input, offset, gradOutput, - gradInput, gradOffset, weight, columns, kW, kH, dW, dH, padW, padH, - dilationW, dilationH, group, deformable_group, im2col_step); -#else - AT_ERROR("deform conv is not compiled with GPU support"); -#endif - } - AT_ERROR("deform conv is not implemented on CPU"); -} - -int deform_conv_backward_parameters( - at::Tensor input, at::Tensor offset, at::Tensor gradOutput, - at::Tensor gradWeight, // at::Tensor gradBias, - at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, - int padW, int padH, int dilationW, int dilationH, int group, - int deformable_group, float scale, int im2col_step) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return deform_conv_backward_parameters_cuda(input, offset, gradOutput, - gradWeight, columns, ones, kW, kH, dW, dH, padW, padH, dilationW, - dilationH, group, deformable_group, scale, im2col_step); -#else - AT_ERROR("deform conv is not compiled with GPU support"); -#endif - } - AT_ERROR("deform conv is not implemented on CPU"); -} - -void modulated_deform_conv_forward( - at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, - at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, - int kernel_h, int kernel_w, const int stride_h, const int stride_w, - const int pad_h, const int pad_w, const int dilation_h, - const int dilation_w, const int group, const int deformable_group, - const bool with_bias) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return modulated_deform_conv_cuda_forward(input, weight, bias, ones, - offset, mask, output, columns, kernel_h, kernel_w, stride_h, - stride_w, pad_h, pad_w, dilation_h, dilation_w, group, - deformable_group, with_bias); -#else - AT_ERROR("modulated deform conv is not compiled with GPU support"); -#endif - } - AT_ERROR("modulated deform conv is not implemented on CPU"); -} - -void modulated_deform_conv_backward( - at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, - at::Tensor offset, at::Tensor mask, at::Tensor columns, - at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, - at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, - int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, - int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, - const bool with_bias) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return modulated_deform_conv_cuda_backward(input, weight, bias, ones, - offset, mask, columns, grad_input, grad_weight, grad_bias, grad_offset, - grad_mask, grad_output, kernel_h, kernel_w, stride_h, stride_w, - pad_h, pad_w, dilation_h, dilation_w, group, deformable_group, - with_bias); -#else - AT_ERROR("modulated deform conv is not compiled with GPU support"); -#endif - } - AT_ERROR("modulated deform conv is not implemented on CPU"); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("deform_conv_forward", &deform_conv_forward, - "deform forward"); - m.def("deform_conv_backward_input", &deform_conv_backward_input, - "deform_conv_backward_input"); - m.def("deform_conv_backward_parameters", - 
&deform_conv_backward_parameters, - "deform_conv_backward_parameters"); - m.def("modulated_deform_conv_forward", - &modulated_deform_conv_forward, - "modulated deform conv forward"); - m.def("modulated_deform_conv_backward", - &modulated_deform_conv_backward, - "modulated deform conv backward"); -} diff --git a/mmdet/ops/dcn/src/deform_pool_ext.cpp b/mmdet/ops/dcn/src/deform_pool_ext.cpp deleted file mode 100644 index 877064828d5..00000000000 --- a/mmdet/ops/dcn/src/deform_pool_ext.cpp +++ /dev/null @@ -1,71 +0,0 @@ -// modify from -// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c - -// based on -// author: Charles Shang -// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu - -#include -#include - -#include -#include - -#ifdef WITH_CUDA -void deform_psroi_pooling_cuda_forward( - at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, - at::Tensor top_count, const int no_trans, const float spatial_scale, - const int output_dim, const int group_size, const int pooled_size, - const int part_size, const int sample_per_part, const float trans_std); - -void deform_psroi_pooling_cuda_backward( - at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, - at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, - const int no_trans, const float spatial_scale, const int output_dim, - const int group_size, const int pooled_size, const int part_size, - const int sample_per_part, const float trans_std); -#endif - -void deform_psroi_pooling_forward( - at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, - at::Tensor top_count, const int no_trans, const float spatial_scale, - const int output_dim, const int group_size, const int pooled_size, - const int part_size, const int sample_per_part, const float trans_std) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return deform_psroi_pooling_cuda_forward(input, bbox, trans, out, top_count, - no_trans, spatial_scale, output_dim, group_size, pooled_size, - part_size, sample_per_part, trans_std); -#else - AT_ERROR("deform psroi pooling is not compiled with GPU support"); -#endif - } - AT_ERROR("deform psroi pooling is not implemented on CPU"); -} - -void deform_psroi_pooling_backward( - at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, - at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, - const int no_trans, const float spatial_scale, const int output_dim, - const int group_size, const int pooled_size, const int part_size, - const int sample_per_part, const float trans_std) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return deform_psroi_pooling_cuda_backward(out_grad, input, bbox, trans, - top_count, input_grad, trans_grad, no_trans, spatial_scale, - output_dim, group_size, pooled_size, part_size, sample_per_part, - trans_std); -#else - AT_ERROR("deform psroi pooling is not compiled with GPU support"); -#endif - } - AT_ERROR("deform psroi pooling is not implemented on CPU"); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, - "deform psroi pooling forward"); - m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, - "deform psroi pooling backward"); -} diff --git a/mmdet/ops/generalized_attention.py b/mmdet/ops/generalized_attention.py deleted file mode 100644 index 94a2e370f2e..00000000000 --- a/mmdet/ops/generalized_attention.py +++ /dev/null @@ 
-1,384 +0,0 @@ -import math - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import kaiming_init - - -class GeneralizedAttention(nn.Module): - """GeneralizedAttention module. - - See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' - (https://arxiv.org/abs/1711.07971) for details. - - Args: - in_channels (int): Channels of the input feature map. - spatial_range (int): The spatial range. - -1 indicates no spatial range constraint. - num_heads (int): The head number of empirical_attention module. - position_embedding_dim (int): The position embedding dimension. - position_magnitude (int): A multiplier acting on coord difference. - kv_stride (int): The feature stride acting on key/value feature map. - q_stride (int): The feature stride acting on query feature map. - attention_type (str): A binary indicator string for indicating which - items in generalized empirical_attention module are used. - '1000' indicates 'query and key content' (appr - appr) item, - '0100' indicates 'query content and relative position' - (appr - position) item, - '0010' indicates 'key content only' (bias - appr) item, - '0001' indicates 'relative position only' (bias - position) item. - """ - - def __init__(self, - in_channels, - spatial_range=-1, - num_heads=9, - position_embedding_dim=-1, - position_magnitude=1, - kv_stride=2, - q_stride=1, - attention_type='1111'): - - super(GeneralizedAttention, self).__init__() - - # hard range means local range for non-local operation - self.position_embedding_dim = ( - position_embedding_dim - if position_embedding_dim > 0 else in_channels) - - self.position_magnitude = position_magnitude - self.num_heads = num_heads - self.in_channels = in_channels - self.spatial_range = spatial_range - self.kv_stride = kv_stride - self.q_stride = q_stride - self.attention_type = [bool(int(_)) for _ in attention_type] - self.qk_embed_dim = in_channels // num_heads - out_c = self.qk_embed_dim * num_heads - - if self.attention_type[0] or self.attention_type[1]: - self.query_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_c, - kernel_size=1, - bias=False) - self.query_conv.kaiming_init = True - - if self.attention_type[0] or self.attention_type[2]: - self.key_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=out_c, - kernel_size=1, - bias=False) - self.key_conv.kaiming_init = True - - self.v_dim = in_channels // num_heads - self.value_conv = nn.Conv2d( - in_channels=in_channels, - out_channels=self.v_dim * num_heads, - kernel_size=1, - bias=False) - self.value_conv.kaiming_init = True - - if self.attention_type[1] or self.attention_type[3]: - self.appr_geom_fc_x = nn.Linear( - self.position_embedding_dim // 2, out_c, bias=False) - self.appr_geom_fc_x.kaiming_init = True - - self.appr_geom_fc_y = nn.Linear( - self.position_embedding_dim // 2, out_c, bias=False) - self.appr_geom_fc_y.kaiming_init = True - - if self.attention_type[2]: - stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) - appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv - self.appr_bias = nn.Parameter(appr_bias_value) - - if self.attention_type[3]: - stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) - geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv - self.geom_bias = nn.Parameter(geom_bias_value) - - self.proj_conv = nn.Conv2d( - in_channels=self.v_dim * num_heads, - out_channels=in_channels, - kernel_size=1, - bias=True) - self.proj_conv.kaiming_init = True - self.gamma = nn.Parameter(torch.zeros(1)) - - if 
self.spatial_range >= 0: - # only works when non local is after 3*3 conv - if in_channels == 256: - max_len = 84 - elif in_channels == 512: - max_len = 42 - - max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) - local_constraint_map = np.ones( - (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int) - for iy in range(max_len): - for ix in range(max_len): - local_constraint_map[ - iy, ix, - max((iy - self.spatial_range) // - self.kv_stride, 0):min((iy + self.spatial_range + - 1) // self.kv_stride + - 1, max_len), - max((ix - self.spatial_range) // - self.kv_stride, 0):min((ix + self.spatial_range + - 1) // self.kv_stride + - 1, max_len)] = 0 - - self.local_constraint_map = nn.Parameter( - torch.from_numpy(local_constraint_map).byte(), - requires_grad=False) - - if self.q_stride > 1: - self.q_downsample = nn.AvgPool2d( - kernel_size=1, stride=self.q_stride) - else: - self.q_downsample = None - - if self.kv_stride > 1: - self.kv_downsample = nn.AvgPool2d( - kernel_size=1, stride=self.kv_stride) - else: - self.kv_downsample = None - - self.init_weights() - - def get_position_embedding(self, - h, - w, - h_kv, - w_kv, - q_stride, - kv_stride, - device, - feat_dim, - wave_length=1000): - h_idxs = torch.linspace(0, h - 1, h).cuda(device) - h_idxs = h_idxs.view((h, 1)) * q_stride - - w_idxs = torch.linspace(0, w - 1, w).cuda(device) - w_idxs = w_idxs.view((w, 1)) * q_stride - - h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).cuda(device) - h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride - - w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).cuda(device) - w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride - - # (h, h_kv, 1) - h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0) - h_diff *= self.position_magnitude - - # (w, w_kv, 1) - w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0) - w_diff *= self.position_magnitude - - feat_range = torch.arange(0, feat_dim / 4).cuda(device) - - dim_mat = torch.Tensor([wave_length]).cuda(device) - dim_mat = dim_mat**((4. 
/ feat_dim) * feat_range) - dim_mat = dim_mat.view((1, 1, -1)) - - embedding_x = torch.cat( - ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2) - - embedding_y = torch.cat( - ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2) - - return embedding_x, embedding_y - - def forward(self, x_input): - num_heads = self.num_heads - - # use empirical_attention - if self.q_downsample is not None: - x_q = self.q_downsample(x_input) - else: - x_q = x_input - n, _, h, w = x_q.shape - - if self.kv_downsample is not None: - x_kv = self.kv_downsample(x_input) - else: - x_kv = x_input - _, _, h_kv, w_kv = x_kv.shape - - if self.attention_type[0] or self.attention_type[1]: - proj_query = self.query_conv(x_q).view( - (n, num_heads, self.qk_embed_dim, h * w)) - proj_query = proj_query.permute(0, 1, 3, 2) - - if self.attention_type[0] or self.attention_type[2]: - proj_key = self.key_conv(x_kv).view( - (n, num_heads, self.qk_embed_dim, h_kv * w_kv)) - - if self.attention_type[1] or self.attention_type[3]: - position_embed_x, position_embed_y = self.get_position_embedding( - h, w, h_kv, w_kv, self.q_stride, self.kv_stride, - x_input.device, self.position_embedding_dim) - # (n, num_heads, w, w_kv, dim) - position_feat_x = self.appr_geom_fc_x(position_embed_x).\ - view(1, w, w_kv, num_heads, self.qk_embed_dim).\ - permute(0, 3, 1, 2, 4).\ - repeat(n, 1, 1, 1, 1) - - # (n, num_heads, h, h_kv, dim) - position_feat_y = self.appr_geom_fc_y(position_embed_y).\ - view(1, h, h_kv, num_heads, self.qk_embed_dim).\ - permute(0, 3, 1, 2, 4).\ - repeat(n, 1, 1, 1, 1) - - position_feat_x /= math.sqrt(2) - position_feat_y /= math.sqrt(2) - - # accelerate for saliency only - if (np.sum(self.attention_type) == 1) and self.attention_type[2]: - appr_bias = self.appr_bias.\ - view(1, num_heads, 1, self.qk_embed_dim).\ - repeat(n, 1, 1, 1) - - energy = torch.matmul(appr_bias, proj_key).\ - view(n, num_heads, 1, h_kv * w_kv) - - h = 1 - w = 1 - else: - # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for - if not self.attention_type[0]: - energy = torch.zeros( - n, - num_heads, - h, - w, - h_kv, - w_kv, - dtype=x_input.dtype, - device=x_input.device) - - # attention_type[0]: appr - appr - # attention_type[1]: appr - position - # attention_type[2]: bias - appr - # attention_type[3]: bias - position - if self.attention_type[0] or self.attention_type[2]: - if self.attention_type[0] and self.attention_type[2]: - appr_bias = self.appr_bias.\ - view(1, num_heads, 1, self.qk_embed_dim) - energy = torch.matmul(proj_query + appr_bias, proj_key).\ - view(n, num_heads, h, w, h_kv, w_kv) - - elif self.attention_type[0]: - energy = torch.matmul(proj_query, proj_key).\ - view(n, num_heads, h, w, h_kv, w_kv) - - elif self.attention_type[2]: - appr_bias = self.appr_bias.\ - view(1, num_heads, 1, self.qk_embed_dim).\ - repeat(n, 1, 1, 1) - - energy += torch.matmul(appr_bias, proj_key).\ - view(n, num_heads, 1, 1, h_kv, w_kv) - - if self.attention_type[1] or self.attention_type[3]: - if self.attention_type[1] and self.attention_type[3]: - geom_bias = self.geom_bias.\ - view(1, num_heads, 1, self.qk_embed_dim) - - proj_query_reshape = (proj_query + geom_bias).\ - view(n, num_heads, h, w, self.qk_embed_dim) - - energy_x = torch.matmul( - proj_query_reshape.permute(0, 1, 3, 2, 4), - position_feat_x.permute(0, 1, 2, 4, 3)) - energy_x = energy_x.\ - permute(0, 1, 3, 2, 4).unsqueeze(4) - - energy_y = torch.matmul( - proj_query_reshape, - position_feat_y.permute(0, 1, 2, 4, 3)) - energy_y = energy_y.unsqueeze(5) - - energy += 
energy_x + energy_y - - elif self.attention_type[1]: - proj_query_reshape = proj_query.\ - view(n, num_heads, h, w, self.qk_embed_dim) - proj_query_reshape = proj_query_reshape.\ - permute(0, 1, 3, 2, 4) - position_feat_x_reshape = position_feat_x.\ - permute(0, 1, 2, 4, 3) - position_feat_y_reshape = position_feat_y.\ - permute(0, 1, 2, 4, 3) - - energy_x = torch.matmul(proj_query_reshape, - position_feat_x_reshape) - energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4) - - energy_y = torch.matmul(proj_query_reshape, - position_feat_y_reshape) - energy_y = energy_y.unsqueeze(5) - - energy += energy_x + energy_y - - elif self.attention_type[3]: - geom_bias = self.geom_bias.\ - view(1, num_heads, self.qk_embed_dim, 1).\ - repeat(n, 1, 1, 1) - - position_feat_x_reshape = position_feat_x.\ - view(n, num_heads, w*w_kv, self.qk_embed_dim) - - position_feat_y_reshape = position_feat_y.\ - view(n, num_heads, h * h_kv, self.qk_embed_dim) - - energy_x = torch.matmul(position_feat_x_reshape, geom_bias) - energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv) - - energy_y = torch.matmul(position_feat_y_reshape, geom_bias) - energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1) - - energy += energy_x + energy_y - - energy = energy.view(n, num_heads, h * w, h_kv * w_kv) - - if self.spatial_range >= 0: - cur_local_constraint_map = \ - self.local_constraint_map[:h, :w, :h_kv, :w_kv].\ - contiguous().\ - view(1, 1, h*w, h_kv*w_kv) - - energy = energy.masked_fill_(cur_local_constraint_map, - float('-inf')) - - attention = F.softmax(energy, 3) - - proj_value = self.value_conv(x_kv) - proj_value_reshape = proj_value.\ - view((n, num_heads, self.v_dim, h_kv * w_kv)).\ - permute(0, 1, 3, 2) - - out = torch.matmul(attention, proj_value_reshape).\ - permute(0, 1, 3, 2).\ - contiguous().\ - view(n, self.v_dim * self.num_heads, h, w) - - out = self.proj_conv(out) - out = self.gamma * out + x_input - return out - - def init_weights(self): - for m in self.modules(): - if hasattr(m, 'kaiming_init') and m.kaiming_init: - kaiming_init( - m, - mode='fan_in', - nonlinearity='leaky_relu', - bias=0, - distribution='uniform', - a=1) diff --git a/mmdet/ops/masked_conv/__init__.py b/mmdet/ops/masked_conv/__init__.py deleted file mode 100644 index f537ace080f..00000000000 --- a/mmdet/ops/masked_conv/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .masked_conv import MaskedConv2d, masked_conv2d - -__all__ = ['masked_conv2d', 'MaskedConv2d'] diff --git a/mmdet/ops/masked_conv/masked_conv.py b/mmdet/ops/masked_conv/masked_conv.py deleted file mode 100644 index d29793286c0..00000000000 --- a/mmdet/ops/masked_conv/masked_conv.py +++ /dev/null @@ -1,89 +0,0 @@ -import math - -import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from . 
import masked_conv2d_ext - - -class MaskedConv2dFunction(Function): - - @staticmethod - def forward(ctx, features, mask, weight, bias, padding=0, stride=1): - assert mask.dim() == 3 and mask.size(0) == 1 - assert features.dim() == 4 and features.size(0) == 1 - assert features.size()[2:] == mask.size()[1:] - pad_h, pad_w = _pair(padding) - stride_h, stride_w = _pair(stride) - if stride_h != 1 or stride_w != 1: - raise ValueError( - 'Stride could not only be 1 in masked_conv2d currently.') - if not features.is_cuda: - raise NotImplementedError - - out_channel, in_channel, kernel_h, kernel_w = weight.size() - - batch_size = features.size(0) - out_h = int( - math.floor((features.size(2) + 2 * pad_h - - (kernel_h - 1) - 1) / stride_h + 1)) - out_w = int( - math.floor((features.size(3) + 2 * pad_w - - (kernel_h - 1) - 1) / stride_w + 1)) - mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False) - output = features.new_zeros(batch_size, out_channel, out_h, out_w) - if mask_inds.numel() > 0: - mask_h_idx = mask_inds[:, 0].contiguous() - mask_w_idx = mask_inds[:, 1].contiguous() - data_col = features.new_zeros(in_channel * kernel_h * kernel_w, - mask_inds.size(0)) - masked_conv2d_ext.masked_im2col_forward(features, mask_h_idx, - mask_w_idx, kernel_h, - kernel_w, pad_h, pad_w, - data_col) - - masked_output = torch.addmm(1, bias[:, None], 1, - weight.view(out_channel, -1), data_col) - masked_conv2d_ext.masked_col2im_forward(masked_output, mask_h_idx, - mask_w_idx, out_h, out_w, - out_channel, output) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - return (None, ) * 5 - - -masked_conv2d = MaskedConv2dFunction.apply - - -class MaskedConv2d(nn.Conv2d): - """A MaskedConv2d which inherits the official Conv2d. - - The masked forward doesn't implement the backward function and only - supports the stride parameter to be 1 currently. 
- """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): - super(MaskedConv2d, - self).__init__(in_channels, out_channels, kernel_size, stride, - padding, dilation, groups, bias) - - def forward(self, input, mask=None): - if mask is None: # fallback to the normal Conv2d - return super(MaskedConv2d, self).forward(input) - else: - return masked_conv2d(input, mask, self.weight, self.bias, - self.padding) diff --git a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp deleted file mode 100644 index 84bd7c27913..00000000000 --- a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp +++ /dev/null @@ -1,69 +0,0 @@ -#include - -#include -#include - -int MaskedIm2colForwardLaucher(const at::Tensor im, const int height, - const int width, const int channels, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, const int mask_cnt, - at::Tensor col); - -int MaskedCol2imForwardLaucher(const at::Tensor col, const int height, - const int width, const int channels, - const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, const int mask_cnt, - at::Tensor im); - -#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -int masked_im2col_forward_cuda(const at::Tensor im, const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, at::Tensor col) { - CHECK_INPUT(im); - CHECK_INPUT(mask_h_idx); - CHECK_INPUT(mask_w_idx); - CHECK_INPUT(col); - // im: (n, ic, h, w), kernel size (kh, kw) - // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh) - at::DeviceGuard guard(im.device()); - - int channels = im.size(1); - int height = im.size(2); - int width = im.size(3); - int mask_cnt = mask_h_idx.size(0); - - MaskedIm2colForwardLaucher(im, height, width, channels, kernel_h, kernel_w, - pad_h, pad_w, mask_h_idx, mask_w_idx, mask_cnt, - col); - - return 1; -} - -int masked_col2im_forward_cuda(const at::Tensor col, - const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, int height, - int width, int channels, at::Tensor im) { - CHECK_INPUT(col); - CHECK_INPUT(mask_h_idx); - CHECK_INPUT(mask_w_idx); - CHECK_INPUT(im); - // im: (n, ic, h, w), kernel size (kh, kw) - // kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh) - at::DeviceGuard guard(col.device()); - - int mask_cnt = mask_h_idx.size(0); - - MaskedCol2imForwardLaucher(col, height, width, channels, mask_h_idx, - mask_w_idx, mask_cnt, im); - - return 1; -} diff --git a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu deleted file mode 100644 index b8323592f52..00000000000 --- a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - -#define THREADS_PER_BLOCK 1024 - -inline int GET_BLOCKS(const int N) { - int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - int max_block_num = 65000; - return min(optimal_block_num, max_block_num); -} - -template -__global__ void 
MaskedIm2colForward(const int n, const scalar_t *data_im, - const int height, const int width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int64_t *mask_h_idx, - const int64_t *mask_w_idx, - const int mask_cnt, scalar_t *data_col) { - // mask_cnt * channels - CUDA_1D_KERNEL_LOOP(index, n) { - const int m_index = index % mask_cnt; - const int h_col = mask_h_idx[m_index]; - const int w_col = mask_w_idx[m_index]; - const int c_im = index / mask_cnt; - const int c_col = c_im * kernel_h * kernel_w; - const int h_offset = h_col - pad_h; - const int w_offset = w_col - pad_w; - scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index; - for (int i = 0; i < kernel_h; ++i) { - int h_im = h_offset + i; - for (int j = 0; j < kernel_w; ++j) { - int w_im = w_offset + j; - if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { - *data_col_ptr = - (scalar_t)data_im[(c_im * height + h_im) * width + w_im]; - } else { - *data_col_ptr = 0.0; - } - data_col_ptr += mask_cnt; - } - } - } -} - -int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height, - const int width, const int channels, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, const int mask_cnt, - at::Tensor top_data) { - const int output_size = mask_cnt * channels; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] { - const scalar_t *bottom_data_ = bottom_data.data_ptr(); - const int64_t *mask_h_idx_ = mask_h_idx.data_ptr(); - const int64_t *mask_w_idx_ = mask_w_idx.data_ptr(); - scalar_t *top_data_ = top_data.data_ptr(); - MaskedIm2colForward - <<>>( - output_size, bottom_data_, height, width, kernel_h, kernel_w, - pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_); - })); - THCudaCheck(cudaGetLastError()); - return 1; -} - -template -__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col, - const int height, const int width, - const int channels, - const int64_t *mask_h_idx, - const int64_t *mask_w_idx, - const int mask_cnt, scalar_t *data_im) { - CUDA_1D_KERNEL_LOOP(index, n) { - const int m_index = index % mask_cnt; - const int h_im = mask_h_idx[m_index]; - const int w_im = mask_w_idx[m_index]; - const int c_im = index / mask_cnt; - // compute the start and end of the output - data_im[(c_im * height + h_im) * width + w_im] = data_col[index]; - } -} - -int MaskedCol2imForwardLaucher(const at::Tensor bottom_data, const int height, - const int width, const int channels, - const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, const int mask_cnt, - at::Tensor top_data) { - const int output_size = mask_cnt * channels; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] { - const scalar_t *bottom_data_ = bottom_data.data_ptr(); - const int64_t *mask_h_idx_ = mask_h_idx.data_ptr(); - const int64_t *mask_w_idx_ = mask_w_idx.data_ptr(); - scalar_t *top_data_ = top_data.data_ptr(); - - MaskedCol2imForward - <<>>( - output_size, bottom_data_, height, width, channels, mask_h_idx_, - mask_w_idx_, mask_cnt, top_data_); - })); - THCudaCheck(cudaGetLastError()); - return 1; -} diff --git a/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp b/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp deleted file mode 100644 index 39058ad7755..00000000000 --- a/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include - -#include -#include - 
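The masked_im2col/masked_col2im kernels above only evaluate the convolution at spatial positions where the mask is non-zero; every other output position stays zero. For stride 1 (the only stride the removed MaskedConv2d supports), the result at masked positions matches an ordinary convolution, so a dense PyTorch expression can serve as a reference. A minimal sketch, not the optimized path, with made-up tensor shapes:

import torch
import torch.nn.functional as F

# Dense reference for the removed MaskedConv2d (stride 1, zero padding):
# run a full convolution, then keep outputs only where the mask is set.
# The CUDA path instead gathers the masked positions with masked_im2col,
# multiplies by the flattened weight, and scatters back with masked_col2im.
x = torch.randn(1, 16, 32, 32)        # (N, C_in, H, W); shapes are illustrative
mask = torch.rand(1, 32, 32) > 0.5    # (1, H, W) boolean mask
weight = torch.randn(8, 16, 3, 3)     # (C_out, C_in, kH, kW)
bias = torch.randn(8)

dense = F.conv2d(x, weight, bias, stride=1, padding=1)
masked_out = dense * mask[:, None].to(dense.dtype)   # zeros where mask == 0
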
-#ifdef WITH_CUDA -int masked_im2col_forward_cuda(const at::Tensor im, const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, at::Tensor col); - -int masked_col2im_forward_cuda(const at::Tensor col, - const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, int height, - int width, int channels, at::Tensor im); -#endif - -int masked_im2col_forward(const at::Tensor im, const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, at::Tensor col) { - if (im.device().is_cuda()) { -#ifdef WITH_CUDA - return masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, kernel_h, - kernel_w, pad_h, pad_w, col); -#else - AT_ERROR("masked_im2col is not compiled with GPU support"); -#endif - } - AT_ERROR("masked_im2col is not implemented on CPU"); -} - -int masked_col2im_forward(const at::Tensor col, - const at::Tensor mask_h_idx, - const at::Tensor mask_w_idx, int height, - int width, int channels, at::Tensor im) { - if (col.device().is_cuda()) { -#ifdef WITH_CUDA - return masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, height, - width, channels, im); -#else - AT_ERROR("masked_col2im is not compiled with GPU support"); -#endif - } - AT_ERROR("masked_col2im is not implemented on CPU"); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("masked_im2col_forward", &masked_im2col_forward, - "masked_im2col forward"); - m.def("masked_col2im_forward", &masked_col2im_forward, - "masked_col2im forward"); -} diff --git a/mmdet/ops/merge_cells.py b/mmdet/ops/merge_cells.py deleted file mode 100644 index e1d588404b2..00000000000 --- a/mmdet/ops/merge_cells.py +++ /dev/null @@ -1,147 +0,0 @@ -from abc import abstractmethod - -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import ConvModule - - -class BaseMergeCell(nn.Module): - """The basic class for cells used in NAS-FPN and NAS-FCOS. - - BaseMergeCell takes 2 inputs. After applying concolution - on them, they are resized to the target size. Then, - they go through binary_op, which depends on the type of cell. - If with_out_conv is True, the result of output will go through - another convolution layer. - - Args: - in_channels (int): number of input channels in out_conv layer. - out_channels (int): number of output channels in out_conv layer. - with_out_conv (bool): Whether to use out_conv layer - out_conv_cfg (dict): Config dict for convolution layer, which should - contain "groups", "kernel_size", "padding", "bias" to build - out_conv layer. - out_norm_cfg (dict): Config dict for normalization layer in out_conv. - out_conv_order (tuple): The order of conv/norm/activation layers in - out_conv. - with_input1_conv (bool): Whether to use convolution on input1. - with_input2_conv (bool): Whether to use convolution on input2. - input_conv_cfg (dict): Config dict for building input1_conv layer and - input2_conv layer, which is expected to contain the type of - convolution. - Default: None, which means using conv2d. - input_norm_cfg (dict): Config dict for normalization layer in - input1_conv and input2_conv layer. Default: None. - upsample_mode (str): Interpolation method used to resize the output - of input1_conv and input2_conv to target size. Currently, we - support ['nearest', 'bilinear']. Default: 'nearest'. 
- """ - - def __init__(self, - fused_channels=256, - out_channels=256, - with_out_conv=True, - out_conv_cfg=dict( - groups=1, kernel_size=3, padding=1, bias=True), - out_norm_cfg=None, - out_conv_order=('act', 'conv', 'norm'), - with_input1_conv=False, - with_input2_conv=False, - input_conv_cfg=None, - input_norm_cfg=None, - upsample_mode='nearest'): - super(BaseMergeCell, self).__init__() - assert upsample_mode in ['nearest', 'bilinear'] - self.with_out_conv = with_out_conv - self.with_input1_conv = with_input1_conv - self.with_input2_conv = with_input2_conv - self.upsample_mode = upsample_mode - - if self.with_out_conv: - self.out_conv = ConvModule( - fused_channels, - out_channels, - **out_conv_cfg, - norm_cfg=out_norm_cfg, - order=out_conv_order) - - self.input1_conv = self._build_input_conv( - out_channels, input_conv_cfg, - input_norm_cfg) if with_input1_conv else nn.Sequential() - self.input2_conv = self._build_input_conv( - out_channels, input_conv_cfg, - input_norm_cfg) if with_input2_conv else nn.Sequential() - - def _build_input_conv(self, channel, conv_cfg, norm_cfg): - return ConvModule( - channel, - channel, - 3, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True) - - @abstractmethod - def _binary_op(self, x1, x2): - pass - - def _resize(self, x, size): - if x.shape[-2:] == size: - return x - elif x.shape[-2:] < size: - return F.interpolate(x, size=size, mode=self.upsample_mode) - else: - assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0 - kernel_size = x.shape[-1] // size[-1] - x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size) - return x - - def forward(self, x1, x2, out_size=None): - assert x1.shape[:2] == x2.shape[:2] - assert out_size is None or len(out_size) == 2 - if out_size is None: # resize to larger one - out_size = max(x1.size()[2:], x2.size()[2:]) - - x1 = self.input1_conv(x1) - x2 = self.input2_conv(x2) - - x1 = self._resize(x1, out_size) - x2 = self._resize(x2, out_size) - - x = self._binary_op(x1, x2) - if self.with_out_conv: - x = self.out_conv(x) - return x - - -class SumCell(BaseMergeCell): - - def __init__(self, in_channels, out_channels, **kwargs): - super(SumCell, self).__init__(in_channels, out_channels, **kwargs) - - def _binary_op(self, x1, x2): - return x1 + x2 - - -class ConcatCell(BaseMergeCell): - - def __init__(self, in_channels, out_channels, **kwargs): - super(ConcatCell, self).__init__(in_channels * 2, out_channels, - **kwargs) - - def _binary_op(self, x1, x2): - ret = torch.cat([x1, x2], dim=1) - return ret - - -class GlobalPoolingCell(BaseMergeCell): - - def __init__(self, in_channels=None, out_channels=None, **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.global_pool = nn.AdaptiveAvgPool2d((1, 1)) - - def _binary_op(self, x1, x2): - x2_att = self.global_pool(x2).sigmoid() - return x2 + x2_att * x1 diff --git a/mmdet/ops/nms/__init__.py b/mmdet/ops/nms/__init__.py deleted file mode 100644 index 4440a5056b4..00000000000 --- a/mmdet/ops/nms/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .nms_wrapper import batched_nms, nms, nms_match, soft_nms - -__all__ = ['nms', 'soft_nms', 'batched_nms', 'nms_match'] diff --git a/mmdet/ops/nms/nms_wrapper.py b/mmdet/ops/nms/nms_wrapper.py deleted file mode 100644 index 867693b3a9d..00000000000 --- a/mmdet/ops/nms/nms_wrapper.py +++ /dev/null @@ -1,190 +0,0 @@ -import numpy as np -import torch - -from . import nms_ext - - -def nms(dets, iou_thr, device_id=None): - """Dispatch to either CPU or GPU NMS implementations. 
- - The input can be either a torch tensor or numpy array. GPU NMS will be used - if the input is a gpu tensor or device_id is specified, otherwise CPU NMS - will be used. The returned type will always be the same as inputs. - - Arguments: - dets (torch.Tensor or np.ndarray): bboxes with scores. - iou_thr (float): IoU threshold for NMS. - device_id (int, optional): when `dets` is a numpy array, if `device_id` - is None, then cpu nms is used, otherwise gpu_nms will be used. - - Returns: - tuple: kept bboxes and indice, which is always the same data type as - the input. - - Example: - >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9], - >>> [49.3, 32.9, 51.0, 35.3, 0.9], - >>> [49.2, 31.8, 51.0, 35.4, 0.5], - >>> [35.1, 11.5, 39.1, 15.7, 0.5], - >>> [35.6, 11.8, 39.3, 14.2, 0.5], - >>> [35.3, 11.5, 39.9, 14.5, 0.4], - >>> [35.2, 11.7, 39.7, 15.7, 0.3]], dtype=np.float32) - >>> iou_thr = 0.6 - >>> suppressed, inds = nms(dets, iou_thr) - >>> assert len(inds) == len(suppressed) == 3 - """ - # convert dets (tensor or numpy array) to tensor - if isinstance(dets, torch.Tensor): - is_numpy = False - dets_th = dets - elif isinstance(dets, np.ndarray): - is_numpy = True - device = 'cpu' if device_id is None else f'cuda:{device_id}' - dets_th = torch.from_numpy(dets).to(device) - else: - raise TypeError('dets must be either a Tensor or numpy array, ' - f'but got {type(dets)}') - - # execute cpu or cuda nms - if dets_th.shape[0] == 0: - inds = dets_th.new_zeros(0, dtype=torch.long) - else: - if dets_th.is_cuda: - inds = nms_ext.nms(dets_th, iou_thr) - else: - inds = nms_ext.nms(dets_th, iou_thr) - - if is_numpy: - inds = inds.cpu().numpy() - return dets[inds, :], inds - - -def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3): - """Dispatch to only CPU Soft NMS implementations. - - The input can be either a torch tensor or numpy array. - The returned type will always be the same as inputs. - - Arguments: - dets (torch.Tensor or np.ndarray): bboxes with scores. - iou_thr (float): IoU threshold for Soft NMS. - method (str): either 'linear' or 'gaussian' - sigma (float): hyperparameter for gaussian method - min_score (float): score filter threshold - - Returns: - tuple: new det bboxes and indice, which is always the same - data type as the input. 
- - Example: - >>> dets = np.array([[4., 3., 5., 3., 0.9], - >>> [4., 3., 5., 4., 0.9], - >>> [3., 1., 3., 1., 0.5], - >>> [3., 1., 3., 1., 0.5], - >>> [3., 1., 3., 1., 0.4], - >>> [3., 1., 3., 1., 0.0]], dtype=np.float32) - >>> iou_thr = 0.6 - >>> new_dets, inds = soft_nms(dets, iou_thr, sigma=0.5) - >>> assert len(inds) == len(new_dets) == 5 - """ - # convert dets (tensor or numpy array) to tensor - if isinstance(dets, torch.Tensor): - is_tensor = True - dets_t = dets.detach().cpu() - elif isinstance(dets, np.ndarray): - is_tensor = False - dets_t = torch.from_numpy(dets) - else: - raise TypeError('dets must be either a Tensor or numpy array, ' - f'but got {type(dets)}') - - method_codes = {'linear': 1, 'gaussian': 2} - if method not in method_codes: - raise ValueError(f'Invalid method for SoftNMS: {method}') - results = nms_ext.soft_nms(dets_t, iou_thr, method_codes[method], sigma, - min_score) - - new_dets = results[:, :5] - inds = results[:, 5] - - if is_tensor: - return new_dets.to( - device=dets.device, dtype=dets.dtype), inds.to( - device=dets.device, dtype=torch.long) - else: - return new_dets.numpy().astype(dets.dtype), inds.numpy().astype( - np.int64) - - -def batched_nms(bboxes, scores, inds, nms_cfg, class_agnostic=False): - """Performs non-maximum suppression in a batched fashion. - - Modified from https://github.com/pytorch/vision/blob - /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. - In order to perform NMS independently per class, we add an offset to all - the boxes. The offset is dependent only on the class idx, and is large - enough so that boxes from different classes do not overlap. - - Arguments: - bboxes (torch.Tensor): bboxes in shape (N, 4). - scores (torch.Tensor): scores in shape (N, ). - inds (torch.Tensor): each index value correspond to a bbox cluster, - and NMS will not be applied between elements of different inds, - shape (N, ). - nms_cfg (dict): specify nms type and class_agnostic as well as other - parameters like iou_thr. - class_agnostic (bool): if true, nms is class agnostic, - i.e. IoU thresholding happens over all bboxes, - regardless of the predicted class - - Returns: - tuple: kept bboxes and indice. - """ - nms_cfg_ = nms_cfg.copy() - class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic) - if class_agnostic: - bboxes_for_nms = bboxes - else: - max_coordinate = bboxes.max() - offsets = inds.to(bboxes) * (max_coordinate + 1) - bboxes_for_nms = bboxes + offsets[:, None] - nms_type = nms_cfg_.pop('type', 'nms') - nms_op = eval(nms_type) - dets, keep = nms_op( - torch.cat([bboxes_for_nms, scores[:, None]], -1), **nms_cfg_) - bboxes = bboxes[keep] - scores = dets[:, -1] - return torch.cat([bboxes, scores[:, None]], -1), keep - - -def nms_match(dets, thresh): - """Matched dets into different groups by NMS. - - NMS match is Similar to NMS but when a bbox is suppressed, nms match will - record the indice of supporessed bbox and form a group with the indice of - kept bbox. In each group, indice is sorted as score order. - - Arguments: - dets (torch.Tensor | np.ndarray): Det bboxes with scores, shape (N, 5). - iou_thr (float): IoU thresh for NMS. - - Returns: - List[Tensor | ndarray]: The outer list corresponds different matched - group, the inner Tensor corresponds the indices for a group in - score order. 
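The class-offset trick in the removed batched_nms above is the part worth spelling out: every box is shifted by class_idx * (max_coordinate + 1), so boxes of different classes can never overlap, and one class-agnostic NMS pass then behaves like per-class NMS. A minimal standalone sketch, using torchvision.ops.nms as a stand-in for the removed op (names and numbers are illustrative):

import torch
from torchvision.ops import nms  # stand-in for the removed mmdet nms

def batched_nms_sketch(boxes, scores, idxs, iou_threshold=0.5):
    # Offset each box by class index * (max coordinate + 1) so that boxes
    # of different classes land in disjoint regions, then run a single
    # class-agnostic NMS over the shifted boxes.
    if boxes.numel() == 0:
        return boxes.new_zeros((0, ), dtype=torch.long)
    max_coordinate = boxes.max()
    offsets = idxs.to(boxes) * (max_coordinate + 1)
    return nms(boxes + offsets[:, None], scores, iou_threshold)

# Two identical boxes with different class labels are both kept.
boxes = torch.tensor([[0., 0., 10., 10.], [0., 0., 10., 10.]])
scores = torch.tensor([0.9, 0.8])
idxs = torch.tensor([0, 1])
assert batched_nms_sketch(boxes, scores, idxs).numel() == 2
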
- """ - if dets.shape[0] == 0: - matched = [] - else: - assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \ - f'but get {dets.shape}' - if isinstance(dets, torch.Tensor): - dets_t = dets.detach().cpu() - else: - dets_t = torch.from_numpy(dets) - matched = nms_ext.nms_match(dets_t, thresh) - - if isinstance(dets, torch.Tensor): - return [dets.new_tensor(m, dtype=torch.long) for m in matched] - else: - return [np.array(m, dtype=np.int) for m in matched] diff --git a/mmdet/ops/nms/src/cpu/nms_cpu.cpp b/mmdet/ops/nms/src/cpu/nms_cpu.cpp deleted file mode 100644 index 230657e9774..00000000000 --- a/mmdet/ops/nms/src/cpu/nms_cpu.cpp +++ /dev/null @@ -1,293 +0,0 @@ -// Soft-NMS is added by MMDetection. -// Modified from -// https://github.com/bharatsingh430/soft-nms/blob/master/lib/nms/cpu_nms.pyx. -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -#include - -template -at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) { - AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor"); - - if (dets.numel() == 0) { - return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); - } - - auto x1_t = dets.select(1, 0).contiguous(); - auto y1_t = dets.select(1, 1).contiguous(); - auto x2_t = dets.select(1, 2).contiguous(); - auto y2_t = dets.select(1, 3).contiguous(); - auto scores = dets.select(1, 4).contiguous(); - - at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); - - auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); - - auto ndets = dets.size(0); - at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); - at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); - - auto suppressed = suppressed_t.data_ptr(); - auto keep = keep_t.data_ptr(); - auto order = order_t.data_ptr(); - auto x1 = x1_t.data_ptr(); - auto y1 = y1_t.data_ptr(); - auto x2 = x2_t.data_ptr(); - auto y2 = y2_t.data_ptr(); - auto areas = areas_t.data_ptr(); - - int64_t num_to_keep = 0; - - for (int64_t _i = 0; _i < ndets; _i++) { - auto i = order[_i]; - if (suppressed[i] == 1) continue; - keep[num_to_keep++] = i; - auto ix1 = x1[i]; - auto iy1 = y1[i]; - auto ix2 = x2[i]; - auto iy2 = y2[i]; - auto iarea = areas[i]; - - for (int64_t _j = _i + 1; _j < ndets; _j++) { - auto j = order[_j]; - if (suppressed[j] == 1) continue; - auto xx1 = std::max(ix1, x1[j]); - auto yy1 = std::max(iy1, y1[j]); - auto xx2 = std::min(ix2, x2[j]); - auto yy2 = std::min(iy2, y2[j]); - - auto w = std::max(static_cast(0), xx2 - xx1); - auto h = std::max(static_cast(0), yy2 - yy1); - auto inter = w * h; - auto ovr = inter / (iarea + areas[j] - inter); - if (ovr > threshold) suppressed[j] = 1; - } - } - return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); -} - -at::Tensor nms_cpu(const at::Tensor& dets, const float threshold) { - at::Tensor result; - AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { - result = nms_cpu_kernel(dets, threshold); - }); - return result; -} - -template -at::Tensor soft_nms_cpu_kernel(const at::Tensor& dets, const float threshold, - const unsigned char method, const float sigma, - const float min_score) { - AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor"); - - if (dets.numel() == 0) { - return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); - } - - auto x1_t = dets.select(1, 0).contiguous(); - auto y1_t = dets.select(1, 1).contiguous(); - auto x2_t = dets.select(1, 2).contiguous(); - auto y2_t = dets.select(1, 3).contiguous(); - auto 
scores_t = dets.select(1, 4).contiguous(); - - at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); - - auto ndets = dets.size(0); - auto x1 = x1_t.data_ptr(); - auto y1 = y1_t.data_ptr(); - auto x2 = x2_t.data_ptr(); - auto y2 = y2_t.data_ptr(); - auto scores = scores_t.data_ptr(); - auto areas = areas_t.data_ptr(); - - int64_t pos = 0; - at::Tensor inds_t = at::arange(ndets, dets.options()); - auto inds = inds_t.data_ptr(); - - for (int64_t i = 0; i < ndets; i++) { - auto max_score = scores[i]; - auto max_pos = i; - - auto ix1 = x1[i]; - auto iy1 = y1[i]; - auto ix2 = x2[i]; - auto iy2 = y2[i]; - auto iscore = scores[i]; - auto iarea = areas[i]; - auto iind = inds[i]; - - pos = i + 1; - // get max box - while (pos < ndets) { - if (max_score < scores[pos]) { - max_score = scores[pos]; - max_pos = pos; - } - pos = pos + 1; - } - // add max box as a detection - x1[i] = x1[max_pos]; - y1[i] = y1[max_pos]; - x2[i] = x2[max_pos]; - y2[i] = y2[max_pos]; - scores[i] = scores[max_pos]; - areas[i] = areas[max_pos]; - inds[i] = inds[max_pos]; - - // swap ith box with position of max box - x1[max_pos] = ix1; - y1[max_pos] = iy1; - x2[max_pos] = ix2; - y2[max_pos] = iy2; - scores[max_pos] = iscore; - areas[max_pos] = iarea; - inds[max_pos] = iind; - - ix1 = x1[i]; - iy1 = y1[i]; - ix2 = x2[i]; - iy2 = y2[i]; - iscore = scores[i]; - iarea = areas[i]; - - pos = i + 1; - // NMS iterations, note that N changes if detection boxes fall below - // threshold - while (pos < ndets) { - auto xx1 = std::max(ix1, x1[pos]); - auto yy1 = std::max(iy1, y1[pos]); - auto xx2 = std::min(ix2, x2[pos]); - auto yy2 = std::min(iy2, y2[pos]); - - auto w = std::max(static_cast(0), xx2 - xx1); - auto h = std::max(static_cast(0), yy2 - yy1); - auto inter = w * h; - auto ovr = inter / (iarea + areas[pos] - inter); - - scalar_t weight = 1.; - if (method == 1) { - if (ovr > threshold) weight = 1 - ovr; - } else if (method == 2) { - weight = std::exp(-(ovr * ovr) / sigma); - } else { - // original NMS - if (ovr > threshold) { - weight = 0; - } else { - weight = 1; - } - } - scores[pos] = weight * scores[pos]; - // if box score falls below threshold, discard the box by - // swapping with last box update N - if (scores[pos] < min_score) { - x1[pos] = x1[ndets - 1]; - y1[pos] = y1[ndets - 1]; - x2[pos] = x2[ndets - 1]; - y2[pos] = y2[ndets - 1]; - scores[pos] = scores[ndets - 1]; - areas[pos] = areas[ndets - 1]; - inds[pos] = inds[ndets - 1]; - ndets = ndets - 1; - pos = pos - 1; - } - pos = pos + 1; - } - } - at::Tensor result = at::zeros({6, ndets}, dets.options()); - result[0] = x1_t.slice(0, 0, ndets); - result[1] = y1_t.slice(0, 0, ndets); - result[2] = x2_t.slice(0, 0, ndets); - result[3] = y2_t.slice(0, 0, ndets); - result[4] = scores_t.slice(0, 0, ndets); - result[5] = inds_t.slice(0, 0, ndets); - - result = result.t().contiguous(); - return result; -} - -at::Tensor soft_nms_cpu(const at::Tensor& dets, const float threshold, - const unsigned char method, const float sigma, - const float min_score) { - at::Tensor result; - AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] { - result = soft_nms_cpu_kernel(dets, threshold, method, sigma, - min_score); - }); - return result; -} - - -template -std::vector > nms_match_cpu_kernel(const at::Tensor& dets, - const float threshold) { - AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); - - auto x1_t = dets.select(1, 0).contiguous(); - auto y1_t = dets.select(1, 1).contiguous(); - auto x2_t = dets.select(1, 2).contiguous(); - auto y2_t = dets.select(1, 
3).contiguous(); - auto scores = dets.select(1, 4).contiguous(); - - at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); - - auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); - - auto ndets = dets.size(0); - at::Tensor suppressed_t = - at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); - - auto suppressed = suppressed_t.data_ptr(); - auto order = order_t.data_ptr(); - auto x1 = x1_t.data_ptr(); - auto y1 = y1_t.data_ptr(); - auto x2 = x2_t.data_ptr(); - auto y2 = y2_t.data_ptr(); - auto areas = areas_t.data_ptr(); - - std::vector keep; - std::vector > matched; - - for (int64_t _i = 0; _i < ndets; _i++) { - auto i = order[_i]; - if (suppressed[i] == 1) continue; - keep.push_back(i); - std::vector v_i; - auto ix1 = x1[i]; - auto iy1 = y1[i]; - auto ix2 = x2[i]; - auto iy2 = y2[i]; - auto iarea = areas[i]; - - for (int64_t _j = _i + 1; _j < ndets; _j++) { - auto j = order[_j]; - if (suppressed[j] == 1) continue; - auto xx1 = std::max(ix1, x1[j]); - auto yy1 = std::max(iy1, y1[j]); - auto xx2 = std::min(ix2, x2[j]); - auto yy2 = std::min(iy2, y2[j]); - - auto w = std::max(static_cast(0), xx2 - xx1); - auto h = std::max(static_cast(0), yy2 - yy1); - auto inter = w * h; - auto ovr = inter / (iarea + areas[j] - inter); - if (ovr >= threshold) { - suppressed[j] = 1; - v_i.push_back(j); - } - } - matched.push_back(v_i); - } - for (size_t i = 0; i < keep.size(); i++) - matched[i].insert(matched[i].begin(), keep[i]); - return matched; -} - -std::vector > nms_match_cpu(const at::Tensor& dets, - const float threshold) { - std::vector > result; - // result = nms_match_cpu_kernel(dets, threshold); - AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_match", [&] { - result = nms_match_cpu_kernel(dets, threshold); - }); - return result; -} diff --git a/mmdet/ops/nms/src/cuda/nms_cuda.cpp b/mmdet/ops/nms/src/cuda/nms_cuda.cpp deleted file mode 100644 index d46b8166904..00000000000 --- a/mmdet/ops/nms/src/cuda/nms_cuda.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -#include - -#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") - -at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh); - -at::Tensor nms_cuda(const at::Tensor& dets, const float threshold) { - CHECK_CUDA(dets); - if (dets.numel() == 0) - return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); - return nms_cuda_forward(dets, threshold); -} diff --git a/mmdet/ops/nms/src/cuda/nms_kernel.cu b/mmdet/ops/nms/src/cuda/nms_kernel.cu deleted file mode 100644 index bb6d18abcfa..00000000000 --- a/mmdet/ops/nms/src/cuda/nms_kernel.cu +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
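The CPU soft-NMS kernel a little further above decays the scores of overlapping boxes instead of dropping them outright; the method code (1 = linear, 2 = gaussian, anything else = plain hard NMS) only changes the decay function, and boxes whose decayed score falls below min_score are removed. The score-update rule in isolation, as a small sketch mirroring the C++ branches:

import math

def soft_nms_weight(iou, method='linear', iou_thr=0.3, sigma=0.5):
    # Decay factor applied to the score of a box that overlaps the current
    # top-scoring box by `iou` (mirrors the removed soft_nms_cpu_kernel).
    if method == 'linear':
        return 1.0 - iou if iou > iou_thr else 1.0
    if method == 'gaussian':
        return math.exp(-(iou * iou) / sigma)
    return 0.0 if iou > iou_thr else 1.0   # plain NMS: hard suppression

# A box overlapping the kept box with IoU 0.6 keeps 40% of its score under
# the linear rule and about 49% under the gaussian rule with sigma = 0.5.
assert abs(soft_nms_weight(0.6, 'linear') - 0.4) < 1e-6
assert abs(soft_nms_weight(0.6, 'gaussian') - math.exp(-0.72)) < 1e-6
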
-#include -#include -#include - -#include -#include - -#include -#include - -int const threadsPerBlock = sizeof(unsigned long long) * 8; - -__device__ inline float devIoU(float const * const a, float const * const b) { - float left = max(a[0], b[0]), right = min(a[2], b[2]); - float top = max(a[1], b[1]), bottom = min(a[3], b[3]); - float width = max(right - left, 0.f), height = max(bottom - top, 0.f); - float interS = width * height; - float Sa = (a[2] - a[0]) * (a[3] - a[1]); - float Sb = (b[2] - b[0]) * (b[3] - b[1]); - return interS / (Sa + Sb - interS); -} - -__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, - const float *dev_boxes, unsigned long long *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = - min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); - - __shared__ float block_boxes[threadsPerBlock * 5]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; - block_boxes[threadIdx.x * 5 + 1] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; - block_boxes[threadIdx.x * 5 + 2] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; - block_boxes[threadIdx.x * 5 + 3] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; - block_boxes[threadIdx.x * 5 + 4] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; - const float *cur_box = dev_boxes + cur_box_idx * 5; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { - t |= 1ULL << i; - } - } - const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); - dev_mask[cur_box_idx * col_blocks + col_start] = t; - } -} - -// boxes is a N x 5 tensor -at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) { - - // Ensure CUDA uses the input tensor device. 
- at::DeviceGuard guard(boxes.device()); - - using scalar_t = float; - AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor"); - auto scores = boxes.select(1, 4); - auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); - auto boxes_sorted = boxes.index_select(0, order_t); - - int boxes_num = boxes.size(0); - - const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); - - scalar_t* boxes_dev = boxes_sorted.data_ptr(); - - THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState - - unsigned long long* mask_dev = NULL; - //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, - // boxes_num * col_blocks * sizeof(unsigned long long))); - - mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); - - dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), - THCCeilDiv(boxes_num, threadsPerBlock)); - dim3 threads(threadsPerBlock); - nms_kernel<<>>(boxes_num, - nms_overlap_thresh, - boxes_dev, - mask_dev); - - std::vector mask_host(boxes_num * col_blocks); - THCudaCheck(cudaMemcpyAsync( - &mask_host[0], - mask_dev, - sizeof(unsigned long long) * boxes_num * col_blocks, - cudaMemcpyDeviceToHost, - at::cuda::getCurrentCUDAStream() - )); - - std::vector remv(col_blocks); - memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); - - at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); - int64_t* keep_out = keep.data_ptr(); - - int num_to_keep = 0; - for (int i = 0; i < boxes_num; i++) { - int nblock = i / threadsPerBlock; - int inblock = i % threadsPerBlock; - - if (!(remv[nblock] & (1ULL << inblock))) { - keep_out[num_to_keep++] = i; - unsigned long long *p = &mask_host[0] + i * col_blocks; - for (int j = nblock; j < col_blocks; j++) { - remv[j] |= p[j]; - } - } - } - - THCudaFree(state, mask_dev); - // TODO improve this part - return order_t.index({ - keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( - order_t.device(), keep.scalar_type())}); -} diff --git a/mmdet/ops/nms/src/nms_ext.cpp b/mmdet/ops/nms/src/nms_ext.cpp deleted file mode 100644 index 2a4402eeb17..00000000000 --- a/mmdet/ops/nms/src/nms_ext.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Modified from https://github.com/bharatsingh430/soft-nms/blob/master/lib/nms/cpu_nms.pyx, Soft-NMS is added -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
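The nms_kernel above records, for every box in descending score order, a bit mask over the later boxes it overlaps beyond the threshold, packed 64 boxes per unsigned long long; the host then walks the boxes in order and keeps one only if no previously kept box has flagged it. A commented transcription of that host-side reduction in plain Python (the mask is taken as a list of rows of 64-bit integers; names are illustrative):

def reduce_nms_mask(mask, n_boxes, threads_per_block=64):
    # mask[i][j] has bit k set iff box i suppresses box j * threads_per_block + k
    # (boxes are assumed already sorted by descending score).
    col_blocks = (n_boxes + threads_per_block - 1) // threads_per_block
    remv = [0] * col_blocks          # accumulated suppression bits
    keep = []
    for i in range(n_boxes):
        nblock, inblock = divmod(i, threads_per_block)
        if not ((remv[nblock] >> inblock) & 1):   # box i still alive
            keep.append(i)
            for j in range(nblock, col_blocks):   # mark what box i suppresses
                remv[j] |= mask[i][j]
    return keep

# Toy check: box 0 suppresses box 1, so only box 0 survives.
assert reduce_nms_mask([[1 << 1], [0]], n_boxes=2) == [0]
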
-#include
-
-at::Tensor nms_cpu(const at::Tensor& dets, const float threshold);
-
-at::Tensor soft_nms_cpu(const at::Tensor& dets, const float threshold,
-                        const unsigned char method, const float sigma, const
-                        float min_score);
-
-std::vector<std::vector<int> > nms_match_cpu(const at::Tensor& dets, const float threshold);
-
-
-#ifdef WITH_CUDA
-at::Tensor nms_cuda(const at::Tensor& dets, const float threshold);
-#endif
-
-at::Tensor nms(const at::Tensor& dets, const float threshold){
-  if (dets.device().is_cuda()) {
-#ifdef WITH_CUDA
-    return nms_cuda(dets, threshold);
-#else
-    AT_ERROR("nms is not compiled with GPU support");
-#endif
-  }
-  return nms_cpu(dets, threshold);
-}
-
-at::Tensor soft_nms(const at::Tensor& dets, const float threshold,
-                    const unsigned char method, const float sigma, const
-                    float min_score) {
-  if (dets.device().is_cuda()) {
-    AT_ERROR("soft_nms is not implemented on GPU");
-  }
-  return soft_nms_cpu(dets, threshold, method, sigma, min_score);
-}
-
-std::vector<std::vector<int> > nms_match(const at::Tensor& dets, const float threshold) {
-  if (dets.type().is_cuda()) {
-    AT_ERROR("nms_match is not implemented on GPU");
-  }
-  return nms_match_cpu(dets, threshold);
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("nms", &nms, "non-maximum suppression");
-  m.def("soft_nms", &soft_nms, "soft non-maximum suppression");
-  m.def("nms_match", &nms_match, "non-maximum suppression match");
-}
diff --git a/mmdet/ops/non_local.py b/mmdet/ops/non_local.py
deleted file mode 100644
index 3630eb2f73a..00000000000
--- a/mmdet/ops/non_local.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import torch
-import torch.nn as nn
-from mmcv.cnn import ConvModule, constant_init, normal_init
-
-
-class NonLocal2D(nn.Module):
-    """Non-local module.
-
-    See https://arxiv.org/abs/1711.07971 for details.
-
-    Args:
-        in_channels (int): Channels of the input feature map.
-        reduction (int): Channel reduction ratio.
-        use_scale (bool): Whether to scale pairwise_weight by 1/inter_channels.
-        conv_cfg (dict): The config dict for convolution layers.
-            (only applicable to conv_out)
-        norm_cfg (dict): The config dict for normalization layers.
-            (only applicable to conv_out)
-        mode (str): Options are `embedded_gaussian` and `dot_product`.
-    """
-
-    def __init__(self,
-                 in_channels,
-                 reduction=2,
-                 use_scale=True,
-                 conv_cfg=None,
-                 norm_cfg=None,
-                 mode='embedded_gaussian'):
-        super(NonLocal2D, self).__init__()
-        self.in_channels = in_channels
-        self.reduction = reduction
-        self.use_scale = use_scale
-        self.inter_channels = in_channels // reduction
-        self.mode = mode
-        assert mode in ['embedded_gaussian', 'dot_product']
-
-        # g, theta, phi are actually `nn.Conv2d`. Here we use ConvModule for
-        # potential usage.
- self.g = ConvModule( - self.in_channels, self.inter_channels, kernel_size=1, act_cfg=None) - self.theta = ConvModule( - self.in_channels, self.inter_channels, kernel_size=1, act_cfg=None) - self.phi = ConvModule( - self.in_channels, self.inter_channels, kernel_size=1, act_cfg=None) - self.conv_out = ConvModule( - self.inter_channels, - self.in_channels, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - self.init_weights() - - def init_weights(self, std=0.01, zeros_init=True): - for m in [self.g, self.theta, self.phi]: - normal_init(m.conv, std=std) - if zeros_init: - constant_init(self.conv_out.conv, 0) - else: - normal_init(self.conv_out.conv, std=std) - - def embedded_gaussian(self, theta_x, phi_x): - # pairwise_weight: [N, HxW, HxW] - pairwise_weight = torch.matmul(theta_x, phi_x) - if self.use_scale: - # theta_x.shape[-1] is `self.inter_channels` - pairwise_weight /= theta_x.shape[-1]**0.5 - pairwise_weight = pairwise_weight.softmax(dim=-1) - return pairwise_weight - - def dot_product(self, theta_x, phi_x): - # pairwise_weight: [N, HxW, HxW] - pairwise_weight = torch.matmul(theta_x, phi_x) - pairwise_weight /= pairwise_weight.shape[-1] - return pairwise_weight - - def forward(self, x): - n, _, h, w = x.shape - - # g_x: [N, HxW, C] - g_x = self.g(x).view(n, self.inter_channels, -1) - g_x = g_x.permute(0, 2, 1) - - # theta_x: [N, HxW, C] - theta_x = self.theta(x).view(n, self.inter_channels, -1) - theta_x = theta_x.permute(0, 2, 1) - - # phi_x: [N, C, HxW] - phi_x = self.phi(x).view(n, self.inter_channels, -1) - - pairwise_func = getattr(self, self.mode) - # pairwise_weight: [N, HxW, HxW] - pairwise_weight = pairwise_func(theta_x, phi_x) - - # y: [N, HxW, C] - y = torch.matmul(pairwise_weight, g_x) - # y: [N, C, H, W] - y = y.permute(0, 2, 1).reshape(n, self.inter_channels, h, w) - - output = x + self.conv_out(y) - - return output diff --git a/mmdet/ops/plugin.py b/mmdet/ops/plugin.py deleted file mode 100644 index 5189d71e5a8..00000000000 --- a/mmdet/ops/plugin.py +++ /dev/null @@ -1,44 +0,0 @@ -from mmcv.cnn import ConvModule - -from .context_block import ContextBlock -from .generalized_attention import GeneralizedAttention -from .non_local import NonLocal2D - -plugin_cfg = { - # format: layer_type: (abbreviation, module) - 'ContextBlock': ('context_block', ContextBlock), - 'GeneralizedAttention': ('gen_attention_block', GeneralizedAttention), - 'NonLocal2D': ('nonlocal_block', NonLocal2D), - 'ConvModule': ('conv_block', ConvModule), -} - - -def build_plugin_layer(cfg, postfix='', **kwargs): - """Build plugin layer. - - Args: - cfg (None or dict): cfg should contain: - type (str): identify plugin layer type. - layer args: args needed to instantiate a plugin layer. - postfix (int, str): appended into norm abbreviation to - create named layer. 
- - Returns: - name (str): abbreviation + postfix - layer (nn.Module): created plugin layer - """ - assert isinstance(cfg, dict) and 'type' in cfg - cfg_ = cfg.copy() - - layer_type = cfg_.pop('type') - if layer_type not in plugin_cfg: - raise KeyError(f'Unrecognized plugin type {layer_type}') - else: - abbr, plugin_layer = plugin_cfg[layer_type] - - assert isinstance(postfix, (int, str)) - name = abbr + str(postfix) - - layer = plugin_layer(**kwargs, **cfg_) - - return name, layer diff --git a/mmdet/ops/point_sample.py b/mmdet/ops/point_sample.py deleted file mode 100644 index f739bca4466..00000000000 --- a/mmdet/ops/point_sample.py +++ /dev/null @@ -1,218 +0,0 @@ -# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.modules.utils import _pair - - -def normalize(grid): - """Normalize input grid from [-1, 1] to [0, 1] - - Args: - grid (Tensor): The grid to be normalize, range [-1, 1]. - - Returns: - Tensor: Normalized grid, range [0, 1]. - """ - - return (grid + 1.0) / 2.0 - - -def denormalize(grid): - """Denormalize input grid from range [0, 1] to [-1, 1] - Args: - grid (Tensor): The grid to be denormalize, range [0, 1]. - - Returns: - Tensor: Denormalized grid, range [-1, 1]. - """ - - return grid * 2.0 - 1.0 - - -def generate_grid(num_grid, size, device): - """Generate regular square grid of points in [0, 1] x [0, 1] coordinate - space. - - Args: - num_grid (int): The number of grids to sample, one for each region. - size (tuple(int, int)): The side size of the regular grid. - device (torch.device): Desired device of returned tensor. - - Returns: - (torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that - contains coordinates for the regular grids. - """ - - affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device) - grid = F.affine_grid( - affine_trans, torch.Size((1, 1, *size)), align_corners=False) - grid = normalize(grid) - return grid.view(1, -1, 2).expand(num_grid, -1, -1) - - -def rel_roi_point_to_abs_img_point(rois, rel_roi_points): - """Convert roi based relative point coordinates to image based absolute - point coordinates. - - Args: - rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) - rel_roi_points (Tensor): Point coordinates inside RoI, relative to - RoI, location, range (0, 1), shape (N, P, 2) - - Returns: - Tensor: Image based absolute point coordinates, shape (N, P, 2) - """ - - with torch.no_grad(): - assert rel_roi_points.size(0) == rois.size(0) - assert rois.dim() == 2 - assert rel_roi_points.dim() == 3 - assert rel_roi_points.size(2) == 2 - # remove batch idx - if rois.size(1) == 5: - rois = rois[:, 1:] - abs_img_points = rel_roi_points.clone() - abs_img_points[:, :, 0] = abs_img_points[:, :, 0] * ( - rois[:, None, 2] - rois[:, None, 0]) - abs_img_points[:, :, 1] = abs_img_points[:, :, 1] * ( - rois[:, None, 3] - rois[:, None, 1]) - abs_img_points[:, :, 0] += rois[:, None, 0] - abs_img_points[:, :, 1] += rois[:, None, 1] - return abs_img_points - - -def abs_img_point_to_rel_img_point(abs_img_points, - img_shape, - spatial_scale=1.): - """Convert image based absolute point coordinates to image based relative - coordinates for sampling. - - Args: - abs_img_points (Tensor): Image based absolute point coordinates, - shape (N, P, 2) - img_shape (tuple): (height, width) of image or feature map. - spatial_scale (float): Scale points by this factor. Default: 1. 
- - Returns: - Tensor: Image based relative point coordinates for sampling, - shape (N, P, 2) - """ - - assert isinstance(img_shape, tuple) and len(img_shape) == 2 - h, w = img_shape - scale = torch.tensor([w, h], - dtype=torch.float, - device=abs_img_points.device) - scale = scale.view(1, 1, 2) - rel_img_points = abs_img_points / scale * spatial_scale - - return rel_img_points - - -def rel_roi_point_to_rel_img_point(rois, - rel_roi_points, - img_shape, - spatial_scale=1.): - """Convert roi based relative point coordinates to image based absolute - point coordinates. - - Args: - rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) - rel_roi_points (Tensor): Point coordinates inside RoI, relative to - RoI, location, range (0, 1), shape (N, P, 2) - img_shape (tuple): (height, width) of image or feature map. - spatial_scale (float): Scale points by this factor. Default: 1. - - Returns: - Tensor: Image based relative point coordinates for sampling, - shape (N, P, 2) - """ - - abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points) - rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img_shape, - spatial_scale) - - return rel_img_point - - -def point_sample(input, points, align_corners=False, **kwargs): - """A wrapper around :function:`grid_sample` to support 3D point_coords - tensors Unlike :function:`torch.nn.functional.grid_sample` it assumes - point_coords to lie inside [0, 1] x [0, 1] square. - - Args: - input (Tensor): Feature map, shape (N, C, H, W). - points (Tensor): Image based absolute point coordinates (normalized), - range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2). - align_corners (bool): Whether align_corners. Default: False - - Returns: - Tensor: Features of `point` on `input`, shape (N, C, P) or - (N, C, Hgrid, Wgrid). - """ - - add_dim = False - if points.dim() == 3: - add_dim = True - points = points.unsqueeze(2) - output = F.grid_sample( - input, denormalize(points), align_corners=align_corners, **kwargs) - if add_dim: - output = output.squeeze(3) - return output - - -class SimpleRoIAlign(nn.Module): - - def __init__(self, out_size, spatial_scale, aligned=True): - """Simple RoI align in PointRend, faster than standard RoIAlign. - - Args: - out_size (tuple[int]): h, w - spatial_scale (float): scale the input boxes by this number - aligned (bool): if False, use the legacy implementation in - MMDetection, align_corners=True will be used in F.grid_sample. - If True, align the results more perfectly. 
- """ - - super(SimpleRoIAlign, self).__init__() - self.out_size = _pair(out_size) - self.spatial_scale = float(spatial_scale) - # to be consistent with other RoI ops - self.use_torchvision = False - self.aligned = aligned - - def forward(self, features, rois): - - num_imgs = features.size(0) - num_rois = rois.size(0) - rel_roi_points = generate_grid( - num_rois, self.out_size, device=rois.device) - - point_feats = [] - for batch_ind in range(num_imgs): - # unravel batch dim - feat = features[batch_ind].unsqueeze(0) - inds = (rois[:, 0].long() == batch_ind) - if inds.any(): - rel_img_points = rel_roi_point_to_rel_img_point( - rois[inds], rel_roi_points[inds], feat.shape[2:], - self.spatial_scale).unsqueeze(0) - point_feat = point_sample( - feat, rel_img_points, align_corners=not self.aligned) - point_feat = point_feat.squeeze(0).transpose(0, 1) - point_feats.append(point_feat) - - channels = features.size(1) - roi_feats = torch.cat(point_feats, dim=0) - roi_feats = roi_feats.reshape(num_rois, channels, *self.out_size) - - return roi_feats - - def __repr__(self): - format_str = self.__class__.__name__ - format_str += '(out_size={}, spatial_scale={}'.format( - self.out_size, self.spatial_scale) - return format_str diff --git a/mmdet/ops/roi_align/__init__.py b/mmdet/ops/roi_align/__init__.py deleted file mode 100644 index 6da98298fa5..00000000000 --- a/mmdet/ops/roi_align/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .roi_align import RoIAlign, roi_align - -__all__ = ['roi_align', 'RoIAlign'] diff --git a/mmdet/ops/roi_align/gradcheck.py b/mmdet/ops/roi_align/gradcheck.py deleted file mode 100644 index 136456b398b..00000000000 --- a/mmdet/ops/roi_align/gradcheck.py +++ /dev/null @@ -1,30 +0,0 @@ -import os.path as osp -import sys - -import numpy as np -import torch -from torch.autograd import gradcheck - -sys.path.append(osp.abspath(osp.join(__file__, '../../'))) -from roi_align import RoIAlign # noqa: E402, isort:skip - -feat_size = 15 -spatial_scale = 1.0 / 8 -img_size = feat_size / spatial_scale -num_imgs = 2 -num_rois = 20 - -batch_ind = np.random.randint(num_imgs, size=(num_rois, 1)) -rois = np.random.rand(num_rois, 4) * img_size * 0.5 -rois[:, 2:] += img_size * 0.5 -rois = np.hstack((batch_ind, rois)) - -feat = torch.randn( - num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0') -rois = torch.from_numpy(rois).float().cuda() -inputs = (feat, rois) -print('Gradcheck for roi align...') -test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3) -print(test) -test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3) -print(test) diff --git a/mmdet/ops/roi_align/roi_align.py b/mmdet/ops/roi_align/roi_align.py deleted file mode 100644 index 27be883b424..00000000000 --- a/mmdet/ops/roi_align/roi_align.py +++ /dev/null @@ -1,154 +0,0 @@ -from torch import nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from . 
import roi_align_ext - - -class RoIAlignFunction(Function): - - @staticmethod - def forward(ctx, - features, - rois, - out_size, - spatial_scale, - sample_num=0, - aligned=True): - out_h, out_w = _pair(out_size) - assert isinstance(out_h, int) and isinstance(out_w, int) - ctx.spatial_scale = spatial_scale - ctx.sample_num = sample_num - ctx.save_for_backward(rois) - ctx.feature_size = features.size() - ctx.aligned = aligned - - if aligned: - output = roi_align_ext.forward_v2(features, rois, spatial_scale, - out_h, out_w, sample_num, - aligned) - elif features.is_cuda: - (batch_size, num_channels, data_height, - data_width) = features.size() - num_rois = rois.size(0) - - output = features.new_zeros(num_rois, num_channels, out_h, out_w) - roi_align_ext.forward_v1(features, rois, out_h, out_w, - spatial_scale, sample_num, output) - else: - raise NotImplementedError - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - feature_size = ctx.feature_size - spatial_scale = ctx.spatial_scale - sample_num = ctx.sample_num - rois = ctx.saved_tensors[0] - aligned = ctx.aligned - assert feature_size is not None - - batch_size, num_channels, data_height, data_width = feature_size - out_w = grad_output.size(3) - out_h = grad_output.size(2) - - grad_input = grad_rois = None - if not aligned: - if ctx.needs_input_grad[0]: - grad_input = rois.new_zeros(batch_size, num_channels, - data_height, data_width) - roi_align_ext.backward_v1(grad_output.contiguous(), rois, - out_h, out_w, spatial_scale, - sample_num, grad_input) - else: - grad_input = roi_align_ext.backward_v2(grad_output, rois, - spatial_scale, out_h, out_w, - batch_size, num_channels, - data_height, data_width, - sample_num, aligned) - - return grad_input, grad_rois, None, None, None, None - - -roi_align = RoIAlignFunction.apply - - -class RoIAlign(nn.Module): - - def __init__(self, - out_size, - spatial_scale, - sample_num=0, - use_torchvision=False, - aligned=True): - """ - Args: - out_size (tuple): h, w - spatial_scale (float): scale the input boxes by this number - sample_num (int): number of inputs samples to take for each - output sample. 2 to take samples densely for current models. - use_torchvision (bool): whether to use roi_align from torchvision - aligned (bool): if False, use the legacy implementation in - MMDetection. If True, align the results more perfectly. - - Note: - The implementation of RoIAlign when aligned=True is modified from - https://github.com/facebookresearch/detectron2/ - - The meaning of aligned=True: - - Given a continuous coordinate c, its two neighboring pixel - indices (in our pixel model) are computed by floor(c - 0.5) and - ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete - indices [0] and [1] (which are sampled from the underlying signal - at continuous coordinates 0.5 and 1.5). But the original roi_align - (aligned=False) does not subtract the 0.5 when computing - neighboring pixel indices and therefore it uses pixels with a - slightly incorrect alignment (relative to our pixel model) when - performing bilinear interpolation. - - With `aligned=True`, - we first appropriately scale the ROI and then shift it by -0.5 - prior to calling roi_align. This produces the correct neighbors; - - The difference does not make a difference to the model's - performance if ROIAlign is used together with conv layers. 
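The docstring above explains the half-pixel convention behind aligned=True. The use_torchvision branch of this module calls torchvision's roi_align without that flag, but newer torchvision releases also expose an aligned keyword implementing the same convention. The snippet below is a hedged usage sketch, not part of this diff: the aligned kwarg assumes a reasonably recent torchvision, and the tensor sizes are made up. It contrasts the legacy and aligned modes on the same ROI.

import torch
from torchvision.ops import roi_align  # `aligned` kwarg assumes a recent torchvision

feat = torch.randn(1, 8, 32, 32)
# rois as (batch_idx, x1, y1, x2, y2), the same 5-column layout used above
rois = torch.tensor([[0., 4., 4., 20., 20.]])

legacy = roi_align(feat, rois, output_size=7, spatial_scale=0.5,
                   sampling_ratio=2, aligned=False)
exact = roi_align(feat, rois, output_size=7, spatial_scale=0.5,
                  sampling_ratio=2, aligned=True)
print(legacy.shape, (legacy - exact).abs().max())  # same shape, slightly different values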
- """ - super(RoIAlign, self).__init__() - self.out_size = _pair(out_size) - self.spatial_scale = float(spatial_scale) - self.aligned = aligned - self.sample_num = int(sample_num) - self.use_torchvision = use_torchvision - assert not (use_torchvision and - aligned), 'Torchvision does not support aligned RoIAlgin' - - def forward(self, features, rois): - """ - Args: - features: NCHW images - rois: Bx5 boxes. First column is the index into N. The other 4 - columns are xyxy. - """ - assert rois.dim() == 2 and rois.size(1) == 5 - - if self.use_torchvision: - from torchvision.ops import roi_align as tv_roi_align - return tv_roi_align(features, rois, self.out_size, - self.spatial_scale, self.sample_num) - else: - return roi_align(features, rois, self.out_size, self.spatial_scale, - self.sample_num, self.aligned) - - def __repr__(self): - indent_str = '\n ' - format_str = self.__class__.__name__ - format_str += f'({indent_str}out_size={self.out_size},' - format_str += f'{indent_str}spatial_scale={self.spatial_scale},' - format_str += f'{indent_str}sample_num={self.sample_num},' - format_str += f'{indent_str}use_torchvision={self.use_torchvision},' - format_str += f'{indent_str}aligned={self.aligned})' - return format_str diff --git a/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp b/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp deleted file mode 100644 index 9e01fe17da0..00000000000 --- a/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp +++ /dev/null @@ -1,404 +0,0 @@ -// Modified from -// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -#include -#include - -// implementation taken from Caffe2 -template -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - T w1; - T w2; - T w3; - T w4; -}; - -template -void pre_calc_for_bilinear_interpolate( - const int height, const int width, const int pooled_height, - const int pooled_width, const int iy_upper, const int ix_upper, - T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, - int roi_bin_grid_h, int roi_bin_grid_w, std::vector>& pre_calc) { - int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { - const T yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { - const T xx = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T x = xx; - T y = yy; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y <= 0) { - y = 0; - } - if (x <= 0) { - x = 0; - } - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indices - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; - } - } - } - } -} - -template -void ROIAlignForward(const int nthreads, const T* input, const T& spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, - const int sampling_ratio, const T* rois, T* output, - bool aligned) { - int n_rois = nthreads / channels / pooled_width / pooled_height; - // (n, c, ph, pw) is an element in the pooled output - // can be parallelized using omp - // #pragma omp parallel for num_threads(32) - for (int n = 0; n < n_rois; n++) { - int index_n = n * channels * pooled_width * pooled_height; - - const T* offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0]; - - // Do not use rounding; this implementation detail is critical - T offset = aligned ? (T)0.5 : (T)0.0; - T roi_start_w = offset_rois[1] * spatial_scale - offset; - T roi_start_h = offset_rois[2] * spatial_scale - offset; - T roi_end_w = offset_rois[3] * spatial_scale - offset; - T roi_end_h = offset_rois[4] * spatial_scale - offset; - - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - if (aligned) { - AT_ASSERTM(roi_width >= 0 && roi_height >= 0, - "ROIs in ROIAlign cannot have non-negative size!"); - } else { // for backward-compatibility only - roi_width = std::max(roi_width, (T)1.); - roi_height = std::max(roi_height, (T)1.); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - // When the grid is empty, output zeros == 0/1, instead of NaN. - const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 - - // we want to precalculate indices and weights shared by all channels, - // this is the key point of optimization - std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * - pooled_width * pooled_height); - pre_calc_for_bilinear_interpolate( - height, width, pooled_height, pooled_width, roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, - roi_bin_grid_h, roi_bin_grid_w, pre_calc); - - for (int c = 0; c < channels; c++) { - int index_n_c = index_n + c * pooled_width * pooled_height; - const T* offset_input = - input + (roi_batch_ind * channels + c) * height * width; - int pre_calc_index = 0; - - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - int index = index_n_c + ph * pooled_width + pw; - - T output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_input[pc.pos1] + - pc.w2 * offset_input[pc.pos2] + - pc.w3 * offset_input[pc.pos3] + - pc.w4 * offset_input[pc.pos4]; - - pre_calc_index += 1; - } - } - output_val /= count; - - output[index] = output_val; - } // for pw - } // for ph - } // for c - } // for n -} - -template -void bilinear_interpolate_gradient(const int height, const int width, T y, T x, - T& w1, T& w2, T& w3, T& w4, int& x_low, - int& x_high, int& y_low, int& y_high, - const int index /* index for debug only*/) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - w1 = w2 = w3 = w4 = 0.; - x_low = x_high = y_low = y_high = -1; - return; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - y_low = (int)y; - x_low = (int)x; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - - // reference in forward - // T v1 = input[y_low * width + x_low]; - // T v2 = input[y_low * width + x_high]; - // T v3 = input[y_high * width + x_low]; - // T v4 = input[y_high * width + x_high]; - // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - return; -} - -template -inline void add(T* address, const T& val) { - *address += val; -} - -template -void ROIAlignBackward(const int nthreads, const T* grad_output, - const T& spatial_scale, const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const int sampling_ratio, T* grad_input, const T* rois, - const int n_stride, const int c_stride, - const int h_stride, const int w_stride, bool aligned) { - for (int index = 0; index < nthreads; index++) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0]; - - // Do not use rounding; this implementation detail is critical - T offset = aligned ? 
(T)0.5 : (T)0.0; - T roi_start_w = offset_rois[1] * spatial_scale - offset; - T roi_start_h = offset_rois[2] * spatial_scale - offset; - T roi_end_w = offset_rois[3] * spatial_scale - offset; - T roi_end_h = offset_rois[4] * spatial_scale - offset; - - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - if (aligned) { - AT_ASSERTM(roi_width >= 0 && roi_height >= 0, - "ROIs in ROIAlign do not have non-negative size!"); - } else { // for backward-compatibility only - roi_width = std::max(roi_width, (T)1.); - roi_height = std::max(roi_height, (T)1.); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_grad_input = - grad_input + ((roi_batch_ind * channels + c) * height * width); - - int output_offset = n * n_stride + c * c_stride; - const T* offset_grad_output = grad_output + output_offset; - const T grad_output_this_bin = - offset_grad_output[ph * h_stride + pw * w_stride]; - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 - - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - - bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, - x_low, x_high, y_low, y_high, index); - - T g1 = grad_output_this_bin * w1 / count; - T g2 = grad_output_this_bin * w2 / count; - T g3 = grad_output_this_bin * w3 / count; - T g4 = grad_output_this_bin * w4 / count; - - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - // atomic add is not needed for now since it is single threaded - add(offset_grad_input + y_low * width + x_low, static_cast(g1)); - add(offset_grad_input + y_low * width + x_high, static_cast(g2)); - add(offset_grad_input + y_high * width + x_low, static_cast(g3)); - add(offset_grad_input + y_high * width + x_high, static_cast(g4)); - } // if - } // ix - } // iy - } // for -} // ROIAlignBackward - -at::Tensor ROIAlignForwardV2CPULaucher(const at::Tensor& input, - const at::Tensor& rois, - const float spatial_scale, - const int pooled_height, - const int pooled_width, - const int sampling_ratio, bool aligned) { - AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); - AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); - - at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - - at::CheckedFrom c = "ROIAlignForwardV2CPULaucher"; - at::checkAllSameType(c, {input_t, rois_t}); - - auto num_rois = rois.size(0); - auto channels = input.size(1); - auto height = input.size(2); - auto width = input.size(3); - - at::Tensor output = at::zeros( - {num_rois, channels, pooled_height, pooled_width}, input.options()); - - auto output_size = num_rois * pooled_height * pooled_width * channels; - - if (output.numel() == 0) return output; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIAlign_forward", [&] { - 
ROIAlignForward( - output_size, input.contiguous().data_ptr(), spatial_scale, - channels, height, width, pooled_height, pooled_width, sampling_ratio, - rois.contiguous().data_ptr(), output.data_ptr(), aligned); - }); - return output; -} - -at::Tensor ROIAlignBackwardV2CPULaucher( - const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, - const int pooled_height, const int pooled_width, const int batch_size, - const int channels, const int height, const int width, - const int sampling_ratio, bool aligned) { - AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor"); - AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); - - at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; - - at::CheckedFrom c = "ROIAlignBackwardV2CPULaucher"; - at::checkAllSameType(c, {grad_t, rois_t}); - - at::Tensor grad_input = - at::zeros({batch_size, channels, height, width}, grad.options()); - - // handle possibly empty gradients - if (grad.numel() == 0) { - return grad_input; - } - - // get stride values to ensure indexing into gradients is correct. - int n_stride = grad.stride(0); - int c_stride = grad.stride(1); - int h_stride = grad.stride(2); - int w_stride = grad.stride(3); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIAlign_backward", [&] { - ROIAlignBackward( - grad.numel(), grad.contiguous().data_ptr(), spatial_scale, - channels, height, width, pooled_height, pooled_width, sampling_ratio, - grad_input.data_ptr(), rois.contiguous().data_ptr(), - n_stride, c_stride, h_stride, w_stride, aligned); - }); - return grad_input; -} diff --git a/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu b/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu deleted file mode 100644 index 7afa33229d8..00000000000 --- a/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu +++ /dev/null @@ -1,283 +0,0 @@ -#include -#include -#include - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - -#define THREADS_PER_BLOCK 1024 - -inline int GET_BLOCKS(const int N) { - int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - int max_block_num = 65000; - return min(optimal_block_num, max_block_num); -} - -template -__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data, - const int height, const int width, - scalar_t y, scalar_t x) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (scalar_t)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (scalar_t)x_low; - } else { - x_high = x_low + 1; - } - - scalar_t ly = y - y_low; - scalar_t lx = x - x_low; - scalar_t hy = 1. - ly; - scalar_t hx = 1. 
- lx; - // do bilinear interpolation - scalar_t lt = bottom_data[y_low * width + x_low]; - scalar_t rt = bottom_data[y_low * width + x_high]; - scalar_t lb = bottom_data[y_high * width + x_low]; - scalar_t rb = bottom_data[y_high * width + x_high]; - scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb); - - return val; -} - -template -__global__ void ROIAlignForwardV1( - const int nthreads, const scalar_t *bottom_data, - const scalar_t *bottom_rois, const scalar_t spatial_scale, - const int sample_num, const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, scalar_t *top_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the aligned output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; - scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; - scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; - scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; - - // Force malformed ROIs to be 1x1 - scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); - scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); - - scalar_t bin_size_h = roi_height / pooled_height; - scalar_t bin_size_w = roi_width / pooled_width; - - const scalar_t *offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - - int sample_num_h = (sample_num > 0) - ? sample_num - : ceil(roi_height / pooled_height); // e.g., = 2 - int sample_num_w = - (sample_num > 0) ? 
sample_num : ceil(roi_width / pooled_width); - - scalar_t output_val = 0; - for (int iy = 0; iy < sample_num_h; iy++) { - const scalar_t y = roi_start_h + ph * bin_size_h + - (scalar_t)(iy + scalar_t(.5f)) * bin_size_h / - (scalar_t)(sample_num_h); - for (int ix = 0; ix < sample_num_w; ix++) { - const scalar_t x = roi_start_w + pw * bin_size_w + - (scalar_t)(ix + scalar_t(.5f)) * bin_size_w / - (scalar_t)(sample_num_w); - scalar_t val = bilinear_interpolate(offset_bottom_data, - height, width, y, x); - output_val += val; - } - } - output_val /= (sample_num_h * sample_num_w); - top_data[index] = output_val; - } -} - -int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, - const float spatial_scale, const int sample_num, - const int channels, const int height, - const int width, const int num_rois, - const int pooled_height, const int pooled_width, - at::Tensor output) { - const int output_size = num_rois * pooled_height * pooled_width * channels; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.scalar_type(), "ROIAlignLaucherForward", ([&] { - const scalar_t *bottom_data = features.data_ptr(); - const scalar_t *rois_data = rois.data_ptr(); - scalar_t *top_data = output.data_ptr(); - - ROIAlignForwardV1 - <<>>( - output_size, bottom_data, rois_data, scalar_t(spatial_scale), - sample_num, channels, height, width, pooled_height, - pooled_width, top_data); - })); - THCudaCheck(cudaGetLastError()); - return 1; -} - -template -__device__ void bilinear_interpolate_gradient(const int height, const int width, - scalar_t y, scalar_t x, - scalar_t &w1, scalar_t &w2, - scalar_t &w3, scalar_t &w4, - int &x_low, int &x_high, - int &y_low, int &y_high) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - w1 = w2 = w3 = w4 = 0.; - x_low = x_high = y_low = y_high = -1; - return; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - y_low = (int)y; - x_low = (int)x; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (scalar_t)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (scalar_t)x_low; - } else { - x_high = x_low + 1; - } - - scalar_t ly = y - y_low; - scalar_t lx = x - x_low; - scalar_t hy = 1. - ly; - scalar_t hx = 1. 
- lx; - - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - return; -} - -template -__global__ void ROIAlignBackwardV1( - const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, - const scalar_t spatial_scale, const int sample_num, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, scalar_t *bottom_diff) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the aligned output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; - scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; - scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; - scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; - - // Force malformed ROIs to be 1x1 - scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); - scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); - - scalar_t bin_size_h = roi_height / pooled_height; - scalar_t bin_size_w = roi_width / pooled_width; - - scalar_t *offset_bottom_diff = - bottom_diff + (roi_batch_ind * channels + c) * height * width; - int offset_top = (n * channels + c) * pooled_height * pooled_width + - ph * pooled_width + pw; - scalar_t offset_top_diff = top_diff[offset_top]; - - int sample_num_h = (sample_num > 0) - ? sample_num - : ceil(roi_height / pooled_height); // e.g., = 2 - int sample_num_w = - (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); - - const scalar_t count = (scalar_t)(sample_num_h * sample_num_w); - - for (int iy = 0; iy < sample_num_h; iy++) { - const scalar_t y = - roi_start_h + ph * bin_size_h + - (scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h); - for (int ix = 0; ix < sample_num_w; ix++) { - const scalar_t x = - roi_start_w + pw * bin_size_w + - (scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w); - scalar_t w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - - bilinear_interpolate_gradient( - height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); - scalar_t g1 = offset_top_diff * w1 / count; - scalar_t g2 = offset_top_diff * w2 / count; - scalar_t g3 = offset_top_diff * w3 / count; - scalar_t g4 = offset_top_diff * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); - atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); - atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); - atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); - } - } - } - } -} - -int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, - const float spatial_scale, const int sample_num, - const int channels, const int height, - const int width, const int num_rois, - const int pooled_height, const int pooled_width, - at::Tensor bottom_grad) { - const int output_size = num_rois * pooled_height * pooled_width * channels; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] { - const scalar_t *top_diff = top_grad.data_ptr(); - const scalar_t *rois_data = rois.data_ptr(); - scalar_t *bottom_diff = bottom_grad.data_ptr(); - if (sizeof(scalar_t) == sizeof(double)) { - fprintf(stderr, 
"double is not supported\n"); - exit(-1); - } - - ROIAlignBackwardV1 - <<>>( - output_size, top_diff, rois_data, spatial_scale, sample_num, - channels, height, width, pooled_height, pooled_width, - bottom_diff); - })); - THCudaCheck(cudaGetLastError()); - return 1; -} diff --git a/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu b/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu deleted file mode 100644 index 0189323cd1e..00000000000 --- a/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu +++ /dev/null @@ -1,348 +0,0 @@ -// Modified from -// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved - -#include -#include -#include -#include - -// TODO make it in a common file -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - -template -__device__ T bilinear_interpolate(const T* bottom_data, const int height, - const int width, T y, T x, - const int index /* index for debug only*/) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - return 0; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - // do bilinear interpolation - T v1 = bottom_data[y_low * width + x_low]; - T v2 = bottom_data[y_low * width + x_high]; - T v3 = bottom_data[y_high * width + x_low]; - T v4 = bottom_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - - return val; -} - -template -__global__ void RoIAlignForwardV2( - const int nthreads, const T* bottom_data, const T spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, const int sampling_ratio, - const T* bottom_rois, T* top_data, bool aligned) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - - // Do not use rounding; this implementation detail is critical - T offset = aligned ? 
(T)0.5 : (T)0.0; - T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; - T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; - T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; - T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; - - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - if (!aligned) { // for backward-compatibility only - roi_width = max(roi_width, (T)1.); - roi_height = max(roi_height, (T)1.); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - // When the grid is empty, output zeros. - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 - - T output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 - { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, - index); - output_val += val; - } - } - output_val /= count; - - top_data[index] = output_val; - } -} - -template -__device__ void bilinear_interpolate_gradient( - const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, - int& x_low, int& x_high, int& y_low, int& y_high, - const int index /* index for debug only*/) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - w1 = w2 = w3 = w4 = 0.; - x_low = x_high = y_low = y_high = -1; - return; - } - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - y_low = (int)y; - x_low = (int)x; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (T)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (T)x_low; - } else { - x_high = x_low + 1; - } - - T ly = y - y_low; - T lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - - // reference in forward - // T v1 = bottom_data[y_low * width + x_low]; - // T v2 = bottom_data[y_low * width + x_high]; - // T v3 = bottom_data[y_high * width + x_low]; - // T v4 = bottom_data[y_high * width + x_high]; - // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - return; -} - -template -__global__ void RoIAlignBackwardFeatureV2( - const int nthreads, const T* top_diff, const int num_rois, - const T spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, T* bottom_diff, const T* bottom_rois, - bool aligned) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const T* offset_bottom_rois = bottom_rois + n * 5; - int roi_batch_ind = offset_bottom_rois[0]; - - // Do not use rounding; this implementation detail is critical - T offset = aligned ? (T)0.5 : (T)0.0; - T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; - T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; - T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; - T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; - - T roi_width = roi_end_w - roi_start_w; - T roi_height = roi_end_h - roi_start_h; - if (!aligned) { // for backward-compatibility only - roi_width = max(roi_width, (T)1.); - roi_height = max(roi_height, (T)1.); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_bottom_diff = - bottom_diff + (roi_batch_ind * channels + c) * height * width; - - int top_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_top_diff = top_diff + top_offset; - const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 - - for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 - { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - - T w1, w2, w3, w4; - int x_low, x_high, y_low, y_high; - - bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, - x_low, x_high, y_low, y_high, index); - - T g1 = top_diff_this_bin * w1 / count; - T g2 = top_diff_this_bin * w2 / count; - T g3 = top_diff_this_bin * w3 / count; - T g4 = top_diff_this_bin * w4 / count; - - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - atomicAdd(offset_bottom_diff + y_low * width + x_low, - static_cast(g1)); - atomicAdd(offset_bottom_diff + y_low * width + x_high, - static_cast(g2)); - atomicAdd(offset_bottom_diff + y_high * width + x_low, - static_cast(g3)); - atomicAdd(offset_bottom_diff + y_high * width + x_high, - static_cast(g4)); - } // if - } // ix - } // iy - } // CUDA_1D_KERNEL_LOOP -} // RoIAlignBackward - -at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input, - const at::Tensor& rois, - const float spatial_scale, - const int pooled_height, - const int pooled_width, - const int sampling_ratio, bool aligned) { - AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor"); - AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); - at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; - - at::CheckedFrom c = "ROIAlign_forward_cuda"; - at::checkAllSameGPU(c, {input_t, rois_t}); - at::checkAllSameType(c, {input_t, rois_t}); - at::cuda::CUDAGuard device_guard(input.device()); - - auto num_rois = rois.size(0); - auto channels = input.size(1); - auto height = input.size(2); - auto width = input.size(3); - - auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, - input.options()); - auto output_size = num_rois * pooled_height * pooled_width * channels; - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast(output_size), static_cast(512)), static_cast(4096))); - dim3 block(512); - - if (output.numel() == 0) { - AT_CUDA_CHECK(cudaGetLastError()); - return output; - } - - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { - RoIAlignForwardV2<<>>( - output_size, input.contiguous().data_ptr(), spatial_scale, - channels, height, width, pooled_height, pooled_width, sampling_ratio, - rois.contiguous().data_ptr(), output.data_ptr(), aligned); - }); - cudaDeviceSynchronize(); - AT_CUDA_CHECK(cudaGetLastError()); - return output; -} - -// TODO remove the dependency on input and use instead its sizes -> save memory -at::Tensor ROIAlignBackwardV2Laucher( - const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, - const int pooled_height, const int pooled_width, const int batch_size, - const int channels, const int height, const int width, - const int sampling_ratio, bool aligned) { - AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor"); - AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); - - at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; - at::CheckedFrom c = "ROIAlign_backward_cuda"; - at::checkAllSameGPU(c, {grad_t, rois_t}); - at::checkAllSameType(c, {grad_t, rois_t}); - at::cuda::CUDAGuard device_guard(grad.device()); - - auto num_rois = rois.size(0); - auto 
grad_input = - at::zeros({batch_size, channels, height, width}, grad.options()); - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast(grad.numel()), static_cast(512)), static_cast(4096))); - dim3 block(512); - - // handle possibly empty gradients - if (grad.numel() == 0) { - AT_CUDA_CHECK(cudaGetLastError()); - return grad_input; - } - - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] { - RoIAlignBackwardFeatureV2<<>>( - grad.numel(), grad.contiguous().data_ptr(), num_rois, - spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, grad_input.data_ptr(), - rois.contiguous().data_ptr(), aligned); - }); - AT_CUDA_CHECK(cudaGetLastError()); - return grad_input; -} diff --git a/mmdet/ops/roi_align/src/roi_align_ext.cpp b/mmdet/ops/roi_align/src/roi_align_ext.cpp deleted file mode 100644 index 18add01bba2..00000000000 --- a/mmdet/ops/roi_align/src/roi_align_ext.cpp +++ /dev/null @@ -1,168 +0,0 @@ -#include -#include - -#include -#include - -#ifdef WITH_CUDA -int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, - const float spatial_scale, const int sample_num, - const int channels, const int height, - const int width, const int num_rois, - const int pooled_height, const int pooled_width, - at::Tensor output); - -int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, - const float spatial_scale, const int sample_num, - const int channels, const int height, - const int width, const int num_rois, - const int pooled_height, const int pooled_width, - at::Tensor bottom_grad); - -at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input, - const at::Tensor& rois, - const float spatial_scale, - const int pooled_height, - const int pooled_width, - const int sampling_ratio, bool aligned); - -at::Tensor ROIAlignBackwardV2Laucher( - const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, - const int pooled_height, const int pooled_width, const int batch_size, - const int channels, const int height, const int width, - const int sampling_ratio, bool aligned); -#endif - -at::Tensor ROIAlignForwardV2CPULaucher(const at::Tensor& input, - const at::Tensor& rois, - const float spatial_scale, - const int pooled_height, - const int pooled_width, - const int sampling_ratio, bool aligned); - -at::Tensor ROIAlignBackwardV2CPULaucher( - const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, - const int pooled_height, const int pooled_width, const int batch_size, - const int channels, const int height, const int width, - const int sampling_ratio, bool aligned); - -#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height, - int pooled_width, float spatial_scale, int sample_num, - at::Tensor output) { - if (features.device().is_cuda()) { -#ifdef WITH_CUDA - CHECK_INPUT(features); - CHECK_INPUT(rois); - CHECK_INPUT(output); - at::DeviceGuard guard(features.device()); - - // Number of ROIs - int num_rois = rois.size(0); - int size_rois = rois.size(1); - - if (size_rois != 5) { - printf("wrong roi size\n"); - return 0; - } - - int num_channels = features.size(1); - int data_height = features.size(2); - int data_width = features.size(3); - - 
ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num, - num_channels, data_height, data_width, num_rois, - pooled_height, pooled_width, output); - - return 1; -#else - AT_ERROR("ROIAlign is not compiled with GPU support"); -#endif - } - AT_ERROR("ROIAlign is not implemented on CPU"); -} - -int ROIAlign_backwardV1(at::Tensor top_grad, at::Tensor rois, int pooled_height, - int pooled_width, float spatial_scale, int sample_num, - at::Tensor bottom_grad) { - if (top_grad.device().is_cuda()) { -#ifdef WITH_CUDA - CHECK_INPUT(top_grad); - CHECK_INPUT(rois); - CHECK_INPUT(bottom_grad); - at::DeviceGuard guard(top_grad.device()); - - // Number of ROIs - int num_rois = rois.size(0); - int size_rois = rois.size(1); - if (size_rois != 5) { - printf("wrong roi size\n"); - return 0; - } - - int num_channels = bottom_grad.size(1); - int data_height = bottom_grad.size(2); - int data_width = bottom_grad.size(3); - - ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num, - num_channels, data_height, data_width, num_rois, - pooled_height, pooled_width, bottom_grad); - - return 1; -#else - AT_ERROR("ROIAlign is not compiled with GPU support"); -#endif - } - AT_ERROR("ROIAlign is not implemented on CPU"); -} - -// Interface for Python -inline at::Tensor ROIAlign_forwardV2(const at::Tensor& input, - const at::Tensor& rois, - const float spatial_scale, - const int pooled_height, - const int pooled_width, - const int sampling_ratio, bool aligned) { - if (input.device().is_cuda()) { -#ifdef WITH_CUDA - return ROIAlignForwardV2Laucher(input, rois, spatial_scale, pooled_height, - pooled_width, sampling_ratio, aligned); -#else - AT_ERROR("ROIAlignV2 is not compiled with GPU support"); -#endif - } - return ROIAlignForwardV2CPULaucher(input, rois, spatial_scale, pooled_height, - pooled_width, sampling_ratio, aligned); -} - -inline at::Tensor ROIAlign_backwardV2( - const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, - const int pooled_height, const int pooled_width, const int batch_size, - const int channels, const int height, const int width, - const int sampling_ratio, bool aligned) { - if (grad.device().is_cuda()) { -#ifdef WITH_CUDA - return ROIAlignBackwardV2Laucher(grad, rois, spatial_scale, pooled_height, - pooled_width, batch_size, channels, height, - width, sampling_ratio, aligned); -#else - AT_ERROR("ROIAlignV2 is not compiled with GPU support"); -#endif - } - return ROIAlignBackwardV2CPULaucher(grad, rois, spatial_scale, pooled_height, - pooled_width, batch_size, channels, - height, width, sampling_ratio, aligned); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward_v1", &ROIAlign_forwardV1, "Roi_Align V1 forward"); - m.def("backward_v1", &ROIAlign_backwardV1, "Roi_Align V1 backward"); - m.def("forward_v2", &ROIAlign_forwardV2, "Roi_Align V2 forward"); - m.def("backward_v2", &ROIAlign_backwardV2, "Roi_Align V2 backward"); -} diff --git a/mmdet/ops/roi_pool/__init__.py b/mmdet/ops/roi_pool/__init__.py deleted file mode 100644 index 9f0474e5939..00000000000 --- a/mmdet/ops/roi_pool/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .roi_pool import RoIPool, roi_pool - -__all__ = ['roi_pool', 'RoIPool'] diff --git a/mmdet/ops/roi_pool/gradcheck.py b/mmdet/ops/roi_pool/gradcheck.py deleted file mode 100644 index d11af790241..00000000000 --- a/mmdet/ops/roi_pool/gradcheck.py +++ /dev/null @@ -1,16 +0,0 @@ -import os.path as osp -import sys - -import torch -from torch.autograd import gradcheck - -sys.path.append(osp.abspath(osp.join(__file__, '../../'))) 
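The deleted gradcheck scripts here and under roi_align/ numerically verify the custom autograd Functions against finite differences. An equivalent check can still be run against the torchvision op that replaces them; the sketch below is an editorial illustration only, with arbitrary sizes, using double precision on CPU to keep gradcheck's finite differences stable.

import torch
from torch.autograd import gradcheck
from torchvision.ops import RoIAlign

feat = torch.randn(2, 4, 15, 15, dtype=torch.double, requires_grad=True)
rois = torch.tensor([[0., 1., 1., 10., 10.],
                     [1., 2., 3., 12., 14.]], dtype=torch.double)
op = RoIAlign(output_size=3, spatial_scale=1.0 / 8, sampling_ratio=2)
print('Gradcheck for roi align...')
print(gradcheck(op, (feat, rois), eps=1e-5, atol=1e-3))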
-from roi_pool import RoIPool # noqa: E402, isort:skip - -feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda() -rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55], - [1, 67, 40, 110, 120]]).cuda() -inputs = (feat, rois) -print('Gradcheck for roi pooling...') -test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3) -print(test) diff --git a/mmdet/ops/roi_pool/roi_pool.py b/mmdet/ops/roi_pool/roi_pool.py deleted file mode 100644 index 13c2708b333..00000000000 --- a/mmdet/ops/roi_pool/roi_pool.py +++ /dev/null @@ -1,75 +0,0 @@ -import torch -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable -from torch.nn.modules.utils import _pair - -from . import roi_pool_ext - - -class RoIPoolFunction(Function): - - @staticmethod - def forward(ctx, features, rois, out_size, spatial_scale): - assert features.is_cuda - out_h, out_w = _pair(out_size) - assert isinstance(out_h, int) and isinstance(out_w, int) - ctx.save_for_backward(rois) - num_channels = features.size(1) - num_rois = rois.size(0) - out_size = (num_rois, num_channels, out_h, out_w) - output = features.new_zeros(out_size) - argmax = features.new_zeros(out_size, dtype=torch.int) - roi_pool_ext.forward(features, rois, out_h, out_w, spatial_scale, - output, argmax) - ctx.spatial_scale = spatial_scale - ctx.feature_size = features.size() - ctx.argmax = argmax - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - assert grad_output.is_cuda - spatial_scale = ctx.spatial_scale - feature_size = ctx.feature_size - argmax = ctx.argmax - rois = ctx.saved_tensors[0] - assert feature_size is not None - - grad_input = grad_rois = None - if ctx.needs_input_grad[0]: - grad_input = grad_output.new_zeros(feature_size) - roi_pool_ext.backward(grad_output.contiguous(), rois, argmax, - spatial_scale, grad_input) - - return grad_input, grad_rois, None, None - - -roi_pool = RoIPoolFunction.apply - - -class RoIPool(nn.Module): - - def __init__(self, out_size, spatial_scale, use_torchvision=False): - super(RoIPool, self).__init__() - - self.out_size = _pair(out_size) - self.spatial_scale = float(spatial_scale) - self.use_torchvision = use_torchvision - - def forward(self, features, rois): - if self.use_torchvision: - from torchvision.ops import roi_pool as tv_roi_pool - return tv_roi_pool(features, rois, self.out_size, - self.spatial_scale) - else: - return roi_pool(features, rois, self.out_size, self.spatial_scale) - - def __repr__(self): - format_str = self.__class__.__name__ - format_str += f'(out_size={self.out_size}, ' - format_str += f'spatial_scale={self.spatial_scale}, ' - format_str += f'use_torchvision={self.use_torchvision})' - return format_str diff --git a/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu b/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu deleted file mode 100644 index 88fab97fbb4..00000000000 --- a/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu +++ /dev/null @@ -1,151 +0,0 @@ -#include -#include -#include - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - -#define THREADS_PER_BLOCK 1024 - -inline int GET_BLOCKS(const int N) { - int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - int max_block_num = 65000; - return min(optimal_block_num, max_block_num); -} - -template -__global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data, - const scalar_t *rois, - const scalar_t spatial_scale, 
const int channels, - const int height, const int width, - const int pooled_h, const int pooled_w, - scalar_t *top_data, int *argmax_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - // (n, c, ph, pw) is an element in the pooled output - int pw = index % pooled_w; - int ph = (index / pooled_w) % pooled_h; - int c = (index / pooled_w / pooled_h) % channels; - int n = index / pooled_w / pooled_h / channels; - - const scalar_t *offset_rois = rois + n * 5; - int roi_batch_ind = offset_rois[0]; - // calculate the roi region on feature maps - scalar_t roi_x1 = offset_rois[1] * spatial_scale; - scalar_t roi_y1 = offset_rois[2] * spatial_scale; - scalar_t roi_x2 = (offset_rois[3] + 1) * spatial_scale; - scalar_t roi_y2 = (offset_rois[4] + 1) * spatial_scale; - - // force malformed rois to be 1x1 - scalar_t roi_w = roi_x2 - roi_x1; - scalar_t roi_h = roi_y2 - roi_y1; - if (roi_w <= 0 || roi_h <= 0) continue; - - scalar_t bin_size_w = roi_w / static_cast(pooled_w); - scalar_t bin_size_h = roi_h / static_cast(pooled_h); - - // the corresponding bin region - int bin_x1 = floor(static_cast(pw) * bin_size_w + roi_x1); - int bin_y1 = floor(static_cast(ph) * bin_size_h + roi_y1); - int bin_x2 = ceil(static_cast(pw + 1) * bin_size_w + roi_x1); - int bin_y2 = ceil(static_cast(ph + 1) * bin_size_h + roi_y1); - - // add roi offsets and clip to input boundaries - bin_x1 = min(max(bin_x1, 0), width); - bin_y1 = min(max(bin_y1, 0), height); - bin_x2 = min(max(bin_x2, 0), width); - bin_y2 = min(max(bin_y2, 0), height); - bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1); - - // If nothing is pooled, argmax = -1 causes nothing to be backprop'd - int max_idx = -1; - bottom_data += (roi_batch_ind * channels + c) * height * width; - - // Define an empty pooling region to be zero - scalar_t max_val = is_empty ? 
static_cast(0) - : bottom_data[bin_y1 * width + bin_x1] - 1; - - for (int h = bin_y1; h < bin_y2; ++h) { - for (int w = bin_x1; w < bin_x2; ++w) { - int offset = h * width + w; - if (bottom_data[offset] > max_val) { - max_val = bottom_data[offset]; - max_idx = offset; - } - } - } - top_data[index] = max_val; - if (argmax_data != NULL) argmax_data[index] = max_idx; - } -} - -int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois, - const float spatial_scale, const int channels, - const int height, const int width, const int num_rois, - const int pooled_h, const int pooled_w, - at::Tensor output, at::Tensor argmax) { - const int output_size = num_rois * channels * pooled_h * pooled_w; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.scalar_type(), "ROIPoolLaucherForward", ([&] { - const scalar_t *bottom_data = features.data_ptr(); - const scalar_t *rois_data = rois.data_ptr(); - scalar_t *top_data = output.data_ptr(); - int *argmax_data = argmax.data_ptr(); - - ROIPoolForward<<>>( - output_size, bottom_data, rois_data, scalar_t(spatial_scale), - channels, height, width, pooled_h, pooled_w, top_data, argmax_data); - })); - THCudaCheck(cudaGetLastError()); - return 1; -} -template -__global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff, - const scalar_t *rois, const int *argmax_data, - const scalar_t spatial_scale, - const int channels, const int height, - const int width, const int pooled_h, - const int pooled_w, scalar_t *bottom_diff) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { - int pw = index % pooled_w; - int ph = (index / pooled_w) % pooled_h; - int c = (index / pooled_w / pooled_h) % channels; - int n = index / pooled_w / pooled_h / channels; - int roi_batch_ind = rois[n * 5]; - int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w + - ph * pooled_w + pw]; - if (bottom_index != -1) { - atomicAdd(bottom_diff + (roi_batch_ind * channels + c) * height * width + - bottom_index, - top_diff[index]); - } - } -} -int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, - const at::Tensor argmax, const float spatial_scale, - const int batch_size, const int channels, - const int height, const int width, - const int num_rois, const int pooled_h, - const int pooled_w, at::Tensor bottom_grad) { - const int output_size = num_rois * pooled_h * pooled_w * channels; - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.scalar_type(), "ROIPoolLaucherBackward", ([&] { - const scalar_t *top_diff = top_grad.data_ptr(); - const scalar_t *rois_data = rois.data_ptr(); - const int *argmax_data = argmax.data_ptr(); - scalar_t *bottom_diff = bottom_grad.data_ptr(); - if (sizeof(scalar_t) == sizeof(double)) { - fprintf(stderr, "double is not supported\n"); - exit(-1); - } - ROIPoolBackward<<>>( - output_size, top_diff, rois_data, argmax_data, - scalar_t(spatial_scale), channels, height, width, pooled_h, - pooled_w, bottom_diff); - })); - THCudaCheck(cudaGetLastError()); - return 1; -} diff --git a/mmdet/ops/roi_pool/src/roi_pool_ext.cpp b/mmdet/ops/roi_pool/src/roi_pool_ext.cpp deleted file mode 100644 index 27d6b8a5d07..00000000000 --- a/mmdet/ops/roi_pool/src/roi_pool_ext.cpp +++ /dev/null @@ -1,104 +0,0 @@ -#include - -#include -#include - -#ifdef WITH_CUDA -int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois, - const float spatial_scale, const int channels, - const int height, const int width, const int num_rois, - const int pooled_h, const int pooled_w, - at::Tensor output, at::Tensor argmax); - -int 
ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, - const at::Tensor argmax, const float spatial_scale, - const int batch_size, const int channels, - const int height, const int width, - const int num_rois, const int pooled_h, - const int pooled_w, at::Tensor bottom_grad); -#endif - -#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ - CHECK_CONTIGUOUS(x) - -int roi_pooling_forward(at::Tensor features, at::Tensor rois, - int pooled_height, int pooled_width, - float spatial_scale, at::Tensor output, - at::Tensor argmax) { - if (features.device().is_cuda()) { -#ifdef WITH_CUDA - CHECK_INPUT(features); - CHECK_INPUT(rois); - CHECK_INPUT(output); - CHECK_INPUT(argmax); - at::DeviceGuard guard(features.device()); - - // Number of ROIs - int num_rois = rois.size(0); - int size_rois = rois.size(1); - - if (size_rois != 5) { - printf("wrong roi size\n"); - return 0; - } - - int channels = features.size(1); - int height = features.size(2); - int width = features.size(3); - - ROIPoolForwardLaucher(features, rois, spatial_scale, channels, height, width, - num_rois, pooled_height, pooled_width, output, argmax); - - return 1; -#else - AT_ERROR("roi_pool is not compiled with GPU support"); -#endif - } - AT_ERROR("roi_pool is not implemented on CPU"); -} - -int roi_pooling_backward(at::Tensor top_grad, at::Tensor rois, - at::Tensor argmax, float spatial_scale, - at::Tensor bottom_grad) { - if (top_grad.device().is_cuda()) { -#ifdef WITH_CUDA - CHECK_INPUT(top_grad); - CHECK_INPUT(rois); - CHECK_INPUT(argmax); - CHECK_INPUT(bottom_grad); - at::DeviceGuard guard(top_grad.device()); - - int pooled_height = top_grad.size(2); - int pooled_width = top_grad.size(3); - int num_rois = rois.size(0); - int size_rois = rois.size(1); - - if (size_rois != 5) { - printf("wrong roi size\n"); - return 0; - } - int batch_size = bottom_grad.size(0); - int channels = bottom_grad.size(1); - int height = bottom_grad.size(2); - int width = bottom_grad.size(3); - - ROIPoolBackwardLaucher(top_grad, rois, argmax, spatial_scale, batch_size, - channels, height, width, num_rois, pooled_height, - pooled_width, bottom_grad); - - return 1; -#else - AT_ERROR("roi_pool is not compiled with GPU support"); -#endif - } - AT_ERROR("roi_pool is not implemented on CPU"); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &roi_pooling_forward, "Roi_Pooling forward"); - m.def("backward", &roi_pooling_backward, "Roi_Pooling backward"); -} diff --git a/mmdet/ops/saconv.py b/mmdet/ops/saconv.py deleted file mode 100644 index fb35be67e0e..00000000000 --- a/mmdet/ops/saconv.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import CONV_LAYERS, constant_init - -from .conv_ws import ConvAWS2d -from .dcn import deform_conv - - -@CONV_LAYERS.register_module(name='SAC') -class SAConv2d(ConvAWS2d): - """SAC (Switchable Atrous Convolution) - - This is an implementation of SAC in DetectoRS - (https://arxiv.org/pdf/2006.02334.pdf). - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the convolution - kernel_size (int or tuple): Size of the convolving kernel - stride (int or tuple, optional): Stride of the convolution. Default: 1 - padding (int or tuple, optional): Zero-padding added to both sides of - the input. 
Default: 0 - padding_mode (string, optional): ``'zeros'``, ``'reflect'``, - ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` - dilation (int or tuple, optional): Spacing between kernel elements. - Default: 1 - groups (int, optional): Number of blocked connections from input - channels to output channels. Default: 1 - bias (bool, optional): If ``True``, adds a learnable bias to the - output. Default: ``True`` - use_deform: If ``True``, replace convolution with deformable - convolution. Default: ``False``. - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - use_deform=False): - super().__init__( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias=bias) - self.use_deform = use_deform - self.switch = nn.Conv2d( - self.in_channels, 1, kernel_size=1, stride=stride, bias=True) - self.weight_diff = nn.Parameter(torch.Tensor(self.weight.size())) - self.pre_context = nn.Conv2d( - self.in_channels, self.in_channels, kernel_size=1, bias=True) - self.post_context = nn.Conv2d( - self.out_channels, self.out_channels, kernel_size=1, bias=True) - if self.use_deform: - self.offset_s = nn.Conv2d( - self.in_channels, - 18, - kernel_size=3, - padding=1, - stride=stride, - bias=True) - self.offset_l = nn.Conv2d( - self.in_channels, - 18, - kernel_size=3, - padding=1, - stride=stride, - bias=True) - self.init_weights() - - def init_weights(self): - constant_init(self.switch, 0, bias=1) - self.weight_diff.data.zero_() - constant_init(self.pre_context, 0) - constant_init(self.post_context, 0) - if self.use_deform: - constant_init(self.offset_s, 0) - constant_init(self.offset_l, 0) - - def forward(self, x): - # pre-context - avg_x = F.adaptive_avg_pool2d(x, output_size=1) - avg_x = self.pre_context(avg_x) - avg_x = avg_x.expand_as(x) - x = x + avg_x - # switch - avg_x = F.pad(x, pad=(2, 2, 2, 2), mode='reflect') - avg_x = F.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0) - switch = self.switch(avg_x) - # sac - weight = self._get_weight(self.weight) - if self.use_deform: - offset = self.offset_s(avg_x) - out_s = deform_conv(x, offset, weight, self.stride, self.padding, - self.dilation, self.groups, 1) - else: - out_s = super().conv2d_forward(x, weight) - ori_p = self.padding - ori_d = self.dilation - self.padding = tuple(3 * p for p in self.padding) - self.dilation = tuple(3 * d for d in self.dilation) - weight = weight + self.weight_diff - if self.use_deform: - offset = self.offset_l(avg_x) - out_l = deform_conv(x, offset, weight, self.stride, self.padding, - self.dilation, self.groups, 1) - else: - out_l = super().conv2d_forward(x, weight) - out = switch * out_s + (1 - switch) * out_l - self.padding = ori_p - self.dilation = ori_d - # post-context - avg_x = F.adaptive_avg_pool2d(out, output_size=1) - avg_x = self.post_context(avg_x) - avg_x = avg_x.expand_as(out) - out = out + avg_x - return out diff --git a/mmdet/ops/sigmoid_focal_loss/__init__.py b/mmdet/ops/sigmoid_focal_loss/__init__.py deleted file mode 100644 index 218032945b2..00000000000 --- a/mmdet/ops/sigmoid_focal_loss/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .sigmoid_focal_loss import SigmoidFocalLoss, sigmoid_focal_loss - -__all__ = ['SigmoidFocalLoss', 'sigmoid_focal_loss'] diff --git a/mmdet/ops/sigmoid_focal_loss/sigmoid_focal_loss.py b/mmdet/ops/sigmoid_focal_loss/sigmoid_focal_loss.py deleted file mode 100644 index 0715af38e1e..00000000000 --- 
a/mmdet/ops/sigmoid_focal_loss/sigmoid_focal_loss.py +++ /dev/null @@ -1,54 +0,0 @@ -import torch.nn as nn -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -from . import sigmoid_focal_loss_ext - - -class SigmoidFocalLossFunction(Function): - - @staticmethod - def forward(ctx, input, target, gamma=2.0, alpha=0.25): - ctx.save_for_backward(input, target) - num_classes = input.shape[1] - ctx.num_classes = num_classes - ctx.gamma = gamma - ctx.alpha = alpha - - loss = sigmoid_focal_loss_ext.forward(input, target, num_classes, - gamma, alpha) - return loss - - @staticmethod - @once_differentiable - def backward(ctx, d_loss): - input, target = ctx.saved_tensors - num_classes = ctx.num_classes - gamma = ctx.gamma - alpha = ctx.alpha - d_loss = d_loss.contiguous() - d_input = sigmoid_focal_loss_ext.backward(input, target, d_loss, - num_classes, gamma, alpha) - return d_input, None, None, None, None - - -sigmoid_focal_loss = SigmoidFocalLossFunction.apply - - -# TODO: remove this module -class SigmoidFocalLoss(nn.Module): - - def __init__(self, gamma, alpha): - super(SigmoidFocalLoss, self).__init__() - self.gamma = gamma - self.alpha = alpha - - def forward(self, logits, targets): - assert logits.is_cuda - loss = sigmoid_focal_loss(logits, targets, self.gamma, self.alpha) - return loss.sum() - - def __repr__(self): - tmpstr = self.__class__.__name__ - tmpstr += f'(gamma={self.gamma}, alpha={self.alpha})' - return tmpstr diff --git a/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu b/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu deleted file mode 100644 index 012d01c26b1..00000000000 --- a/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu +++ /dev/null @@ -1,175 +0,0 @@ -// modified from -// https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu - -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -// This file is modified from -// https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu -// Cheng-Yang Fu -// cyfu@cs.unc.edu -#include -#include - -#include -#include -#include - -#include - -// TODO make it in a common file -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ - i += blockDim.x * gridDim.x) - -template -__global__ void SigmoidFocalLossForward(const int nthreads, - const scalar_t *logits, - const int64_t *targets, - const int num_classes, - const float gamma, const float alpha, - const int num, scalar_t *losses) { - CUDA_1D_KERNEL_LOOP(i, nthreads) { - int n = i / num_classes; - int d = i % num_classes; // current class[0~79]; - int t = targets[n]; // target class [0~79]; - - // Decide it is positive or negative case. - scalar_t c1 = (t == d); - scalar_t c2 = (t >= 0 & t != d); - - scalar_t zn = (1.0 - alpha); - scalar_t zp = (alpha); - - // p = 1. / 1. + expf(-x); p = sigmoid(x) - scalar_t p = 1. / (1. + expf(-logits[i])); - - // (1-p)**gamma * log(p) where - scalar_t term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); - - // p**gamma * log(1-p) - scalar_t term2 = - powf(p, gamma) * - (-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. 
* logits[i] * (logits[i] >= 0)))); - - losses[i] = 0.0; - losses[i] += -c1 * term1 * zp; - losses[i] += -c2 * term2 * zn; - - } // CUDA_1D_KERNEL_LOOP -} // SigmoidFocalLossForward - -template -__global__ void SigmoidFocalLossBackward( - const int nthreads, const scalar_t *logits, const int64_t *targets, - const scalar_t *d_losses, const int num_classes, const float gamma, - const float alpha, const int num, scalar_t *d_logits) { - CUDA_1D_KERNEL_LOOP(i, nthreads) { - int n = i / num_classes; - int d = i % num_classes; // current class[0~79]; - int t = targets[n]; // target class [0~79], 80 is background; - - // Decide it is positive or negative case. - scalar_t c1 = (t == d); - scalar_t c2 = (t >= 0 & t != d); - - scalar_t zn = (1.0 - alpha); - scalar_t zp = (alpha); - // p = 1. / 1. + expf(-x); p = sigmoid(x) - scalar_t p = 1. / (1. + expf(-logits[i])); - - // (1-p)**g * (1 - p - g*p*log(p) - scalar_t term1 = - powf((1. - p), gamma) * (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); - - // (p**g) * (g*(1-p)*log(1-p) - p) - scalar_t term2 = - powf(p, gamma) * - ((-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * - (1. - p) * gamma - - p); - d_logits[i] = 0.0; - d_logits[i] += -c1 * term1 * zp; - d_logits[i] += -c2 * term2 * zn; - d_logits[i] = d_logits[i] * d_losses[i]; - - } // CUDA_1D_KERNEL_LOOP -} // SigmoidFocalLossBackward - -at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, - const at::Tensor &targets, - const int num_classes, - const float gamma, const float alpha) { - AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor"); - AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor"); - AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); - AT_ASSERTM(targets.max().item() <= (long)num_classes, - "target label should smaller or equal than num classes"); - - const int num_samples = logits.size(0); - - auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); - auto losses_size = num_samples * logits.size(1); - - dim3 grid( - std::min(THCCeilDiv((int64_t)losses_size, (int64_t)512), (int64_t)4096)); - dim3 block(512); - - if (losses.numel() == 0) { - THCudaCheck(cudaGetLastError()); - return losses; - } - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - logits.scalar_type(), "SigmoidFocalLoss_forward", [&] { - SigmoidFocalLossForward - <<>>( - losses_size, logits.contiguous().data_ptr(), - targets.contiguous().data_ptr(), num_classes, gamma, - alpha, num_samples, losses.data_ptr()); - }); - THCudaCheck(cudaGetLastError()); - return losses; -} - -at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, - const at::Tensor &targets, - const at::Tensor &d_losses, - const int num_classes, - const float gamma, - const float alpha) { - AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor"); - AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor"); - AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor"); - - AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); - - const int num_samples = logits.size(0); - AT_ASSERTM(logits.size(1) == num_classes, - "logits.size(1) should be num_classes"); - - auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); - auto d_logits_size = num_samples * logits.size(1); - - dim3 grid(std::min(THCCeilDiv((int64_t)d_logits_size, (int64_t)512), - (int64_t)4096)); - dim3 block(512); - - if (d_logits.numel() == 0) { - THCudaCheck(cudaGetLastError()); - return d_logits; - } 
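The focal-loss kernels being dropped here likewise have a counterpart in mmcv-full. The snippet below is only a sketch: it assumes `sigmoid_focal_loss` from `mmcv.ops`, with the same `gamma`/`alpha` semantics as the CUDA code above, is the replacement, and its exact signature should be verified against the installed mmcv version.

```python
import torch
from mmcv.ops import sigmoid_focal_loss

logits = torch.randn(8, 80, device='cuda', requires_grad=True)  # (N, num_classes)
targets = torch.randint(0, 80, (8, ), device='cuda')            # int64 class indices

# gamma/alpha defaults match the deleted SigmoidFocalLoss wrapper.
loss = sigmoid_focal_loss(logits, targets, gamma=2.0, alpha=0.25)
loss.backward()
```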
- - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - logits.scalar_type(), "SigmoidFocalLoss_backward", [&] { - SigmoidFocalLossBackward - <<>>( - d_logits_size, logits.contiguous().data_ptr(), - targets.contiguous().data_ptr(), - d_losses.contiguous().data_ptr(), num_classes, gamma, - alpha, num_samples, d_logits.data_ptr()); - }); - - THCudaCheck(cudaGetLastError()); - return d_logits; -} diff --git a/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp b/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp deleted file mode 100644 index 3d66f3f8ff8..00000000000 --- a/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// modify from -// https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h -#include - -#ifdef WITH_CUDA -at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, - const at::Tensor &targets, - const int num_classes, - const float gamma, const float alpha); - -at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, - const at::Tensor &targets, - const at::Tensor &d_losses, - const int num_classes, - const float gamma, const float alpha); -#endif - -// Interface for Python -at::Tensor SigmoidFocalLoss_forward(const at::Tensor &logits, - const at::Tensor &targets, - const int num_classes, const float gamma, - const float alpha) { - if (logits.device().is_cuda()) { -#ifdef WITH_CUDA - at::DeviceGuard guard(logits.device()); - return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, - alpha); -#else - AT_ERROR("SigmoidFocalLoss is not compiled with GPU support"); -#endif - } - AT_ERROR("SigmoidFocalLoss is not implemented on the CPU"); -} - -at::Tensor SigmoidFocalLoss_backward(const at::Tensor &logits, - const at::Tensor &targets, - const at::Tensor &d_losses, - const int num_classes, const float gamma, - const float alpha) { - if (logits.device().is_cuda()) { -#ifdef WITH_CUDA - at::DeviceGuard guard(logits.device()); - return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, - num_classes, gamma, alpha); -#else - AT_ERROR("SigmoidFocalLoss is not compiled with GPU support"); -#endif - } - AT_ERROR("SigmoidFocalLoss is not implemented on the CPU"); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &SigmoidFocalLoss_forward, - "SigmoidFocalLoss forward"); - m.def("backward", &SigmoidFocalLoss_backward, - "SigmoidFocalLoss backward"); -} diff --git a/mmdet/ops/utils/__init__.py b/mmdet/ops/utils/__init__.py deleted file mode 100644 index 0244c0f5470..00000000000 --- a/mmdet/ops/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# from . 
import compiling_info -from .compiling_info import get_compiler_version, get_compiling_cuda_version - -# get_compiler_version = compiling_info.get_compiler_version -# get_compiling_cuda_version = compiling_info.get_compiling_cuda_version - -__all__ = ['get_compiler_version', 'get_compiling_cuda_version'] diff --git a/mmdet/ops/utils/src/compiling_info.cpp b/mmdet/ops/utils/src/compiling_info.cpp deleted file mode 100644 index a671805aaf0..00000000000 --- a/mmdet/ops/utils/src/compiling_info.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// modified from -// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp -#include - -#ifdef WITH_CUDA -#include -int get_cudart_version() { return CUDART_VERSION; } -#endif - -std::string get_compiling_cuda_version() { -#ifdef WITH_CUDA - std::ostringstream oss; - - // copied from - // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 - auto printCudaStyleVersion = [&](int v) { - oss << (v / 1000) << "." << (v / 10 % 100); - if (v % 10 != 0) { - oss << "." << (v % 10); - } - }; - printCudaStyleVersion(get_cudart_version()); - return oss.str(); -#else - return std::string("not available"); -#endif -} - -// similar to -// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp -std::string get_compiler_version() { - std::ostringstream ss; -#if defined(__GNUC__) -#ifndef __clang__ - { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } -#endif -#endif - -#if defined(__clang_major__) - { - ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." - << __clang_patchlevel__; - } -#endif - -#if defined(_MSC_VER) - { ss << "MSVC " << _MSC_FULL_VER; } -#endif - return ss.str(); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("get_compiler_version", &get_compiler_version, "get_compiler_version"); - m.def("get_compiling_cuda_version", &get_compiling_cuda_version, - "get_compiling_cuda_version"); -} diff --git a/mmdet/ops/wrappers.py b/mmdet/ops/wrappers.py deleted file mode 100644 index 10109936fc0..00000000000 --- a/mmdet/ops/wrappers.py +++ /dev/null @@ -1,100 +0,0 @@ -r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 - -Wrap some nn modules to support empty tensor -input. Currently, these wrappers are mainly used in mask heads like -fcn_mask_head and maskiou_heads since mask heads are trained on only positive -RoIs. -""" -import math - -import torch -import torch.nn as nn -from mmcv.cnn import CONV_LAYERS -from torch.nn.modules.utils import _pair - - -class NewEmptyTensorOp(torch.autograd.Function): - - @staticmethod - def forward(ctx, x, new_shape): - ctx.shape = x.shape - return x.new_empty(new_shape) - - @staticmethod - def backward(ctx, grad): - shape = ctx.shape - return NewEmptyTensorOp.apply(grad, shape), None - - -@CONV_LAYERS.register_module(name='Conv', force=True) -class Conv2d(nn.Conv2d): - - def forward(self, x): - if x.numel() == 0 and torch.__version__ <= '1.4': - out_shape = [x.shape[0], self.out_channels] - for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, - self.padding, self.stride, self.dilation): - o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 - out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. 
- dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super().forward(x) - - -class ConvTranspose2d(nn.ConvTranspose2d): - - def forward(self, x): - if x.numel() == 0 and torch.__version__ <= '1.4.0': - out_shape = [x.shape[0], self.out_channels] - for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, - self.padding, self.stride, - self.dilation, self.output_padding): - out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super(ConvTranspose2d, self).forward(x) - - -class MaxPool2d(nn.MaxPool2d): - - def forward(self, x): - if x.numel() == 0 and torch.__version__ <= '1.4': - out_shape = list(x.shape[:2]) - for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), - _pair(self.padding), _pair(self.stride), - _pair(self.dilation)): - o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 - o = math.ceil(o) if self.ceil_mode else math.floor(o) - out_shape.append(o) - empty = NewEmptyTensorOp.apply(x, out_shape) - return empty - - return super().forward(x) - - -class Linear(torch.nn.Linear): - - def forward(self, x): - if x.numel() == 0: - out_shape = [x.shape[0], self.out_features] - empty = NewEmptyTensorOp.apply(x, out_shape) - if self.training: - # produce dummy gradient to avoid DDP warning. - dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 - return empty + dummy - else: - return empty - - return super().forward(x) diff --git a/mmdet/utils/collect_env.py b/mmdet/utils/collect_env.py index 053f2403b1d..7bd71a87274 100644 --- a/mmdet/utils/collect_env.py +++ b/mmdet/utils/collect_env.py @@ -53,7 +53,7 @@ def collect_env(): env_info['MMCV'] = mmcv.__version__ env_info['MMDetection'] = mmdet.__version__ - from mmdet.ops import get_compiler_version, get_compiling_cuda_version + from mmcv.ops import get_compiler_version, get_compiling_cuda_version env_info['MMDetection Compiler'] = get_compiler_version() env_info['MMDetection CUDA Compiler'] = get_compiling_cuda_version() return env_info diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 2a66ea699f1..8581152d49c 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,5 +1,5 @@ matplotlib -mmcv==0.6.2 +mmcv>=1.0.2 numpy # need older pillow until torchvision is fixed Pillow<=6.2.2 diff --git a/setup.py b/setup.py index 3724a29b3e7..f67b36beee2 100755 --- a/setup.py +++ b/setup.py @@ -70,7 +70,8 @@ def write_version_py(): sha = get_hash() with open('mmdet/VERSION', 'r') as f: SHORT_VERSION = f.read().strip() - VERSION_INFO = ', '.join(SHORT_VERSION.split('.')) + VERSION_INFO = ', '.join( + [x if x.isdigit() else f'"{x}"' for x in SHORT_VERSION.split('.')]) VERSION = SHORT_VERSION + '+' + sha version_file_str = content.format(time.asctime(), VERSION, SHORT_VERSION, @@ -199,7 +200,6 @@ def gen_packages_items(): keywords='computer vision, object detection', url='https://github.com/open-mmlab/mmdetection', packages=find_packages(exclude=('configs', 'tools', 'demo')), - package_data={'mmdet.ops': ['*/*.so']}, classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: Apache Software License', @@ -219,83 +219,6 @@ def gen_packages_items(): 'build': parse_requirements('requirements/build.txt'), 'optional': parse_requirements('requirements/optional.txt'), }, - ext_modules=[ - 
make_cuda_ext( - name='compiling_info', - module='mmdet.ops.utils', - sources=['src/compiling_info.cpp']), - make_cuda_ext( - name='nms_ext', - module='mmdet.ops.nms', - sources=['src/nms_ext.cpp', 'src/cpu/nms_cpu.cpp'], - sources_cuda=[ - 'src/cuda/nms_cuda.cpp', 'src/cuda/nms_kernel.cu' - ]), - make_cuda_ext( - name='roi_align_ext', - module='mmdet.ops.roi_align', - sources=[ - 'src/roi_align_ext.cpp', - 'src/cpu/roi_align_v2.cpp', - ], - sources_cuda=[ - 'src/cuda/roi_align_kernel.cu', - 'src/cuda/roi_align_kernel_v2.cu' - ]), - make_cuda_ext( - name='roi_pool_ext', - module='mmdet.ops.roi_pool', - sources=['src/roi_pool_ext.cpp'], - sources_cuda=['src/cuda/roi_pool_kernel.cu']), - make_cuda_ext( - name='deform_conv_ext', - module='mmdet.ops.dcn', - sources=['src/deform_conv_ext.cpp'], - sources_cuda=[ - 'src/cuda/deform_conv_cuda.cpp', - 'src/cuda/deform_conv_cuda_kernel.cu' - ]), - make_cuda_ext( - name='deform_pool_ext', - module='mmdet.ops.dcn', - sources=['src/deform_pool_ext.cpp'], - sources_cuda=[ - 'src/cuda/deform_pool_cuda.cpp', - 'src/cuda/deform_pool_cuda_kernel.cu' - ]), - make_cuda_ext( - name='sigmoid_focal_loss_ext', - module='mmdet.ops.sigmoid_focal_loss', - sources=['src/sigmoid_focal_loss_ext.cpp'], - sources_cuda=['src/cuda/sigmoid_focal_loss_cuda.cu']), - make_cuda_ext( - name='masked_conv2d_ext', - module='mmdet.ops.masked_conv', - sources=['src/masked_conv2d_ext.cpp'], - sources_cuda=[ - 'src/cuda/masked_conv2d_cuda.cpp', - 'src/cuda/masked_conv2d_kernel.cu' - ]), - make_cuda_ext( - name='carafe_ext', - module='mmdet.ops.carafe', - sources=['src/carafe_ext.cpp'], - sources_cuda=[ - 'src/cuda/carafe_cuda.cpp', - 'src/cuda/carafe_cuda_kernel.cu' - ]), - make_cuda_ext( - name='carafe_naive_ext', - module='mmdet.ops.carafe', - sources=['src/carafe_naive_ext.cpp'], - sources_cuda=[ - 'src/cuda/carafe_naive_cuda.cpp', - 'src/cuda/carafe_naive_cuda_kernel.cu' - ]), - make_cuda_ext( - name='corner_pool_ext', - module='mmdet.ops.corner_pool', - sources=['src/corner_pool.cpp']), - ], + ext_modules=[], cmdclass={'build_ext': BuildExtension}, zip_safe=False) diff --git a/tests/test_config.py b/tests/test_config.py index 91cbe81b3c2..bbcf4493206 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -274,8 +274,8 @@ def _check_roi_extractor(config, roi_extractor, prev_roi_extractor=None): assert (len(config.featmap_strides) == len(roi_extractor.roi_layers)) assert (config.out_channels == roi_extractor.out_channels) from torch.nn.modules.utils import _pair - assert (_pair( - config.roi_layer.out_size) == roi_extractor.roi_layers[0].out_size) + assert (_pair(config.roi_layer.output_size) == + roi_extractor.roi_layers[0].output_size) if 'use_torchvision' in config.roi_layer: assert (config.roi_layer.use_torchvision == diff --git a/tests/test_models/test_backbones.py b/tests/test_models/test_backbones.py index feb08858dcb..d8bf69e6596 100644 --- a/tests/test_models/test_backbones.py +++ b/tests/test_models/test_backbones.py @@ -1,5 +1,6 @@ import pytest import torch +from mmcv.ops import DeformConv2dPack from torch.nn.modules import AvgPool2d, GroupNorm from torch.nn.modules.batchnorm import _BatchNorm @@ -9,7 +10,6 @@ from mmdet.models.backbones.resnet import BasicBlock, Bottleneck from mmdet.models.backbones.resnext import Bottleneck as BottleneckX from mmdet.models.utils import ResLayer -from mmdet.ops import DeformConvPack def is_block(modules): @@ -52,7 +52,7 @@ def test_resnet_basic_block(): with pytest.raises(AssertionError): # Not implemented yet. 
- dcn = dict(type='DCN', deformable_groups=1, fallback_on_stride=False) + dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False) BasicBlock(64, 64, dcn=dcn) with pytest.raises(AssertionError): @@ -146,11 +146,11 @@ def test_resnet_bottleneck(): assert block.conv2.stride == (1, 1) # Test Bottleneck DCN - dcn = dict(type='DCN', deformable_groups=1, fallback_on_stride=False) + dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False) with pytest.raises(AssertionError): Bottleneck(64, 64, dcn=dcn, conv_cfg=dict(type='Conv')) block = Bottleneck(64, 64, dcn=dcn) - assert isinstance(block.conv2, DeformConvPack) + assert isinstance(block.conv2, DeformConv2dPack) # Test Bottleneck forward block = Bottleneck(64, 16) @@ -198,7 +198,7 @@ def test_resnet_bottleneck(): attention_type='0010', kv_stride=2), position='after_conv2'), - dict(cfg=dict(type='NonLocal2D'), position='after_conv2'), + dict(cfg=dict(type='NonLocal2d'), position='after_conv2'), dict( cfg=dict(type='ContextBlock', ratio=1. / 16), position='after_conv3') @@ -301,7 +301,7 @@ def test_resnet_backbone(): with pytest.raises(AssertionError): # len(stage_with_dcn) == num_stages - dcn = dict(type='DCN', deformable_groups=1, fallback_on_stride=False) + dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False) ResNet(50, dcn=dcn, stage_with_dcn=(True, )) with pytest.raises(AssertionError): @@ -468,7 +468,7 @@ def test_resnet_backbone(): kv_stride=2), stages=(False, True, True, True), position='after_conv2'), - dict(cfg=dict(type='NonLocal2D'), position='after_conv2'), + dict(cfg=dict(type='NonLocal2d'), position='after_conv2'), dict( cfg=dict(type='ContextBlock', ratio=1. / 16), stages=(False, True, True, False), @@ -632,7 +632,7 @@ def test_renext_bottleneck(): assert block.conv2.out_channels == 128 # Test ResNeXt Bottleneck with DCN - dcn = dict(type='DCN', deformable_groups=1, fallback_on_stride=False) + dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False) with pytest.raises(AssertionError): # conv_cfg must be None if dcn is not None BottleneckX( @@ -743,7 +743,7 @@ def test_res2net_bottle2neck(): assert block.scales == 4 # Test Res2Net Bottle2neck with DCN - dcn = dict(type='DCN', deformable_groups=1, fallback_on_stride=False) + dcn = dict(type='DCN', deform_groups=1, fallback_on_stride=False) with pytest.raises(AssertionError): # conv_cfg must be None if dcn is not None Bottle2neck( diff --git a/tests/test_models/test_pisa_heads.py b/tests/test_models/test_pisa_heads.py index c085147c998..6b1d42db49c 100644 --- a/tests/test_models/test_pisa_heads.py +++ b/tests/test_models/test_pisa_heads.py @@ -174,7 +174,7 @@ def test_pisa_roi_head_loss(): bbox_roi_extractor = dict( type='SingleRoIExtractor', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=1, featmap_strides=[1]) diff --git a/tests/test_models/test_roi_extractor.py b/tests/test_models/test_roi_extractor.py index a523edb4875..22743f2d3be 100644 --- a/tests/test_models/test_roi_extractor.py +++ b/tests/test_models/test_roi_extractor.py @@ -7,7 +7,7 @@ def test_groie(): # test with pre/post cfg = dict( - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32], pre_cfg=dict( @@ -42,7 +42,7 @@ def test_groie(): # test w.o. 
pre/post cfg = dict( - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256, featmap_strides=[4, 8, 16, 32]) @@ -63,7 +63,7 @@ def test_groie(): # test w.o. pre/post concat cfg = dict( aggregation='concat', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256 * 4, featmap_strides=[4, 8, 16, 32]) @@ -85,7 +85,7 @@ def test_groie(): with pytest.raises(AssertionError): cfg = dict( aggregation='not support', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=1024, featmap_strides=[4, 8, 16, 32]) _ = GenericRoIExtractor(**cfg) @@ -93,7 +93,7 @@ def test_groie(): # test concat channels number cfg = dict( aggregation='concat', - roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=2), out_channels=256 * 5, # 256*5 != 256*4 featmap_strides=[4, 8, 16, 32]) diff --git a/tests/test_ops/test_corner_pool.py b/tests/test_ops/test_corner_pool.py deleted file mode 100644 index cb84acf0d79..00000000000 --- a/tests/test_ops/test_corner_pool.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -CommandLine: - pytest tests/test_corner_pool.py -""" -import pytest -import torch - -from mmdet.ops import CornerPool - - -def test_corner_pool_device_and_dtypes_cpu(): - """ - CommandLine: - xdoctest -m tests/test_corner_pool.py \ - test_corner_pool_device_and_dtypes_cpu - """ - with pytest.raises(AssertionError): - # pool mode must in ['bottom', 'left', 'right', 'top'] - pool = CornerPool('corner') - - lr_tensor = torch.tensor([[[[0, 0, 0, 0, 0], [2, 1, 3, 0, 2], - [5, 4, 1, 1, 6], [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]]]) - tb_tensor = torch.tensor([[[[0, 3, 1, 0, 0], [0, 1, 1, 0, 0], - [0, 3, 4, 0, 0], [0, 2, 2, 0, 0], - [0, 0, 2, 0, 0]]]]) - # Left Pool - left_answer = torch.tensor([[[[0, 0, 0, 0, 0], [3, 3, 3, 2, 2], - [6, 6, 6, 6, 6], [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]]]) - pool = CornerPool('left') - left_tensor = pool(lr_tensor) - assert left_tensor.type() == lr_tensor.type() - assert torch.equal(left_tensor, left_answer) - # Right Pool - right_answer = torch.tensor([[[[0, 0, 0, 0, 0], [2, 2, 3, 3, 3], - [5, 5, 5, 5, 6], [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]]]) - pool = CornerPool('right') - right_tensor = pool(lr_tensor) - assert right_tensor.type() == lr_tensor.type() - assert torch.equal(right_tensor, right_answer) - # Top Pool - top_answer = torch.tensor([[[[0, 3, 4, 0, 0], [0, 3, 4, 0, 0], - [0, 3, 4, 0, 0], [0, 2, 2, 0, 0], - [0, 0, 2, 0, 0]]]]) - pool = CornerPool('top') - top_tensor = pool(tb_tensor) - assert top_tensor.type() == tb_tensor.type() - assert torch.equal(top_tensor, top_answer) - # Bottom Pool - bottom_answer = torch.tensor([[[[0, 3, 1, 0, 0], [0, 3, 1, 0, 0], - [0, 3, 4, 0, 0], [0, 3, 4, 0, 0], - [0, 3, 4, 0, 0]]]]) - pool = CornerPool('bottom') - bottom_tensor = pool(tb_tensor) - assert bottom_tensor.type() == tb_tensor.type() - assert torch.equal(bottom_tensor, bottom_answer) diff --git a/tests/test_ops/test_merge_cells.py b/tests/test_ops/test_merge_cells.py deleted file mode 100644 index 25e76ee7a20..00000000000 --- a/tests/test_ops/test_merge_cells.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -CommandLine: - pytest tests/test_merge_cells.py -""" -import torch -import torch.nn.functional as F - -from mmdet.ops.merge_cells import (BaseMergeCell, ConcatCell, - 
GlobalPoolingCell, SumCell) - - -def test_sum_cell(): - inputs_x = torch.randn([2, 256, 32, 32]) - inputs_y = torch.randn([2, 256, 16, 16]) - sum_cell = SumCell(256, 256) - output = sum_cell(inputs_x, inputs_y, out_size=inputs_x.shape[-2:]) - assert output.size() == inputs_x.size() - output = sum_cell(inputs_x, inputs_y, out_size=inputs_y.shape[-2:]) - assert output.size() == inputs_y.size() - output = sum_cell(inputs_x, inputs_y) - assert output.size() == inputs_x.size() - - -def test_concat_cell(): - inputs_x = torch.randn([2, 256, 32, 32]) - inputs_y = torch.randn([2, 256, 16, 16]) - concat_cell = ConcatCell(256, 256) - output = concat_cell(inputs_x, inputs_y, out_size=inputs_x.shape[-2:]) - assert output.size() == inputs_x.size() - output = concat_cell(inputs_x, inputs_y, out_size=inputs_y.shape[-2:]) - assert output.size() == inputs_y.size() - output = concat_cell(inputs_x, inputs_y) - assert output.size() == inputs_x.size() - - -def test_global_pool_cell(): - inputs_x = torch.randn([2, 256, 32, 32]) - inputs_y = torch.randn([2, 256, 32, 32]) - gp_cell = GlobalPoolingCell(with_out_conv=False) - gp_cell_out = gp_cell(inputs_x, inputs_y, out_size=inputs_x.shape[-2:]) - assert (gp_cell_out.size() == inputs_x.size()) - gp_cell = GlobalPoolingCell(256, 256) - gp_cell_out = gp_cell(inputs_x, inputs_y, out_size=inputs_x.shape[-2:]) - assert (gp_cell_out.size() == inputs_x.size()) - - -def test_resize_methods(): - inputs_x = torch.randn([2, 256, 128, 128]) - target_resize_sizes = [(128, 128), (256, 256)] - resize_methods_list = ['nearest', 'bilinear'] - - for method in resize_methods_list: - merge_cell = BaseMergeCell(upsample_mode=method) - for target_size in target_resize_sizes: - merge_cell_out = merge_cell._resize(inputs_x, target_size) - gt_out = F.interpolate(inputs_x, size=target_size, mode=method) - assert merge_cell_out.equal(gt_out) - - target_size = (64, 64) # resize to a smaller size - merge_cell = BaseMergeCell() - merge_cell_out = merge_cell._resize(inputs_x, target_size) - kernel_size = inputs_x.shape[-1] // target_size[-1] - gt_out = F.max_pool2d( - inputs_x, kernel_size=kernel_size, stride=kernel_size) - assert (merge_cell_out == gt_out).all() diff --git a/tests/test_ops/test_nms.py b/tests/test_ops/test_nms.py deleted file mode 100644 index a9063149309..00000000000 --- a/tests/test_ops/test_nms.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -CommandLine: - pytest tests/test_nms.py -""" -import numpy as np -import pytest -import torch - -from mmdet.ops.nms.nms_wrapper import nms, nms_match - - -def test_nms_device_and_dtypes_cpu(): - """ - CommandLine: - xdoctest -m tests/test_nms.py test_nms_device_and_dtypes_cpu - """ - iou_thr = 0.6 - base_dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.1], - [49.3, 32.9, 51.0, 35.3, 0.05], - [35.3, 11.5, 39.9, 14.5, 0.9], - [35.2, 11.7, 39.7, 15.7, 0.3]]) - - base_expected_suppressed = np.array([[35.3, 11.5, 39.9, 14.5, 0.9], - [49.1, 32.4, 51.0, 35.9, 0.1]]) - # CPU can handle float32 and float64 - dets = base_dets.astype(np.float32) - expected_suppressed = base_expected_suppressed.astype(np.float32) - suppressed, inds = nms(dets, iou_thr) - assert dets.dtype == suppressed.dtype - assert np.array_equal(suppressed, expected_suppressed) - - dets = torch.FloatTensor(base_dets) - expected_suppressed = torch.FloatTensor(base_expected_suppressed) - suppressed, inds = nms(dets, iou_thr) - assert dets.dtype == suppressed.dtype - assert torch.equal(suppressed, expected_suppressed) - - dets = base_dets.astype(np.float64) - expected_suppressed = 
base_expected_suppressed.astype(np.float64) - suppressed, inds = nms(dets, iou_thr) - assert dets.dtype == suppressed.dtype - assert np.array_equal(suppressed, expected_suppressed) - - dets = torch.DoubleTensor(base_dets) - expected_suppressed = torch.DoubleTensor(base_expected_suppressed) - suppressed, inds = nms(dets, iou_thr) - assert dets.dtype == suppressed.dtype - assert torch.equal(suppressed, expected_suppressed) - - -def test_nms_device_and_dtypes_gpu(): - """ - CommandLine: - xdoctest -m tests/test_nms.py test_nms_device_and_dtypes_gpu - """ - if not torch.cuda.is_available(): - import pytest - pytest.skip('test requires GPU and torch+cuda') - - iou_thr = 0.6 - base_dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.1], - [49.3, 32.9, 51.0, 35.3, 0.05], - [35.3, 11.5, 39.9, 14.5, 0.9], - [35.2, 11.7, 39.7, 15.7, 0.3]]) - - base_expected_suppressed = np.array([[35.3, 11.5, 39.9, 14.5, 0.9], - [49.1, 32.4, 51.0, 35.9, 0.1]]) - - for device_id in range(torch.cuda.device_count()): - print(f'Run NMS on device_id = {device_id!r}') - # GPU can handle float32 but not float64 - dets = base_dets.astype(np.float32) - expected_suppressed = base_expected_suppressed.astype(np.float32) - suppressed, inds = nms(dets, iou_thr, device_id) - assert dets.dtype == suppressed.dtype - assert np.array_equal(suppressed, expected_suppressed) - - dets = torch.FloatTensor(base_dets).to(device_id) - expected_suppressed = torch.FloatTensor(base_expected_suppressed).to( - device_id) - suppressed, inds = nms(dets, iou_thr) - assert dets.dtype == suppressed.dtype - assert torch.equal(suppressed, expected_suppressed) - - -def test_nms_match(): - iou_thr = 0.6 - # empty input - empty_dets = np.array([]) - assert len(nms_match(empty_dets, iou_thr)) == 0 - - # non empty ndarray input - np_dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9], - [49.3, 32.9, 51.0, 35.3, 0.9], - [35.3, 11.5, 39.9, 14.5, 0.4], - [35.2, 11.7, 39.7, 15.7, 0.3]]) - np_groups = nms_match(np_dets, iou_thr) - assert isinstance(np_groups[0], np.ndarray) - assert len(np_groups) == 2 - nms_keep_inds = nms(np_dets, iou_thr)[1] - assert set([g[0].item() for g in np_groups]) == set(nms_keep_inds.tolist()) - - # non empty tensor input - tensor_dets = torch.from_numpy(np_dets) - tensor_groups = nms_match(tensor_dets, iou_thr) - assert isinstance(tensor_groups[0], torch.Tensor) - for i in range(len(tensor_groups)): - assert np.equal(tensor_groups[i].numpy(), np_groups[i]).all() - - # input of wrong shape - wrong_dets = np.zeros((2, 3)) - with pytest.raises(AssertionError): - nms_match(wrong_dets, iou_thr) diff --git a/tests/test_ops/test_soft_nms.py b/tests/test_ops/test_soft_nms.py deleted file mode 100644 index 58503eaaac2..00000000000 --- a/tests/test_ops/test_soft_nms.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -CommandLine: - pytest tests/test_soft_nms.py -""" -import numpy as np -import torch - -from mmdet.ops.nms.nms_wrapper import soft_nms - - -def test_soft_nms_device_and_dtypes_cpu(): - """ - CommandLine: - xdoctest -m tests/test_soft_nms.py test_soft_nms_device_and_dtypes_cpu - """ - iou_thr = 0.7 - base_dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9], - [49.3, 32.9, 51.0, 35.3, 0.9], - [35.3, 11.5, 39.9, 14.5, 0.4], - [35.2, 11.7, 39.7, 15.7, 0.3]]) - - # CPU can handle float32 and float64 - dets = base_dets.astype(np.float32) - new_dets, inds = soft_nms(dets, iou_thr) - assert dets.dtype == new_dets.dtype - assert len(inds) == len(new_dets) == 4 - - dets = torch.FloatTensor(base_dets) - new_dets, inds = soft_nms(dets, iou_thr) - assert dets.dtype == 
new_dets.dtype - assert len(inds) == len(new_dets) == 4 - - dets = base_dets.astype(np.float64) - new_dets, inds = soft_nms(dets, iou_thr) - assert dets.dtype == new_dets.dtype - assert len(inds) == len(new_dets) == 4 - - dets = torch.DoubleTensor(base_dets) - new_dets, inds = soft_nms(dets, iou_thr) - assert dets.dtype == new_dets.dtype - assert len(inds) == len(new_dets) == 4 diff --git a/tests/test_ops/test_wrappers.py b/tests/test_ops/test_wrappers.py deleted file mode 100644 index 1ae38f70478..00000000000 --- a/tests/test_ops/test_wrappers.py +++ /dev/null @@ -1,198 +0,0 @@ -from collections import OrderedDict -from itertools import product -from unittest.mock import patch - -import torch -import torch.nn as nn - -from mmdet.ops import Conv2d, ConvTranspose2d, Linear, MaxPool2d - -torch.__version__ = '1.1' # force test - - -def test_conv2d(): - """ - CommandLine: - xdoctest -m tests/test_wrappers.py test_conv2d - """ - - test_cases = OrderedDict([('in_w', [10, 20]), ('in_h', [10, 20]), - ('in_channel', [1, 3]), ('out_channel', [1, 3]), - ('kernel_size', [3, 5]), ('stride', [1, 2]), - ('padding', [0, 1]), ('dilation', [1, 2])]) - - # train mode - for in_h, in_w, in_cha, out_cha, k, s, p, d in product( - *list(test_cases.values())): - # wrapper op with 0-dim input - x_empty = torch.randn(0, in_cha, in_h, in_w) - torch.manual_seed(0) - wrapper = Conv2d(in_cha, out_cha, k, stride=s, padding=p, dilation=d) - wrapper_out = wrapper(x_empty) - - # torch op with 3-dim input as shape reference - x_normal = torch.randn(3, in_cha, in_h, in_w).requires_grad_(True) - torch.manual_seed(0) - ref = nn.Conv2d(in_cha, out_cha, k, stride=s, padding=p, dilation=d) - ref_out = ref(x_normal) - - assert wrapper_out.shape[0] == 0 - assert wrapper_out.shape[1:] == ref_out.shape[1:] - - wrapper_out.sum().backward() - assert wrapper.weight.grad is not None - assert wrapper.weight.grad.shape == wrapper.weight.shape - - assert torch.equal(wrapper(x_normal), ref_out) - - # eval mode - x_empty = torch.randn(0, in_cha, in_h, in_w) - wrapper = Conv2d(in_cha, out_cha, k, stride=s, padding=p, dilation=d) - wrapper.eval() - wrapper(x_empty) - - -def test_conv_transposed_2d(): - test_cases = OrderedDict([('in_w', [10, 20]), ('in_h', [10, 20]), - ('in_channel', [1, 3]), ('out_channel', [1, 3]), - ('kernel_size', [3, 5]), ('stride', [1, 2]), - ('padding', [0, 1]), ('dilation', [1, 2])]) - - for in_h, in_w, in_cha, out_cha, k, s, p, d in product( - *list(test_cases.values())): - # wrapper op with 0-dim input - x_empty = torch.randn(0, in_cha, in_h, in_w, requires_grad=True) - # out padding must be smaller than either stride or dilation - op = min(s, d) - 1 - torch.manual_seed(0) - wrapper = ConvTranspose2d( - in_cha, - out_cha, - k, - stride=s, - padding=p, - dilation=d, - output_padding=op) - wrapper_out = wrapper(x_empty) - - # torch op with 3-dim input as shape reference - x_normal = torch.randn(3, in_cha, in_h, in_w) - torch.manual_seed(0) - ref = nn.ConvTranspose2d( - in_cha, - out_cha, - k, - stride=s, - padding=p, - dilation=d, - output_padding=op) - ref_out = ref(x_normal) - - assert wrapper_out.shape[0] == 0 - assert wrapper_out.shape[1:] == ref_out.shape[1:] - - wrapper_out.sum().backward() - assert wrapper.weight.grad is not None - assert wrapper.weight.grad.shape == wrapper.weight.shape - - assert torch.equal(wrapper(x_normal), ref_out) - - # eval mode - x_empty = torch.randn(0, in_cha, in_h, in_w) - wrapper = ConvTranspose2d( - in_cha, out_cha, k, stride=s, padding=p, dilation=d, output_padding=op) - 
wrapper.eval() - wrapper(x_empty) - - -def test_max_pool_2d(): - test_cases = OrderedDict([('in_w', [10, 20]), ('in_h', [10, 20]), - ('in_channel', [1, 3]), ('out_channel', [1, 3]), - ('kernel_size', [3, 5]), ('stride', [1, 2]), - ('padding', [0, 1]), ('dilation', [1, 2])]) - - for in_h, in_w, in_cha, out_cha, k, s, p, d in product( - *list(test_cases.values())): - # wrapper op with 0-dim input - x_empty = torch.randn(0, in_cha, in_h, in_w, requires_grad=True) - wrapper = MaxPool2d(k, stride=s, padding=p, dilation=d) - wrapper_out = wrapper(x_empty) - - # torch op with 3-dim input as shape reference - x_normal = torch.randn(3, in_cha, in_h, in_w) - ref = nn.MaxPool2d(k, stride=s, padding=p, dilation=d) - ref_out = ref(x_normal) - - assert wrapper_out.shape[0] == 0 - assert wrapper_out.shape[1:] == ref_out.shape[1:] - - assert torch.equal(wrapper(x_normal), ref_out) - - -def test_linear(): - test_cases = OrderedDict([ - ('in_w', [10, 20]), - ('in_h', [10, 20]), - ('in_feature', [1, 3]), - ('out_feature', [1, 3]), - ]) - - for in_h, in_w, in_feature, out_feature in product( - *list(test_cases.values())): - # wrapper op with 0-dim input - x_empty = torch.randn(0, in_feature, requires_grad=True) - torch.manual_seed(0) - wrapper = Linear(in_feature, out_feature) - wrapper_out = wrapper(x_empty) - - # torch op with 3-dim input as shape reference - x_normal = torch.randn(3, in_feature) - torch.manual_seed(0) - ref = nn.Linear(in_feature, out_feature) - ref_out = ref(x_normal) - - assert wrapper_out.shape[0] == 0 - assert wrapper_out.shape[1:] == ref_out.shape[1:] - - wrapper_out.sum().backward() - assert wrapper.weight.grad is not None - assert wrapper.weight.grad.shape == wrapper.weight.shape - - assert torch.equal(wrapper(x_normal), ref_out) - - # eval mode - x_empty = torch.randn(0, in_feature) - wrapper = Linear(in_feature, out_feature) - wrapper.eval() - wrapper(x_empty) - - -def test_nn_op_forward_called(): - torch.__version__ = '1.4.1' - - for m in ['Conv2d', 'ConvTranspose2d', 'MaxPool2d']: - with patch(f'torch.nn.{m}.forward') as nn_module_forward: - # randn input - x_empty = torch.randn(0, 3, 10, 10) - wrapper = eval(m)(3, 2, 1) - wrapper(x_empty) - nn_module_forward.assert_called_with(x_empty) - - # non-randn input - x_normal = torch.randn(1, 3, 10, 10) - wrapper = eval(m)(3, 2, 1) - wrapper(x_normal) - nn_module_forward.assert_called_with(x_normal) - - with patch('torch.nn.Linear.forward') as nn_module_forward: - # randn input - x_empty = torch.randn(0, 3) - wrapper = Linear(3, 3) - wrapper(x_empty) - nn_module_forward.assert_not_called() - - # non-randn input - x_normal = torch.randn(1, 3) - wrapper = Linear(3, 3) - wrapper(x_normal) - nn_module_forward.assert_called_with(x_normal) diff --git a/tools/pytorch2onnx.py b/tools/pytorch2onnx.py index 0425b280a64..4a251be3451 100644 --- a/tools/pytorch2onnx.py +++ b/tools/pytorch2onnx.py @@ -4,12 +4,12 @@ import mmcv import onnx import torch +from mmcv.ops import RoIAlign, RoIPool from mmcv.runner import load_checkpoint from onnx import optimizer from torch.onnx import OperatorExportTypes from mmdet.models import build_detector -from mmdet.ops import RoIAlign, RoIPool def export_onnx_model(model, inputs, passes):
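Taken together, these hunks finish moving every compiled op out of mmdet and onto mmcv-full: the extensions and their tests are deleted, `setup.py` no longer builds `ext_modules`, and callers such as `mmdet/utils/collect_env.py` and `tools/pytorch2onnx.py` import from `mmcv.ops` instead. A quick smoke test of the new import paths, assuming mmcv-full is installed per the updated requirements:

```python
# The compiled-op helpers that used to live in mmdet.ops now come from mmcv.ops,
# mirroring the collect_env.py and pytorch2onnx.py changes above.
from mmcv.ops import (RoIAlign, RoIPool,  # noqa: F401
                      get_compiler_version, get_compiling_cuda_version)

print(get_compiler_version())        # e.g. 'GCC 7.5'
print(get_compiling_cuda_version())  # e.g. '10.1', or 'not available' on CPU-only builds
```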