[Refactor]: Refactor DETR and Deformable DETR #8763

Merged: 134 commits, Oct 20, 2022
Changes from 5 commits

Commits (134 total)
e3c0b69
[Fix] Fix UT to be compatible with pytorch 1.6 (#8707)
jbwang1997 Sep 5, 2022
c9b4782
[Refactor] Refactor anchor head and base head with boxlist (#8625)
jbwang1997 Sep 9, 2022
73eae86
fix: fix config of detr-r18
Li-Qingyun Sep 10, 2022
09d52c9
fix: modified import of MSDeformAttn in PixelDecoder of Mask2Former
Li-Qingyun Sep 10, 2022
0e5b9c5
feat: add TransformerDetector as the base detector of DETR-like detec…
Li-Qingyun Sep 10, 2022
875f2f7
refactor: refactor modules and configs of DETR
Li-Qingyun Sep 10, 2022
0e52589
refactor: refactor DETR-related modules in transformer.py
Li-Qingyun Sep 10, 2022
8714d01
refactor: refactor DETR-related modules in transformer.py
Li-Qingyun Sep 10, 2022
c087022
fix: add type comments in detr.py
Li-Qingyun Sep 11, 2022
8288978
correct trainloop in detr_r50 config
KeiChiTse Sep 12, 2022
8e2cc9c
fix: modify the parent class of DETRHead to BaseModule
Li-Qingyun Sep 16, 2022
0985043
Merge remote-tracking branch 'origin/refactor-detr-3.x' into refactor…
Li-Qingyun Sep 16, 2022
9d6bdbd
refactor: refactor modules and configs of Deformable DETR
Li-Qingyun Sep 16, 2022
2fa2680
fix: modify the usage of num_query
Li-Qingyun Sep 16, 2022
51799be
fix: modify the usage of num_query in configs
Li-Qingyun Sep 16, 2022
542d65b
refactor: replace input_proj of detr with ChannelMapper neck
Li-Qingyun Sep 19, 2022
57dd319
refactor: delete multi_apply in DETRHead.forward()
Li-Qingyun Sep 19, 2022
fd90a64
Update detr_r18_8xb2-500e_coco.py
LYMDLUT Sep 19, 2022
2b9e99a
change the name of detection_transfomer.py to base_detr.py
LYM-fire Sep 19, 2022
ce68982
refactor: modify construct binary masks section of forward_pretransfo…
Li-Qingyun Sep 19, 2022
fbb2998
Merge branch 'refactor-detr-3.x' of github.com:Li-Qingyun/mmdetection…
Li-Qingyun Sep 19, 2022
a952396
Merge remote-tracking branch 'origin/refactor-detr-3.x' into refactor…
LYM-fire Sep 19, 2022
2194168
refactor: utilize abstractmethod
Li-Qingyun Sep 19, 2022
70ba3e2
Merge branch 'refactor-detr-3.x' of github.com:Li-Qingyun/mmdetection…
Li-Qingyun Sep 19, 2022
6fe98ed
update ABCmeta to make sure reload class TransformerDetector
LYM-fire Sep 19, 2022
45e7675
some annotation
LYM-fire Sep 19, 2022
2b8ca69
some annotation
LYM-fire Sep 19, 2022
fa38351
some annotation
LYM-fire Sep 19, 2022
65edfa9
refactor: delete _init_transformer in detectors
Li-Qingyun Sep 19, 2022
3156ba6
refactor: modify args of deformable detr
Li-Qingyun Sep 19, 2022
e1bbdd2
refactor: modify about super().__init__()
Li-Qingyun Sep 19, 2022
20e9821
Update detr_head.py
KeiChiTse Sep 19, 2022
e386e89
Update detr.py
KeiChiTse Sep 19, 2022
c65cf2b
some annotation for head
LYM-fire Sep 19, 2022
45295d4
to make sure the head args the same as detector
LYM-fire Sep 19, 2022
73d47c0
to make sure the head args the same as detector
LYM-fire Sep 19, 2022
d0b5aeb
some bug
LYM-fire Sep 19, 2022
2edd0c9
fix: fix bugs of num_pred in DeformableDETRHead
Li-Qingyun Sep 20, 2022
a265c1a
add kwargs to transformer
LYM-fire Sep 20, 2022
57b9869
support MLP and sineembed position
LYM-fire Sep 20, 2022
63f634a
detele positional encodeing
LYM-fire Sep 20, 2022
fe37bd0
delete useless postnorm
LYM-fire Sep 20, 2022
aa290a6
Revert "add kwargs to transformer"
LYM-fire Sep 20, 2022
b31c011
Update detr_head.py
KeiChiTse Sep 20, 2022
af601d6
Update detr_head.py
KeiChiTse Sep 20, 2022
abcb217
Update base_detr.py
KeiChiTse Sep 20, 2022
63c6b93
Update deformable_detr.py
KeiChiTse Sep 20, 2022
3581a00
to support conditional detr with reload forward_transformer
LYM-fire Sep 20, 2022
10a7075
fix: update config files of Two-stage and Box-refine
Li-Qingyun Sep 21, 2022
6d0051e
replace all bs with batch_size in detr-related files
Li-Qingyun Sep 21, 2022
eb3a9d1
update deformable.py and transformer.py
Li-Qingyun Sep 21, 2022
b09688c
update docstring in base_detr
KeiChiTse Sep 21, 2022
8dfc9c1
update docstring in base_detr, detr
KeiChiTse Sep 21, 2022
b69da4f
doc refine
LYM-fire Sep 21, 2022
99d0a5a
Revert "doc refine"
LYM-fire Sep 21, 2022
d692f69
doc refine
LYM-fire Sep 21, 2022
4fae213
doc refine
LYM-fire Sep 21, 2022
9167465
updabase_detr, detr, and le layers/transformdoc
KeiChiTse Sep 21, 2022
e8c8ea0
update doc in transformer
KeiChiTse Sep 21, 2022
4edceca
fix doc in base_detr
KeiChiTse Sep 21, 2022
56d01a0
add origin repo link
LYM-fire Sep 22, 2022
92fbaef
add origin repo link
LYM-fire Sep 22, 2022
a866e88
refine doc
LYM-fire Sep 22, 2022
702c915
refine doc
LYM-fire Sep 22, 2022
0172ba7
refine doc
LYM-fire Sep 22, 2022
7652c68
refine doc
LYM-fire Sep 22, 2022
db04c47
refine doc
LYM-fire Sep 22, 2022
48cfe23
refine doc
LYM-fire Sep 22, 2022
18990e9
refine doc
LYM-fire Sep 22, 2022
dacbdf5
refine doc
LYM-fire Sep 22, 2022
ec04bb9
doc: add doc of the first edition of Deformable DETR
Li-Qingyun Sep 22, 2022
3e6f7b8
Merge branch 'refactor-detr-3.x' of github.com:Li-Qingyun/mmdetection…
Li-Qingyun Sep 22, 2022
b2cf331
batch_size to bs
LYM-fire Sep 22, 2022
4687bc3
refine doc
LYM-fire Sep 22, 2022
59d7eb0
refine doc
LYM-fire Sep 22, 2022
bb294c6
feat: add config comments of specific module
Li-Qingyun Sep 27, 2022
9271b44
refactor: refactor base DETR class TransformerDetector
Li-Qingyun Sep 29, 2022
ad70c44
fix: fix wrong return typehint of forward_encoder in TransformerDetector
Li-Qingyun Sep 29, 2022
b31301c
refactor: refactor DETR
Li-Qingyun Sep 29, 2022
95eff3d
refactor: refactor Deformable DETR
Li-Qingyun Sep 30, 2022
f9d7d2b
refactor: refactor forward_encoder and pre_decoder
Li-Qingyun Oct 1, 2022
ad7c03f
fix: fix bugs of new edition
Li-Qingyun Oct 1, 2022
13dde26
refactor: small modifications
Li-Qingyun Oct 6, 2022
8685649
fix: move get_reference_points to deformable_encoder
Li-Qingyun Oct 6, 2022
69b0eb0
refactor: merge init_&inter_reference to references in Deformable DETR
Li-Qingyun Oct 6, 2022
015d45b
modify docstring of get_valid_ratio in Deformable DETR
Li-Qingyun Oct 6, 2022
becb862
add some docstring
Li-Qingyun Oct 6, 2022
670052b
doc: add docstring of deformable_detr.py
Li-Qingyun Oct 7, 2022
3027c5c
doc: add docstring of deformable_detr_head.py
Li-Qingyun Oct 8, 2022
9a2b801
doc: modify docstring of deformable detr
Li-Qingyun Oct 8, 2022
0d94371
doc: add docstring of deformable_detr_head.py
Li-Qingyun Oct 8, 2022
ceb59b4
doc: modify docstring of deformable detr
Li-Qingyun Oct 8, 2022
4e301ea
Merge branch 'refactor-detr-3.x' of github.com:Li-Qingyun/mmdetection…
Li-Qingyun Oct 8, 2022
bd86b2f
doc: add docstring of base_detr.py
Li-Qingyun Oct 8, 2022
01ade61
doc: refine docstring of base_detr.py
Li-Qingyun Oct 8, 2022
1c65fd8
doc: refine docstring of base_detr.py
Li-Qingyun Oct 8, 2022
baada0d
a little change of MLP
LYM-fire Oct 8, 2022
ac1c3cb
a little change of MLP
LYM-fire Oct 8, 2022
5e9bb65
a little change of MLP
LYM-fire Oct 8, 2022
f06de57
a little change of MLP
LYM-fire Oct 8, 2022
c3bfa32
refine config
LYM-fire Oct 8, 2022
9459640
refine config
LYM-fire Oct 8, 2022
986fe6a
refine config
LYM-fire Oct 8, 2022
f3b2a62
refine doc string for detr
LYM-fire Oct 8, 2022
131da21
little refine doc string for detr.py
LYM-fire Oct 8, 2022
9070abb
tiny modification
Li-Qingyun Oct 9, 2022
55612e6
doc: refine docstring of detr.py
Li-Qingyun Oct 9, 2022
70acfdc
tiny modifications to resolve the conversations
Li-Qingyun Oct 10, 2022
1251f5c
DETRHead.predict() draft
Li-Qingyun Oct 11, 2022
3e1448c
tiny modifications to resolve conversations
Li-Qingyun Oct 11, 2022
4ddc88d
refactor: modify arg names and forward strategies of bbox_head
Li-Qingyun Oct 11, 2022
3c97233
tiny modifications to resolve the conversations
Li-Qingyun Oct 12, 2022
1f43bcf
support MLP
LYM-fire Oct 14, 2022
9a6fc3f
fix docsting of function pre_decoder
KeiChiTse Oct 14, 2022
6d89bed
fix docsting of function pre_decoder
KeiChiTse Oct 14, 2022
b0ec1c3
fix docstring
KeiChiTse Oct 15, 2022
759db62
modifications for resolving conversations
Li-Qingyun Oct 15, 2022
9ed486e
Merge branch 'refactor-detr-3.x' of https://github.com/Li-Qingyun/mmd…
Li-Qingyun Oct 15, 2022
8324244
refactor: eradicate key_padding_mask args
Li-Qingyun Oct 15, 2022
395c18e
refactor: eradicate key_padding_mask args
Li-Qingyun Oct 15, 2022
00dd7b1
Merge branch 'refactor-detr-3.x' of github.com:Li-Qingyun/mmdetection…
Li-Qingyun Oct 15, 2022
5de00a3
fix: fix bug of deformable detr and resolve some conversations
Li-Qingyun Oct 18, 2022
510e017
refactor: rename base class with DetectionTransformer and other modif…
Li-Qingyun Oct 19, 2022
c9ef49b
fix: fix config of detr
Li-Qingyun Oct 19, 2022
1ddab86
fix the bug of init
LYM-fire Oct 19, 2022
7c8a374
fix: fix init_weight of DETR and Deformable DETR
Li-Qingyun Oct 19, 2022
126d427
Merge branch 'refactor-detr-3.x' of github.com:Li-Qingyun/mmdetection…
Li-Qingyun Oct 19, 2022
4bfa8ad
Merge branch 'refactor-detr' of github.com:open-mmlab/mmdetection int…
Li-Qingyun Oct 19, 2022
1c076bd
resolve conflict
Li-Qingyun Oct 19, 2022
968a14d
fix auto-merge bug
Li-Qingyun Oct 19, 2022
9446409
fix pre-commit bug
Li-Qingyun Oct 19, 2022
314cfcb
refactor: move the position of encoder and decoder
Li-Qingyun Oct 20, 2022
afa84e2
delete Transformer in ci test
LYM-fire Oct 20, 2022
7c9abef
delete Transformer in ci test
LYM-fire Oct 20, 2022
14 changes: 9 additions & 5 deletions .circleci/test.yml
@@ -66,10 +66,13 @@ jobs:
- run: pip install "protobuf <= 3.20.1" && sudo apt-get update && sudo apt-get -y install libprotobuf-dev protobuf-compiler cmake
- run:
name: Install mmdet dependencies
# numpy may be downgraded after building pycocotools, which causes `ImportError: numpy.core.multiarray failed to import`
# force reinstall pycocotools to ensure pycocotools is built under the current numpy
command: |
python -m pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main
python -m pip install << parameters.mmcv >>
pip install -r requirements/tests.txt -r requirements/optional.txt
pip install --force-reinstall pycocotools
pip install albumentations>=0.3.2 --no-binary imgaug,albumentations
pip install git+https://github.com/cocodataset/panopticapi.git
- run:
@@ -111,17 +114,18 @@ jobs:
command: |
docker build .circleci/docker -t mmdetection:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >>
docker run --gpus all -t -d -v /home/circleci/project:/mmdetection -v /home/circleci/mmengine:/mmengine -w /mmdetection --name mmdetection mmdetection:gpu
docker exec mmdetection apt-get install -y git
- run:
name: Install mmdet dependencies
# pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/torch${{matrix.torch_version}}/index.html
command: |
docker exec mmdetection pip install -e /mmengine
docker exec mmdetection pip install << parameters.mmcv >>
pip install -r requirements/tests.txt -r requirements/optional.txt
pip install pycocotools
pip install albumentations>=0.3.2 --no-binary imgaug,albumentations
pip install git+https://github.com/cocodataset/panopticapi.git
python -c 'import mmcv; print(mmcv.__version__)'
docker exec mmdetection pip install -r requirements/tests.txt -r requirements/optional.txt
docker exec mmdetection pip install pycocotools
docker exec mmdetection pip install albumentations>=0.3.2 --no-binary imgaug,albumentations
docker exec mmdetection pip install git+https://github.com/cocodataset/panopticapi.git
docker exec mmdetection python -c 'import mmcv; print(mmcv.__version__)'
- run:
name: Build and install
command: |
2 changes: 1 addition & 1 deletion configs/detr/detr_r18_8xb2-500e_coco.py
@@ -4,4 +4,4 @@
backbone=dict(
depth=18,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
neck=dict(in_channels=[64, 128, 256, 512]))
bbox_head=dict(in_channels=512))
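The hunk starts at line 4 of the config, so the lines above it are not shown. For context, a minimal sketch of what the full R-18 config plausibly looks like after this change (the _base_ path is an assumption, not visible in this diff):

# Hypothetical full content of configs/detr/detr_r18_8xb2-500e_coco.py.
# The R-18 variant inherits everything from the R-50 config and overrides
# only the backbone depth, its pretrained checkpoint, and the head input
# channels (ResNet-18's last stage outputs 512 channels, not 2048).
_base_ = './detr_r50_8xb2-500e_coco.py'  # assumed base file
model = dict(
    backbone=dict(
        depth=18,
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
    bbox_head=dict(in_channels=512))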
2 changes: 1 addition & 1 deletion mmdet/datasets/transforms/transforms.py
@@ -1136,7 +1136,7 @@ def transform(self, results: dict) -> dict:
if patch[2] == patch[0] or patch[3] == patch[1]:
continue
overlaps = boxes.overlaps(
HorizontalBoxes(patch.reshape(-1, 4)),
HorizontalBoxes(patch.reshape(-1, 4).astype(np.float32)),
boxes).numpy().reshape(-1)
if len(overlaps) > 0 and overlaps.min() < min_iou:
continue
44 changes: 29 additions & 15 deletions mmdet/models/dense_heads/anchor_head.py
@@ -8,12 +8,14 @@
from torch import Tensor

from mmdet.registry import MODELS, TASK_UTILS
from mmdet.structures.bbox import BaseBoxes
from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
OptInstanceList, OptMultiConfig)
from ..task_modules.prior_generators import (AnchorGenerator,
anchor_inside_flags)
from ..task_modules.samplers import PseudoSampler
from ..utils import images_to_levels, multi_apply, unmap
from ..utils import (cat_boxes, get_box_tensor, images_to_levels, multi_apply,
unmap)
from .base_dense_head import BaseDenseHead


@@ -120,8 +122,9 @@ def _init_layers(self) -> None:
self.conv_cls = nn.Conv2d(self.in_channels,
self.num_base_priors * self.cls_out_channels,
1)
self.conv_reg = nn.Conv2d(self.in_channels, self.num_base_priors * 4,
1)
reg_dim = self.bbox_coder.encode_size
self.conv_reg = nn.Conv2d(self.in_channels,
self.num_base_priors * reg_dim, 1)

def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
"""Forward feature of a single scale level.
@@ -197,7 +200,7 @@ def get_anchors(self,
return anchor_list, valid_flag_list

def _get_targets_single(self,
flat_anchors: Tensor,
flat_anchors: Union[Tensor, BaseBoxes],
valid_flags: Tensor,
gt_instances: InstanceData,
img_meta: dict,
@@ -207,8 +210,9 @@ def _get_targets_single(self,
single image.

Args:
flat_anchors (Tensor): Multi-level anchors of the image, which are
concatenated into a single tensor of shape (num_anchors, 4)
flat_anchors (Tensor or :obj:`BaseBoxes`): Multi-level anchors
of the image, which are concatenated into a single tensor
or boxlist of shape (num_anchors, 4)
valid_flags (Tensor): Multi level valid flags of the image,
which are concatenated into a single tensor of
shape (num_anchors, ).
@@ -243,7 +247,7 @@ def _get_targets_single(self,
'check the image size and anchor sizes, or set '
'``allowed_border`` to -1 to skip the condition.')
# assign gt and sample anchors
anchors = flat_anchors[inside_flags, :]
anchors = flat_anchors[inside_flags]

pred_instances = InstanceData(priors=anchors)
assign_result = self.assigner.assign(pred_instances, gt_instances,
@@ -254,8 +258,10 @@ def _get_targets_single(self,
gt_instances)

num_valid_anchors = anchors.shape[0]
bbox_targets = torch.zeros_like(anchors)
bbox_weights = torch.zeros_like(anchors)
target_dim = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox \
else self.bbox_coder.encode_size
bbox_targets = anchors.new_zeros(num_valid_anchors, target_dim)
bbox_weights = anchors.new_zeros(num_valid_anchors, target_dim)

# TODO: Considering saving memory, is it necessary to be long?
labels = anchors.new_full((num_valid_anchors, ),
@@ -265,12 +271,16 @@ def _get_targets_single(self,

pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
# `bbox_coder.encode` accepts tensor or boxlist inputs and generates
# tensor targets. If regressing decoded boxes, the code will convert
# boxlist `pos_bbox_targets` to tensor.
if len(pos_inds) > 0:
if not self.reg_decoded_bbox:
pos_bbox_targets = self.bbox_coder.encode(
sampling_result.pos_priors, sampling_result.pos_gt_bboxes)
else:
pos_bbox_targets = sampling_result.pos_gt_bboxes
pos_bbox_targets = get_box_tensor(pos_bbox_targets)
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0

@@ -362,7 +372,7 @@ def get_targets(self,
concat_valid_flag_list = []
for i in range(num_imgs):
assert len(anchor_list[i]) == len(valid_flag_list[i])
concat_anchor_list.append(torch.cat(anchor_list[i]))
concat_anchor_list.append(cat_boxes(anchor_list[i]))
concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))

# compute targets for each image
@@ -438,15 +448,19 @@ def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor,
loss_cls = self.loss_cls(
cls_score, labels, label_weights, avg_factor=avg_factor)
# regression loss
bbox_targets = bbox_targets.reshape(-1, 4)
bbox_weights = bbox_weights.reshape(-1, 4)
bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4)
target_dim = bbox_targets.size(-1)
bbox_targets = bbox_targets.reshape(-1, target_dim)
bbox_weights = bbox_weights.reshape(-1, target_dim)
bbox_pred = bbox_pred.permute(0, 2, 3,
1).reshape(-1,
self.bbox_coder.encode_size)
if self.reg_decoded_bbox:
# When the regression loss (e.g. `IouLoss`, `GIouLoss`)
# is applied directly on the decoded bounding boxes, it
# decodes the already encoded coordinates to absolute format.
anchors = anchors.reshape(-1, 4)
anchors = anchors.reshape(-1, anchors.size(-1))
bbox_pred = self.bbox_coder.decode(anchors, bbox_pred)
bbox_pred = get_box_tensor(bbox_pred)
loss_bbox = self.loss_bbox(
bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor)
return loss_cls, loss_bbox
@@ -500,7 +514,7 @@ def loss_by_feat(
# concat all level anchors and flags to a single tensor
concat_anchor_list = []
for i in range(len(anchor_list)):
concat_anchor_list.append(torch.cat(anchor_list[i]))
concat_anchor_list.append(cat_boxes(anchor_list[i]))
all_anchor_list = images_to_levels(concat_anchor_list,
num_level_anchors)

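The anchor_head.py changes above lean on box-type-agnostic helpers so that anchors can be either plain Tensors or box objects (e.g. HorizontalBoxes). A simplified sketch of the intended semantics, not the actual mmdet implementations:

import torch

def get_box_tensor_sketch(boxes):
    # Unwrap a box object to its raw tensor; plain tensors pass through.
    return boxes.tensor if hasattr(boxes, 'tensor') else boxes

def cat_boxes_sketch(box_list, dim=0):
    # Concatenate boxes of one type along `dim`, preserving the box type
    # when the inputs are box objects rather than plain tensors.
    if hasattr(box_list[0], 'cat'):
        return type(box_list[0]).cat(box_list, dim=dim)
    return torch.cat(box_list, dim=dim)

This is also why the target and loss code now reads target_dim and encode_size instead of hard-coding 4: the regression dimension follows the box coder and the box type rather than being assumed to be (x1, y1, x2, y2).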
19 changes: 10 additions & 9 deletions mmdet/models/dense_heads/base_dense_head.py
@@ -14,7 +14,8 @@
from mmdet.structures import SampleList
from mmdet.utils import InstanceList, OptMultiConfig
from ..test_time_augs import merge_aug_results
from ..utils import (filter_scores_and_topk, select_single_mlvl,
from ..utils import (cat_boxes, filter_scores_and_topk, get_box_tensor,
get_box_wh, scale_boxes, select_single_mlvl,
unpack_gt_instances)


@@ -360,7 +361,8 @@ def _predict_by_feat_single(self,

assert cls_score.size()[-2:] == bbox_pred.size()[-2:]

bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
dim = self.bbox_coder.encode_size
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim)
if with_score_factors:
score_factor = score_factor.permute(1, 2,
0).reshape(-1).sigmoid()
@@ -401,7 +403,7 @@ def _predict_by_feat_single(self,
mlvl_score_factors.append(score_factor)

bbox_pred = torch.cat(mlvl_bbox_preds)
priors = torch.cat(mlvl_valid_priors)
priors = cat_boxes(mlvl_valid_priors)
bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape)

results = InstanceData()
@@ -452,11 +454,10 @@ def _bbox_post_process(self,
- bboxes (Tensor): Has a shape (num_instances, 4),
the last dimension 4 arrange as (x1, y1, x2, y2).
"""

if rescale:
assert img_meta.get('scale_factor') is not None
results.bboxes /= results.bboxes.new_tensor(
img_meta['scale_factor']).repeat((1, 2))
scale_factor = [1 / s for s in img_meta['scale_factor']]
results.bboxes = scale_boxes(results.bboxes, scale_factor)

if hasattr(results, 'score_factors'):
# TODO: Add sqrt operation in order to be consistent with
@@ -466,15 +467,15 @@ def _bbox_post_process(self,

# filter small size bboxes
if cfg.get('min_bbox_size', -1) >= 0:
w = results.bboxes[:, 2] - results.bboxes[:, 0]
h = results.bboxes[:, 3] - results.bboxes[:, 1]
w, h = get_box_wh(results.bboxes)
valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
if not valid_mask.all():
results = results[valid_mask]

# TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg
if with_nms and results.bboxes.numel() > 0:
det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores,
bboxes = get_box_tensor(results.bboxes)
det_bboxes, keep_idxs = batched_nms(bboxes, results.scores,
results.labels, cfg.nms)
results = results[keep_idxs]
# some nms would reweight the score, such as softnms
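The _bbox_post_process changes follow the same pattern: rescaling and size filtering go through helpers that work for both plain tensors and box objects. A rough illustration of the assumed behavior for plain (x1, y1, x2, y2) tensors:

import torch
from torch import Tensor

def scale_boxes_sketch(boxes: Tensor, scale_factor) -> Tensor:
    # Multiply (x1, y1, x2, y2) by (w_scale, h_scale, w_scale, h_scale).
    # _bbox_post_process passes the inverse scale factor, so predictions
    # are mapped back to the original image resolution.
    repeat_num = boxes.size(-1) // 2
    return boxes * boxes.new_tensor(scale_factor).repeat(repeat_num)

def get_box_wh_sketch(boxes: Tensor):
    # Width and height of horizontal boxes, used by the min_bbox_size filter.
    w = boxes[:, 2] - boxes[:, 0]
    h = boxes[:, 3] - boxes[:, 1]
    return w, h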
3 changes: 2 additions & 1 deletion mmdet/models/dense_heads/retina_head.py
@@ -88,8 +88,9 @@ def _init_layers(self):
self.num_base_priors * self.cls_out_channels,
3,
padding=1)
reg_dim = self.bbox_coder.encode_size
self.retina_reg = nn.Conv2d(
self.feat_channels, self.num_base_priors * 4, 3, padding=1)
self.feat_channels, self.num_base_priors * reg_dim, 3, padding=1)

def forward_single(self, x):
"""Forward feature of a single scale level.
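Reading the regression dimension from the coder, rather than hard-coding 4, lets the same head drive coders whose encoding is not a 4-value delta. A toy illustration of the dependency (hypothetical coder, not from mmdet):

import torch.nn as nn

class ToyBoxCoder:
    # A coder advertises how many values it produces per box; the head
    # sizes its regression branch from this instead of assuming 4.
    encode_size = 4

coder = ToyBoxCoder()
num_base_priors, feat_channels = 9, 256
retina_reg = nn.Conv2d(
    feat_channels, num_base_priors * coder.encode_size, 3, padding=1)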
2 changes: 2 additions & 0 deletions mmdet/models/detectors/base.py
@@ -8,6 +8,7 @@

from mmdet.structures import DetDataSample, OptSampleList, SampleList
from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig
from ..utils import samplelist_boxlist2tensor

ForwardResults = Union[Dict[str, torch.Tensor], List[DetDataSample],
Tuple[torch.Tensor], torch.Tensor]
@@ -151,4 +152,5 @@ def add_pred_to_datasample(self, data_samples: SampleList,
"""
for data_sample, pred_instances in zip(data_samples, results_list):
data_sample.pred_instances = pred_instances
samplelist_boxlist2tensor(data_samples)
return data_samples
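samplelist_boxlist2tensor presumably converts any box-object fields left in the predictions back to plain tensors before the samples are returned, so downstream code keeps seeing ordinary Tensors. A rough sketch of that idea (assumed behavior, not the actual helper):

def samplelist_boxlist2tensor_sketch(batch_data_samples):
    # Replace box objects (which wrap a `.tensor`) in pred_instances with
    # their raw tensors.
    for data_sample in batch_data_samples:
        if hasattr(data_sample, 'pred_instances'):
            bboxes = data_sample.pred_instances.bboxes
            if hasattr(bboxes, 'tensor'):
                data_sample.pred_instances.bboxes = bboxes.tensor
    return batch_data_samples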
129 changes: 129 additions & 0 deletions mmdet/models/detectors/detection_transformer.py
@@ -0,0 +1,129 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Tuple, Union

from torch import Tensor

from mmdet.registry import MODELS
from mmdet.structures import OptSampleList, SampleList
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
from .base import BaseDetector


@MODELS.register_module()
class TransformerDetector(BaseDetector):
"""Base class for Transformer-based detectors.

Transformer-based detectors use an encoder to process output features of
the backbone (+ neck) and a decoder that pools features into a set of
learnable queries. Each query predicts a bounding box.
"""

def __init__(self,
backbone: ConfigType,
neck: OptConfigType = None,
encoder_cfg: OptConfigType = None,
decoder_cfg: OptConfigType = None,
positional_encoding_cfg: OptConfigType = None,
bbox_head: OptConfigType = None,
num_query: int = 100,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
init_cfg: OptMultiConfig = None) -> None:
super().__init__(
data_preprocessor=data_preprocessor, init_cfg=init_cfg)
# process args
bbox_head.update(train_cfg=train_cfg)
bbox_head.update(test_cfg=test_cfg)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.encoder_cfg = encoder_cfg
self.decoder_cfg = decoder_cfg
self.positional_encoding_cfg = positional_encoding_cfg
self.num_query = num_query

# init model layers
self.backbone = MODELS.build(backbone)
if neck is not None:
self.neck = MODELS.build(neck)
self.bbox_head = MODELS.build(bbox_head)
self._init_layers()

def _init_layers(self) -> None:
self._init_transformer()

def _init_transformer(self) -> None:
"""1. Initialize positional_encoding
2. Initialize encoder and decoder of transformer
3. Get self.embed_dims from the transformer
4. Initialize query_embed"""
raise NotImplementedError(
'The _init_transformer should be implemented for the detector.')

# def init_weight # TODO !!!!
# def _load_from_state_dict # TODO !!!!

def loss(self, batch_inputs: Tensor,
batch_data_samples: SampleList) -> Union[dict, list]:
img_feats = self.extract_feat(batch_inputs)
seq_feats = self.forward_pretransformer(img_feats, batch_data_samples)
outs_dec = self.forward_transformer(
**seq_feats, query_embed=self.query_embedding.weight)
losses = self.bbox_head.loss(outs_dec, batch_data_samples)
return losses

def predict(self,
batch_inputs: Tensor,
batch_data_samples: SampleList,
rescale: bool = True) -> SampleList:
img_feats = self.extract_feat(batch_inputs)
seq_feats = self.forward_pretransformer(img_feats, batch_data_samples)
outs_dec = self.forward_transformer(
**seq_feats, query_embed=self.query_embedding.weight)
results_list = self.bbox_head.predict(
outs_dec, batch_data_samples, rescale=rescale)
batch_data_samples = self.add_pred_to_datasample(
batch_data_samples, results_list)
return batch_data_samples

def _forward(
self,
batch_inputs: Tensor,
batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
img_feats = self.extract_feat(batch_inputs)
seq_feats = self.forward_pretransformer(img_feats, batch_data_samples)
outs_dec = self.forward_transformer(
**seq_feats, query_embed=self.query_embedding.weight)
results = self.bbox_head.forward(outs_dec)
return results

def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]:
"""Extract features.

Args:
batch_inputs (Tensor): Image tensor with shape (N, C, H ,W).

Returns:
tuple[Tensor]: Multi-level features that may have
different resolutions.
"""
x = self.backbone(batch_inputs)
if self.with_neck:
x = self.neck(x)
return x

def forward_pretransformer(
self,
img_feats: Tuple[Tensor],
batch_data_samples: OptSampleList = None) -> Dict[str, Tensor]:
"""1. Get batch padding mask.
2. Convert image feature maps to sequential features.
3. Get image positional embedding of features."""
raise NotImplementedError(
'The forward_pretransformer should be implemented '
'for the detector.')

def forward_transformer(self, **kwargs) -> Tuple[Tensor]:
"""Process sequential features with transformer."""
raise NotImplementedError(
'The forward_transformer should be implemented for the detector.')
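loss(), predict(), and _forward() above all share one pipeline: extract_feat -> forward_pretransformer -> forward_transformer -> bbox_head. A minimal sketch of what a concrete subclass has to provide (names, channel counts, and the single-level flattening are illustrative assumptions, not the refactored DETR itself; later commits in this PR rename the file to base_detr.py and the class to DetectionTransformer):

import torch.nn as nn
from mmdet.models.detectors.detection_transformer import TransformerDetector

class ToyDETR(TransformerDetector):
    """Hypothetical subclass illustrating the three abstract hooks."""

    def _init_transformer(self) -> None:
        # A real detector builds the positional encoding, encoder, and
        # decoder from self.positional_encoding_cfg / encoder_cfg /
        # decoder_cfg; here we only create the pieces the base class uses.
        self.embed_dims = 256
        self.input_proj = nn.Conv2d(2048, self.embed_dims, kernel_size=1)
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)

    def forward_pretransformer(self, img_feats, batch_data_samples=None):
        # Project the last feature map and flatten it to a (bs, h*w, c)
        # sequence; a real implementation also builds the padding mask and
        # positional embedding from batch_data_samples here.
        feat = self.input_proj(img_feats[-1])
        return dict(feat=feat.flatten(2).permute(0, 2, 1))

    def forward_transformer(self, feat, query_embed):
        # Placeholder for encoder/decoder processing; returns "decoder
        # outputs" of shape (num_decoder_layers, bs, num_query, c) for the
        # bbox_head.
        query = query_embed.unsqueeze(0).expand(feat.size(0), -1, -1)
        return query.unsqueeze(0)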