[Feature] FCOS3D BBox Coder (#940)

* FCOS3D BBox Coder * Add unit tests * Change the value from long to float/double * Rename bbox_out as bbox * Add comments to forward returns
open-mmlab · Sep 15, 2021 · 911a333 · 911a333
1 parent 0b26a9a
commit 911a333
Show file tree

Hide file tree

Showing 5 changed files with 232 additions and 34 deletions.
diff --git a/configs/_base_/models/fcos3d.py b/configs/_base_/models/fcos3d.py
@@ -55,6 +55,7 @@
  type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
  loss_centerness=dict(
  type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
  norm_on_bbox=True,
  centerness_on_reg=True,
  center_sampling=True,

diff --git a/mmdet3d/core/bbox/coders/__init__.py b/mmdet3d/core/bbox/coders/__init__.py
@@ -3,12 +3,13 @@
 from .anchor_free_bbox_coder import AnchorFreeBBoxCoder
 from .centerpoint_bbox_coders import CenterPointBBoxCoder
 from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
+from .fcos3d_bbox_coder import FCOS3DBBoxCoder
 from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder
 from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
 from .point_xyzwhlr_bbox_coder import PointXYZWHLRBBoxCoder
 
 __all__ = [
  'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder',
  'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder',
- 'PointXYZWHLRBBoxCoder'
+ 'PointXYZWHLRBBoxCoder', 'FCOS3DBBoxCoder'
 ]
diff --git a/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py b/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py
@@ -0,0 +1,126 @@
+import numpy as np
+import torch
+
+from mmdet.core.bbox import BaseBBoxCoder
+from mmdet.core.bbox.builder import BBOX_CODERS
+from ..structures import limit_period
+
+
+@BBOX_CODERS.register_module()
+class FCOS3DBBoxCoder(BaseBBoxCoder):
+ """Bounding box coder for FCOS3D.
+
+ Args:
+ base_depths (tuple[tuple[float]]): Depth references for decode box
+ depth. Defaults to None.
+ base_dims (tuple[tuple[float]]): Dimension references for decode box
+ dimension. Defaults to None.
+ code_size (int): The dimension of boxes to be encoded. Defaults to 7.
+ norm_on_bbox (bool): Whether to apply normalization on the bounding
+ box 2D attributes. Defaults to True.
+ """
+
+ def __init__(self,
+ base_depths=None,
+ base_dims=None,
+ code_size=7,
+ norm_on_bbox=True):
+ super(FCOS3DBBoxCoder, self).__init__()
+ self.base_depths = base_depths
+ self.base_dims = base_dims
+ self.bbox_code_size = code_size
+ self.norm_on_bbox = norm_on_bbox
+
+ def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels):
+ # TODO: refactor the encoder in the FCOS3D and PGD head
+ pass
+
+ def decode(self, bbox, scale, stride, training, cls_score=None):
+ """Decode regressed results into 3D predictions.
+
+ Note that offsets are not transformed to the projected 3D centers.
+
+ Args:
+ bbox (torch.Tensor): Raw bounding box predictions in shape
+ [N, C, H, W].
+ scale (tuple[`Scale`]): Learnable scale parameters.
+ stride (tuple[int]): Stride for a specific feature level.
+ training (bool): Whether the decoding is in the training
+ procedure.
+ cls_score (torch.Tensor): Classification score map for deciding
+ which base depth or dim is used. Defaults to None.
+
+ Returns:
+ torch.Tensor: Decoded boxes.
+ """
+ # scale the bbox of different level
+ # only apply to offset, depth and size prediction
+ scale_offset, scale_depth, scale_size = scale[0:3]
+
+ clone_bbox = bbox.clone()
+ bbox[:, :2] = scale_offset(clone_bbox[:, :2]).float()
+ bbox[:, 2] = scale_depth(clone_bbox[:, 2]).float()
+ bbox[:, 3:6] = scale_size(clone_bbox[:, 3:6]).float()
+
+ if self.base_depths is None:
+ bbox[:, 2] = bbox[:, 2].exp()
+ elif len(self.base_depths) == 1: # only single prior
+ mean = self.base_depths[0][0]
+ std = self.base_depths[0][1]
+ bbox[:, 2] = mean + bbox.clone()[:, 2] * std
+ else: # multi-class priors
+ assert len(self.base_depths) == cls_score.shape[1], \
+ 'The number of multi-class depth priors should be equal to ' \
+ 'the number of categories.'
+ indices = cls_score.max(dim=1)[1]
+ depth_priors = cls_score.new_tensor(
+ self.base_depths)[indices, :].permute(0, 3, 1, 2)
+ mean = depth_priors[:, 0]
+ std = depth_priors[:, 1]
+ bbox[:, 2] = mean + bbox.clone()[:, 2] * std
+
+ bbox[:, 3:6] = bbox[:, 3:6].exp()
+ if self.base_dims is not None:
+ assert len(self.base_dims) == cls_score.shape[1], \
+ 'The number of anchor sizes should be equal to the number ' \
+ 'of categories.'
+ indices = cls_score.max(dim=1)[1]
+ size_priors = cls_score.new_tensor(
+ self.base_dims)[indices, :].permute(0, 3, 1, 2)
+ bbox[:, 3:6] = size_priors * bbox.clone()[:, 3:6]
+
+ assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\
+ 'has not been thoroughly tested for FCOS3D.'
+ if self.norm_on_bbox:
+ if not training:
+ # Note that this line is conducted only when testing
+ bbox[:, :2] *= stride
+
+ return bbox
+
+ @staticmethod
+ def decode_yaw(bbox, centers2d, dir_cls, dir_offset, cam2img):
+ """Decode yaw angle and change it from local to global.i.
+
+ Args:
+ bbox (torch.Tensor): Bounding box predictions in shape
+ [N, C] with yaws to be decoded.
+ centers2d (torch.Tensor): Projected 3D-center on the image planes
+ corresponding to the box predictions.
+ dir_cls (torch.Tensor): Predicted direction classes.
+ dir_offset (float): Direction offset before dividing all the
+ directions into several classes.
+ cam2img (torch.Tensor): Camera intrinsic matrix in shape [4, 4].
+
+ Returns:
+ torch.Tensor: Bounding boxes with decoded yaws.
+ """
+ if bbox.shape[0] > 0:
+ dir_rot = limit_period(bbox[..., 6] - dir_offset, 0, np.pi)
+ bbox[..., 6] = \
+ dir_rot + dir_offset + np.pi * dir_cls.to(bbox.dtype)
+
+ bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2],
+ cam2img[0, 0]) + bbox[:, 6]
+
+ return bbox
diff --git a/mmdet3d/models/dense_heads/fcos_mono3d_head.py b/mmdet3d/models/dense_heads/fcos_mono3d_head.py
@@ -9,6 +9,7 @@
 from mmdet3d.core import (box3d_multiclass_nms, limit_period, points_img2cam,
  xywhr2xyxyr)
 from mmdet.core import multi_apply
+from mmdet.core.bbox.builder import build_bbox_coder
 from mmdet.models.builder import HEADS, build_loss
 from .anchor_free_mono3d_head import AnchorFreeMono3DHead
 
@@ -75,6 +76,7 @@ def __init__(self,
  type='CrossEntropyLoss',
  use_sigmoid=True,
  loss_weight=1.0),
+ bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9),
  norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
  centerness_branch=(64, ),
  init_cfg=None,
@@ -97,6 +99,8 @@ def __init__(self,
  init_cfg=init_cfg,
  **kwargs)
  self.loss_centerness = build_loss(loss_centerness)
+ bbox_coder['code_size'] = self.bbox_code_size
+ self.bbox_coder = build_bbox_coder(bbox_coder)
  if init_cfg is None:
  self.init_cfg = dict(
  type='Normal',
@@ -112,9 +116,11 @@ def _init_layers(self):
  conv_channels=self.centerness_branch,
  conv_strides=(1, ) * len(self.centerness_branch))
  self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1)
+ self.scale_dim = 3 # only for offset, depth and size regression
  self.scales = nn.ModuleList([
- nn.ModuleList([Scale(1.0) for _ in range(3)]) for _ in self.strides
- ]) # only for offset, depth and size regression
+ nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)])
+ for _ in self.strides
+ ])
 
  def forward(self, feats):
  """Forward features from the upstream network.
@@ -140,8 +146,9 @@ def forward(self, feats):
  centernesses (list[Tensor]): Centerness for each scale level,
  each is a 4D-tensor, the channel number is num_points * 1.
  """
+ # Note: we use [:5] to filter feats and only return predictions
  return multi_apply(self.forward_single, feats, self.scales,
- self.strides)
+ self.strides)[:5]
 
  def forward_single(self, x, scale, stride):
  """Forward features of a single scale levle.
@@ -171,26 +178,12 @@ def forward_single(self, x, scale, stride):
  for conv_centerness_prev_layer in self.conv_centerness_prev:
  clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat)
  centerness = self.conv_centerness(clone_cls_feat)
- # scale the bbox_pred of different level
- # only apply to offset, depth and size prediction
- scale_offset, scale_depth, scale_size = scale[0:3]
 
- clone_bbox_pred = bbox_pred.clone()
- bbox_pred[:, :2] = scale_offset(clone_bbox_pred[:, :2]).float()
- bbox_pred[:, 2] = scale_depth(clone_bbox_pred[:, 2]).float()
- bbox_pred[:, 3:6] = scale_size(clone_bbox_pred[:, 3:6]).float()
+ bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride,
+ self.training, cls_score)
 
- bbox_pred[:, 2] = bbox_pred[:, 2].exp()
- bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6 # avoid size=0
-
- assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\
- 'has not been thoroughly tested for FCOS3D.'
- if self.norm_on_bbox:
- if not self.training:
- # Note that this line is conducted only when testing
- bbox_pred[:, :2] *= stride
-
- return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness
+ return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \
+ cls_feat, reg_feat
 
  @staticmethod
  def add_sin_difference(boxes1, boxes2):
@@ -654,19 +647,13 @@ def _get_bboxes_single(self,
  mlvl_dir_scores = torch.cat(mlvl_dir_scores)
 
  # change local yaw to global yaw for 3D nms
- if mlvl_bboxes.shape[0] > 0:
- dir_rot = limit_period(mlvl_bboxes[..., 6] - self.dir_offset, 0,
- np.pi)
- mlvl_bboxes[..., 6] = (
- dir_rot + self.dir_offset +
- np.pi * mlvl_dir_scores.to(mlvl_bboxes.dtype))
-
- cam_intrinsic = mlvl_centers2d.new_zeros((4, 4))
- cam_intrinsic[:view.shape[0], :view.shape[1]] = \
+ cam2img = mlvl_centers2d.new_zeros((4, 4))
+ cam2img[:view.shape[0], :view.shape[1]] = \
  mlvl_centers2d.new_tensor(view)
- mlvl_bboxes[:, 6] = torch.atan2(
- mlvl_centers2d[:, 0] - cam_intrinsic[0, 2],
- cam_intrinsic[0, 0]) + mlvl_bboxes[:, 6]
+ mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d,
+ mlvl_dir_scores,
+ self.dir_offset, cam2img)
+
  mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
  mlvl_bboxes, box_dim=self.bbox_code_size,
  origin=(0.5, 0.5, 0.5)).bev)

diff --git a/tests/test_utils/test_bbox_coders.py b/tests/test_utils/test_bbox_coders.py
@@ -1,5 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch
+from mmcv.cnn import Scale
+from torch import nn as nn
 
 from mmdet3d.core.bbox import DepthInstance3DBoxes, LiDARInstance3DBoxes
 from mmdet.core import build_bbox_coder
@@ -382,3 +384,84 @@ def test_point_xyzwhlr_bbox_coder():
  # test decode
  bbox3d_out = boxcoder.decode(bbox_target, points, gt_labels_3d)
  assert torch.allclose(bbox3d_out, gt_bboxes_3d, atol=1e-4)
+
+
+def test_fcos3d_bbox_coder():
+ # test a config without priors
+ bbox_coder_cfg = dict(
+ type='FCOS3DBBoxCoder',
+ base_depths=None,
+ base_dims=None,
+ code_size=7,
+ norm_on_bbox=True)
+ bbox_coder = build_bbox_coder(bbox_coder_cfg)
+
+ # test decode
+ # [2, 7, 1, 1]
+ batch_bbox_out = torch.tensor([[[[0.3130]], [[0.7094]], [[0.8743]],
+ [[0.0570]], [[0.5579]], [[0.1593]],
+ [[0.4553]]],
+ [[[0.7758]], [[0.2298]], [[0.3925]],
+ [[0.6307]], [[0.4377]], [[0.3339]],
+ [[0.1966]]]])
+ batch_scale = nn.ModuleList([Scale(1.0) for _ in range(3)])
+ stride = 2
+ training = False
+ cls_score = torch.randn([2, 2, 1, 1]).sigmoid()
+ decode_bbox_out = bbox_coder.decode(batch_bbox_out, batch_scale, stride,
+ training, cls_score)
+
+ expected_bbox_out = torch.tensor([[[[0.6261]], [[1.4188]], [[2.3971]],
+ [[1.0586]], [[1.7470]], [[1.1727]],
+ [[0.4553]]],
+ [[[1.5516]], [[0.4596]], [[1.4806]],
+ [[1.8790]], [[1.5492]], [[1.3965]],
+ [[0.1966]]]])
+ assert torch.allclose(decode_bbox_out, expected_bbox_out, atol=1e-3)
+
+ # test a config with priors
+ prior_bbox_coder_cfg = dict(
+ type='FCOS3DBBoxCoder',
+ base_depths=((28., 13.), (25., 12.)),
+ base_dims=((2., 3., 1.), (1., 2., 3.)),
+ code_size=7,
+ norm_on_bbox=True)
+ prior_bbox_coder = build_bbox_coder(prior_bbox_coder_cfg)
+
+ # test decode
+ batch_bbox_out = torch.tensor([[[[0.3130]], [[0.7094]], [[0.8743]],
+ [[0.0570]], [[0.5579]], [[0.1593]],
+ [[0.4553]]],
+ [[[0.7758]], [[0.2298]], [[0.3925]],
+ [[0.6307]], [[0.4377]], [[0.3339]],
+ [[0.1966]]]])
+ batch_scale = nn.ModuleList([Scale(1.0) for _ in range(3)])
+ stride = 2
+ training = False
+ cls_score = torch.tensor([[[[0.5811]], [[0.6198]]], [[[0.4889]],
+ [[0.8142]]]])
+ decode_bbox_out = prior_bbox_coder.decode(batch_bbox_out, batch_scale,
+ stride, training, cls_score)
+ expected_bbox_out = torch.tensor([[[[0.6260]], [[1.4188]], [[35.4916]],
+ [[1.0587]], [[3.4940]], [[3.5181]],
+ [[0.4553]]],
+ [[[1.5516]], [[0.4596]], [[29.7100]],
+ [[1.8789]], [[3.0983]], [[4.1892]],
+ [[0.1966]]]])
+ assert torch.allclose(decode_bbox_out, expected_bbox_out, atol=1e-3)
+
+ # test decode_yaw
+ decode_bbox_out = decode_bbox_out.permute(0, 2, 3, 1).view(-1, 7)
+ batch_centers2d = torch.tensor([[100., 150.], [200., 100.]])
+ batch_dir_cls = torch.tensor([0., 1.])
+ dir_offset = 0.7854
+ cam2img = torch.tensor([[700., 0., 450., 0.], [0., 700., 200., 0.],
+ [0., 0., 1., 0.], [0., 0., 0., 1.]])
+ decode_bbox_out = prior_bbox_coder.decode_yaw(decode_bbox_out,
+ batch_centers2d,
+ batch_dir_cls, dir_offset,
+ cam2img)
+ expected_bbox_out = torch.tensor(
+ [[0.6260, 1.4188, 35.4916, 1.0587, 3.4940, 3.5181, 3.1332],
+ [1.5516, 0.4596, 29.7100, 1.8789, 3.0983, 4.1892, 6.1368]])
+ assert torch.allclose(decode_bbox_out, expected_bbox_out, atol=1e-3)