open-mmlab · ZwwWayne · Mar 24, 2021 · Nov 25, 2020 · Dec 9, 2020 · Dec 24, 2020
diff --git a/configs/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py b/configs/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py
@@ -0,0 +1,166 @@
+_base_ = [  # dataset and runtime settings are inherited from these files
+ '../_base_/datasets/sunrgbd-3d-10class.py', '../_base_/default_runtime.py'
+]
+
+model = dict(  # every component configured below is an img_* module
+ type='ImVoteNet',
+ img_backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='caffe'),
+ img_neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],  # 4 entries, like out_indices
+ out_channels=256,
+ num_outs=5),
+ img_rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,  # equals img_neck out_channels
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),  # 5 levels, like num_outs
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[.0, .0, .0, .0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ img_roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),  # first 4 of the RPN strides
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=10,  # base dataset config is the 10-class one
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0., 0., 0., 0.],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+
+ # model training and testing settings
+ train_cfg=dict(
+ img_rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ img_rpn_proposal=dict(
+ nms_across_levels=False,
+ nms_pre=2000,  # test_cfg.img_rpn uses 1000
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ img_rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ img_rpn=dict(
+ nms_across_levels=False,
+ nms_pre=1000,
+ nms_post=1000,
+ max_num=1000,
+ nms_thr=0.7,
+ min_bbox_size=0),
+ img_rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)))
+
+# caffe img_norm (unit std, to_rgb=False) to match the style='caffe' backbone
+img_norm_cfg = dict(
+ mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
+
+train_pipeline = [  # image-only pipeline: no point-cloud loading steps
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', with_bbox=True),
+ dict(
+ type='Resize',
+ img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
+ (1333, 576), (1333, 600)],
+ multiscale_mode='value',  # six candidate scales are listed above
+ keep_ratio=True),
+ dict(type='RandomFlip', flip_ratio=0.5),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [  # image-only, one img_scale, flip=False
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(1333, 600),  # largest of the training scales
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size_divisor=32),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+
+data = dict(  # dataset settings not listed here come from _base_
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(times=1, dataset=dict(pipeline=train_pipeline)),
+ val=dict(pipeline=test_pipeline),  # val and test share one pipeline
+ test=dict(pipeline=test_pipeline))
+
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[6])  # one milestone (6), below total_epochs (8)
+total_epochs = 8
+
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
diff --git a/mmdet3d/core/bbox/structures/coord_3d_mode.py b/mmdet3d/core/bbox/structures/coord_3d_mode.py
@@ -182,7 +182,7 @@ def convert_point(point, src, dst, rt_mat=None):
  """Convert points from `src` mode to `dst` mode.
 
  Args:
- box (tuple | list | np.dnarray |
+ point (tuple | list | np.ndarray |
  torch.Tensor | BasePoints):
  Can be a k-tuple, k-list or an Nxk array/tensor.
  src (:obj:`CoordMode`): The src Point mode.
@@ -218,17 +218,25 @@ def convert_point(point, src, dst, rt_mat=None):
  arr = point.clone()
 
  # convert point from `src` mode to `dst` mode.
- if rt_mat is not None:
- if not isinstance(rt_mat, torch.Tensor):
- rt_mat = arr.new_tensor(rt_mat)
+ # TODO: LIDAR branches ignore rt_mat (fixed matrices only); a provided
+ # Rt matrix (torch.Tensor) is only applied in cam-depth conversion
  if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
  rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
  elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
  rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
  elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
- rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+ else:
+ rt_mat = rt_mat.new_tensor(
+ [[1, 0, 0], [0, 0, -1], [0, 1, 0]]) @ \
+ rt_mat.transpose(1, 0)
  elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
- rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
+ if rt_mat is None:
+ rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
+ else:
+ rt_mat = rt_mat @ rt_mat.new_tensor([[1, 0, 0], [0, 0, 1],
+ [0, -1, 0]])
  elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
  rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
  elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
@@ -245,7 +253,7 @@ def convert_point(point, src, dst, rt_mat=None):
  else:
  xyz = arr[:, :3] @ rt_mat.t()
 
- remains = arr[..., 3:]
+ remains = arr[:, 3:]
  arr = torch.cat([xyz[:, :3], remains], dim=-1)
 
  # convert arr to the original type

diff --git a/mmdet3d/core/bbox/structures/utils.py b/mmdet3d/core/bbox/structures/utils.py
@@ -122,7 +122,20 @@ def points_cam2img(points_3d, proj_mat):
  torch.Tensor: Points in image coordinates with shape [N, 2].
  """
  points_num = list(points_3d.shape)[:-1]
+
  points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
+ assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
+ f' matrix should be 2 instead of {len(proj_mat.shape)}.'
+ d1, d2 = proj_mat.shape[:2]
+ assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
+ d1 == 4 and d2 == 4), f'The shape of the projection matrix'\
+ f' ({d1}*{d2}) is not supported.'
+ if d1 == 3:
+ proj_mat_expanded = torch.eye(
+ 4, device=proj_mat.device, dtype=proj_mat.dtype)
+ proj_mat_expanded[:d1, :d2] = proj_mat
+ proj_mat = proj_mat_expanded
+
  # previous implementation use new_zeros, new_one yeilds better results
  points_4 = torch.cat(
  [points_3d, points_3d.new_ones(*points_shape)], dim=-1)

diff --git a/mmdet3d/core/points/base_points.py b/mmdet3d/core/points/base_points.py
@@ -10,14 +10,14 @@ class BasePoints(object):
  tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
  points_dim (int): Number of the dimension of a point.
  Each row is (x, y, z). Default to 3.
- attribute_dims (dict): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
 
  Attributes:
  tensor (torch.Tensor): Float matrix of N x points_dim.
  points_dim (int): Integer indicating the dimension of a point.
  Each row is (x, y, z, ...).
- attribute_dims (bool): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
  rotation_axis (int): Default rotation axis for points rotation.
  """

diff --git a/mmdet3d/core/points/cam_points.py b/mmdet3d/core/points/cam_points.py
@@ -8,14 +8,14 @@ class CameraPoints(BasePoints):
  tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
  points_dim (int): Number of the dimension of a point.
  Each row is (x, y, z). Default to 3.
- attribute_dims (dict): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
 
  Attributes:
  tensor (torch.Tensor): Float matrix of N x points_dim.
  points_dim (int): Integer indicating the dimension of a point.
  Each row is (x, y, z, ...).
- attribute_dims (bool): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
  rotation_axis (int): Default rotation axis for points rotation.
  """

diff --git a/mmdet3d/core/points/depth_points.py b/mmdet3d/core/points/depth_points.py
@@ -8,14 +8,14 @@ class DepthPoints(BasePoints):
  tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
  points_dim (int): Number of the dimension of a point.
  Each row is (x, y, z). Default to 3.
- attribute_dims (dict): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
 
  Attributes:
  tensor (torch.Tensor): Float matrix of N x points_dim.
  points_dim (int): Integer indicating the dimension of a point.
  Each row is (x, y, z, ...).
- attribute_dims (bool): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
  rotation_axis (int): Default rotation axis for points rotation.
  """

diff --git a/mmdet3d/core/points/lidar_points.py b/mmdet3d/core/points/lidar_points.py
@@ -8,14 +8,14 @@ class LiDARPoints(BasePoints):
  tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
  points_dim (int): Number of the dimension of a point.
  Each row is (x, y, z). Default to 3.
- attribute_dims (dict): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
 
  Attributes:
  tensor (torch.Tensor): Float matrix of N x points_dim.
  points_dim (int): Integer indicating the dimension of a point.
  Each row is (x, y, z, ...).
- attribute_dims (bool): Dictinory to indicate the meaning of extra
+ attribute_dims (dict): Dictionary to indicate the meaning of extra
  dimension. Default to None.
  rotation_axis (int): Default rotation axis for points rotation.
  """

diff --git a/mmdet3d/datasets/pipelines/transforms_3d.py b/mmdet3d/datasets/pipelines/transforms_3d.py
@@ -96,10 +96,15 @@ def __call__(self, input_dict):
  ) < self.flip_ratio_bev_vertical else False
  input_dict['pcd_vertical_flip'] = flip_vertical
 
+ if 'transformation_3d_flow' not in input_dict:
+ input_dict['transformation_3d_flow'] = []
+
  if input_dict['pcd_horizontal_flip']:
  self.random_flip_data_3d(input_dict, 'horizontal')
+ input_dict['transformation_3d_flow'].extend(['HF'])
  if input_dict['pcd_vertical_flip']:
  self.random_flip_data_3d(input_dict, 'vertical')
+ input_dict['transformation_3d_flow'].extend(['VF'])
  return input_dict
 
  def __repr__(self):
@@ -405,13 +410,18 @@ def __call__(self, input_dict):
  'pcd_scale_factor', 'pcd_trans' and keys in \
  input_dict['bbox3d_fields'] are updated in the result dict.
  """
+ if 'transformation_3d_flow' not in input_dict:
+ input_dict['transformation_3d_flow'] = []
+
  self._rot_bbox_points(input_dict)
 
  if 'pcd_scale_factor' not in input_dict:
  self._random_scale(input_dict)
  self._scale_bbox_points(input_dict)
 
  self._trans_bbox_points(input_dict)
+
+ input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
  return input_dict
 
  def __repr__(self):