From ce2065cc610dcf3852d2914268e788c9946b2f7e Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Tue, 13 Apr 2021 21:24:19 +0800 Subject: [PATCH 01/12] Support base mono3d dense head and anchor free mono3d head --- mmdet3d/models/dense_heads/__init__.py | 5 +- .../dense_heads/anchor_free_mono3d_head.py | 510 ++++++++++++++++++ .../dense_heads/base_mono3d_dense_head.py | 76 +++ 3 files changed, 590 insertions(+), 1 deletion(-) create mode 100644 mmdet3d/models/dense_heads/anchor_free_mono3d_head.py create mode 100644 mmdet3d/models/dense_heads/base_mono3d_dense_head.py diff --git a/mmdet3d/models/dense_heads/__init__.py b/mmdet3d/models/dense_heads/__init__.py index f31b256ccf..8566be7533 100644 --- a/mmdet3d/models/dense_heads/__init__.py +++ b/mmdet3d/models/dense_heads/__init__.py @@ -1,5 +1,7 @@ from .anchor3d_head import Anchor3DHead +from .anchor_free_mono3d_head import AnchorFreeMono3DHead from .base_conv_bbox_head import BaseConvBboxHead +from .base_mono3d_dense_head import BaseMono3DDenseHead from .centerpoint_head import CenterHead from .free_anchor3d_head import FreeAnchor3DHead from .parta2_rpn_head import PartA2RPNHead @@ -9,5 +11,6 @@ __all__ = [ 'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead', - 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead' + 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead', + 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead' ] diff --git a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py new file mode 100644 index 0000000000..cc78c42a4b --- /dev/null +++ b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py @@ -0,0 +1,510 @@ +import torch +from abc import abstractmethod +from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init +from torch import nn as nn + +from mmdet.core import force_fp32, multi_apply +from mmdet.models.builder import HEADS, build_loss +from .base_mono3d_dense_head import BaseMono3DDenseHead + + +@HEADS.register_module() +class AnchorFreeMono3DHead(BaseMono3DDenseHead): + """Anchor-free head for monocular 3D object detection. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + stacked_convs (int): Number of stacking convs of the head. + strides (tuple): Downsample factor of each feature map. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Default: False. + conv_bias (bool | str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Default: "auto". + background_label (int | None): Label ID of background, set as 0 for + RPN and num_classes for other heads. It will automatically set as + num_classes if None is given. + use_direction_classifier (bool): Whether to add a direction classifier. + diff_rad_by_sin (bool): Whether to change the difference into sin + difference for box regression loss. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + loss_dir (dict): Config of direction classifier loss. + loss_attr (dict): Config of attribute classifier loss, which is only + active when pred_attrs=True. + bbox_code_size (int): Dimensions of predicted bounding boxes. + pred_attrs (bool): Whether to predict attributes. Default to False. + num_attrs (int): The number of attributes to be predicted. Default: 9. 
+        pred_velo (bool): Whether to predict velocity. Default to False.
+        group_reg_dims (tuple[int]): The dimension of each regression target
+            group. Default: (2, 1, 3, 1, 2).
+        cls_branch (tuple[int]): Channels for classification branch.
+            Default: (128, 64).
+        reg_branch (tuple[tuple]): Channels for regression branch.
+            Default: (
+                (128, 64),  # offset
+                (128, 64),  # depth
+                (64, ),  # size
+                (64, ),  # rot
+                ()  # velo
+            ),
+        dir_branch (tuple[int]): Channels for direction classification branch.
+            Default: (64, ).
+        attr_branch (tuple[int]): Channels for attribute classification branch.
+            Default: (64, ).
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        train_cfg (dict): Training config of anchor head.
+        test_cfg (dict): Testing config of anchor head.
+    """  # noqa: W605
+
+    _version = 1
+
+    def __init__(
+            self,
+            num_classes,
+            in_channels,
+            feat_channels=256,
+            stacked_convs=4,
+            strides=(4, 8, 16, 32, 64),
+            dcn_on_last_conv=False,
+            conv_bias='auto',
+            background_label=None,
+            use_direction_classifier=True,
+            diff_rad_by_sin=True,
+            dir_offset=0,
+            loss_cls=dict(
+                type='FocalLoss',
+                use_sigmoid=True,
+                gamma=2.0,
+                alpha=0.25,
+                loss_weight=1.0),
+            loss_bbox=dict(
+                type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+            loss_dir=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_attr=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            bbox_code_size=9,  # For nuscenes
+            pred_attrs=False,
+            num_attrs=9,  # For nuscenes
+            pred_velo=False,
+            group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
+            cls_branch=(128, 64),
+            reg_branch=(
+                (128, 64),  # offset
+                (128, 64),  # depth
+                (64, ),  # size
+                (64, ),  # rot
+                ()  # velo
+            ),
+            dir_branch=(64, ),
+            attr_branch=(64, ),
+            conv_cfg=None,
+            norm_cfg=None,
+            train_cfg=None,
+            test_cfg=None):
+        super(AnchorFreeMono3DHead, self).__init__()
+        self.num_classes = num_classes
+        self.cls_out_channels = num_classes
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.stacked_convs = stacked_convs
+        self.strides = strides
+        self.dcn_on_last_conv = dcn_on_last_conv
+        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
+        self.conv_bias = conv_bias
+        self.use_direction_classifier = use_direction_classifier
+        self.diff_rad_by_sin = diff_rad_by_sin
+        self.dir_offset = dir_offset
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+        self.loss_dir = build_loss(loss_dir)
+        self.bbox_code_size = bbox_code_size
+        self.group_reg_dims = list(group_reg_dims)
+        self.cls_branch = cls_branch
+        self.reg_branch = reg_branch
+        assert len(reg_branch) == len(group_reg_dims), 'The number of '\
+            'elements in reg_branch and group_reg_dims should be the same.'
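+        # For illustration: with the default group_reg_dims=(2, 1, 3, 1, 2)
+        # and reg_branch above, offset and depth each get a (128, 64) conv
+        # tower, size and rot each get a single 64-channel conv, and velo
+        # (the empty tuple) is predicted directly from the shared
+        # regression features.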
+ self.pred_velo = pred_velo + self.out_channels = [] + for reg_branch_channels in reg_branch: + if len(reg_branch_channels) > 0: + self.out_channels.append(reg_branch_channels[-1]) + else: + self.out_channels.append(-1) + self.dir_branch = dir_branch + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + self.background_label = ( + num_classes if background_label is None else background_label) + # background_label should be either 0 or num_classes + assert (self.background_label == 0 + or self.background_label == num_classes) + self.pred_attrs = pred_attrs + self.attr_background_label = -1 + self.num_attrs = num_attrs + if self.pred_attrs: + self.attr_background_label = num_attrs + self.loss_attr = build_loss(loss_attr) + self.attr_branch = attr_branch + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + + def _init_cls_convs(self): + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_reg_convs(self): + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_branch(self, conv_channels=(64), conv_strides=(1)): + """Initialize conv layers as a prediction branch.""" + conv_before_pred = nn.ModuleList() + if isinstance(conv_channels, int): + conv_channels = [self.feat_channels] + [conv_channels] + conv_strides = [conv_strides] + else: + conv_channels = [self.feat_channels] + list(conv_channels) + conv_strides = list(conv_strides) + for i in range(len(conv_strides)): + conv_before_pred.append( + ConvModule( + conv_channels[i], + conv_channels[i + 1], + 3, + stride=conv_strides[i], + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + return conv_before_pred + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls_prev = self._init_branch( + conv_channels=self.cls_branch, + conv_strides=(1, ) * len(self.cls_branch)) + self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, + 1) + self.conv_reg_prevs = nn.ModuleList() + self.conv_regs = nn.ModuleList() + for i in range(len(self.group_reg_dims)): + reg_dim = self.group_reg_dims[i] + reg_branch_channels = self.reg_branch[i] + out_channel = self.out_channels[i] + if len(reg_branch_channels) > 0: + self.conv_reg_prevs.append( + self._init_branch( + conv_channels=reg_branch_channels, + conv_strides=(1, ) * len(reg_branch_channels))) + self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1)) + else: + self.conv_reg_prevs.append(None) + self.conv_regs.append( + nn.Conv2d(self.feat_channels, 
reg_dim, 1))
+        if self.use_direction_classifier:
+            self.conv_dir_cls_prev = self._init_branch(
+                conv_channels=self.dir_branch,
+                conv_strides=(1, ) * len(self.dir_branch))
+            self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1)
+        if self.pred_attrs:
+            self.conv_attr_prev = self._init_branch(
+                conv_channels=self.attr_branch,
+                conv_strides=(1, ) * len(self.attr_branch))
+            self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1)
+
+    def init_weights(self):
+        """Initialize weights of the head."""
+        for m in self.cls_convs:
+            if isinstance(m.conv, nn.Conv2d):
+                normal_init(m.conv, std=0.01)
+        for m in self.reg_convs:
+            if isinstance(m.conv, nn.Conv2d):
+                normal_init(m.conv, std=0.01)
+        for m in self.conv_cls_prev:
+            if isinstance(m.conv, nn.Conv2d):
+                normal_init(m.conv, std=0.01)
+        for conv_reg_prev in self.conv_reg_prevs:
+            if conv_reg_prev is None:
+                continue
+            for m in conv_reg_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        if self.use_direction_classifier:
+            for m in self.conv_dir_cls_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        if self.pred_attrs:
+            for m in self.conv_attr_prev:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.conv_cls, std=0.01, bias=bias_cls)
+        for conv_reg in self.conv_regs:
+            normal_init(conv_reg, std=0.01)
+        if self.use_direction_classifier:
+            normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
+        if self.pred_attrs:
+            normal_init(self.conv_attr, std=0.01, bias=bias_cls)
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple: Usually contains classification scores, bbox predictions, \
+                and direction class predictions.
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * bbox_code_size.
+                dir_cls_preds (list[Tensor]): Box scores for direction class
+                    predictions on each scale level, each is a 4D-tensor,
+                    the channel number is num_points * 2. (bin = 2)
+                attr_preds (list[Tensor]): Attribute scores for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * num_attrs.
+        """
+        return multi_apply(self.forward_single, feats)[:5]
+
+    def forward_single(self, x):
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+
+        Returns:
+            tuple: Scores for each class, bbox predictions, direction class
+                and attributes, plus the features after the classification
+                and regression conv layers; some models, such as FCOS,
+                need these features.
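+
+        Example (an illustrative sketch; ``head`` denotes an already built
+            instance and ``x`` a single-level FPN feature map):
+            >>> outs = head.forward_single(x)
+            >>> len(outs)  # cls, bbox, dir, attr, cls_feat, reg_feat
+            6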
+ """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + clone_cls_feat = cls_feat.clone() + for conv_cls_prev_layer in self.conv_cls_prev: + clone_cls_feat = conv_cls_prev_layer(clone_cls_feat) + cls_score = self.conv_cls(clone_cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = [] + for i in range(len(self.group_reg_dims)): + clone_reg_feat = reg_feat.clone() + if len(self.reg_branch[i]) > 0: + for conv_reg_prev_layer in self.conv_reg_prevs[i]: + clone_reg_feat = conv_reg_prev_layer(clone_reg_feat) + bbox_pred.append(self.conv_regs[i](clone_reg_feat)) + bbox_pred = torch.cat(bbox_pred, dim=1) + + dir_cls_pred = None + if self.use_direction_classifier: + clone_reg_feat = reg_feat.clone() + for conv_dir_cls_prev_layer in self.conv_dir_cls_prev: + clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat) + dir_cls_pred = self.conv_dir_cls(clone_reg_feat) + + attr_pred = None + if self.pred_attrs: + clone_cls_feat = cls_feat.clone() + for conv_attr_prev_layer in self.conv_attr_prev: + clone_cls_feat = conv_attr_prev_layer(clone_cls_feat) + attr_pred = self.conv_attr(clone_cls_feat) + + return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \ + reg_feat + + @abstractmethod + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + attr_preds (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): 3D Ground truth bboxes for each + image with shape (num_gts, bbox_code_size). + gt_labels_3d (list[Tensor]): 3D class indices of each box. + centers2d (list[Tensor]): Projected 3D centers onto 2D images. + depths (list[Tensor]): Depth of projected centers on 2D images. + attr_labels (list[Tensor], optional): Attribute indices + corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + """ + + raise NotImplementedError + + @abstractmethod + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + img_metas, + cfg=None, + rescale=None): + """Transform network output for a batch into bbox predictions. 
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                Has shape (N, num_points * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_points * bbox_code_size, H, W)
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * 2. (bin = 2)
+            attr_preds (list[Tensor]): Attribute scores for each scale level.
+                Has shape (N, num_points * num_attrs, H, W)
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used
+            rescale (bool): If True, return boxes in original image space
+        """
+
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_targets(self, points, gt_bboxes_list, gt_labels_list,
+                    gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
+                    depths_list, attr_labels_list):
+        """Compute regression, classification and centerness targets for
+        points in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+            gt_labels_list (list[Tensor]): Ground truth labels of each box,
+                each has shape (num_gt,).
+            gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
+                image, each has shape (num_gt, bbox_code_size).
+            gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
+                box, each has shape (num_gt,).
+            centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
+                each has shape (num_gt, 2).
+            depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
+                image, each has shape (num_gt, 1).
+            attr_labels_list (list[Tensor]): Attribute labels of each box,
+                each has shape (num_gt,).
+        """
+        raise NotImplementedError
+
+    def _get_points_single(self,
+                           featmap_size,
+                           stride,
+                           dtype,
+                           device,
+                           flatten=False):
+        """Get points of a single scale level."""
+        h, w = featmap_size
+        x_range = torch.arange(w, dtype=dtype, device=device)
+        y_range = torch.arange(h, dtype=dtype, device=device)
+        y, x = torch.meshgrid(y_range, x_range)
+        if flatten:
+            y = y.flatten()
+            x = x.flatten()
+        return y, x
+
+    def get_points(self, featmap_sizes, dtype, device, flatten=False):
+        """Get points according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            dtype (torch.dtype): Type of points.
+            device (torch.device): Device of points.
+
+        Returns:
+            list[tuple]: Points of each scale level.
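+
+        Example (an illustrative sketch; ``head`` denotes a built instance):
+            >>> # two levels with 2x2 and 1x1 feature maps
+            >>> pts = head.get_points([(2, 2), (1, 1)], torch.float32, 'cpu')
+            >>> len(pts)  # one (y, x) grid per level
+            2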
+ """ + mlvl_points = [] + for i in range(len(featmap_sizes)): + mlvl_points.append( + self._get_points_single(featmap_sizes[i], self.strides[i], + dtype, device, flatten)) + return mlvl_points diff --git a/mmdet3d/models/dense_heads/base_mono3d_dense_head.py b/mmdet3d/models/dense_heads/base_mono3d_dense_head.py new file mode 100644 index 0000000000..bff464def9 --- /dev/null +++ b/mmdet3d/models/dense_heads/base_mono3d_dense_head.py @@ -0,0 +1,76 @@ +from abc import ABCMeta, abstractmethod +from torch import nn as nn + + +class BaseMono3DDenseHead(nn.Module, metaclass=ABCMeta): + """Base class for Monocular 3D DenseHeads.""" + + def __init__(self): + super(BaseMono3DDenseHead, self).__init__() + + @abstractmethod + def loss(self, **kwargs): + """Compute losses of the head.""" + pass + + @abstractmethod + def get_bboxes(self, **kwargs): + """Transform network output for a batch into bbox predictions.""" + pass + + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + centers2d=None, + depths=None, + attr_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """ + Args: + x (list[Tensor]): Features from FPN. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (list[Tensor]): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, + shape (num_gts, self.bbox_code_size). + gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, + shape (num_gts,). + centers2d (list[Tensor]): Projected 3D center of each box, + shape (num_gts, 2). + depths (list[Tensor]): Depth of projected 3D center of each box, + shape (num_gts,). + attr_labels (list[Tensor]): Attribute labels of each box, + shape (num_gts,). + gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. 
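+
+        Example (an illustrative sketch; ``head`` and all inputs are assumed
+            to be prepared by the detector and data pipeline):
+            >>> losses = head.forward_train(feats, img_metas, gt_bboxes,
+            ...     gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d,
+            ...     depths, attr_labels)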
+ """ + outs = self(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, + attr_labels, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg) + return losses, proposal_list From ef23977f87bef862157bec8b248f542f8f2ee446 Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Wed, 14 Apr 2021 21:17:38 +0800 Subject: [PATCH 02/12] Support FCOS3D head --- mmdet3d/models/dense_heads/__init__.py | 3 +- .../models/dense_heads/fcos_mono3d_head.py | 942 ++++++++++++++++++ 2 files changed, 944 insertions(+), 1 deletion(-) create mode 100644 mmdet3d/models/dense_heads/fcos_mono3d_head.py diff --git a/mmdet3d/models/dense_heads/__init__.py b/mmdet3d/models/dense_heads/__init__.py index 8566be7533..1f69c5910c 100644 --- a/mmdet3d/models/dense_heads/__init__.py +++ b/mmdet3d/models/dense_heads/__init__.py @@ -3,6 +3,7 @@ from .base_conv_bbox_head import BaseConvBboxHead from .base_mono3d_dense_head import BaseMono3DDenseHead from .centerpoint_head import CenterHead +from .fcos_mono3d_head import FCOSMono3DHead from .free_anchor3d_head import FreeAnchor3DHead from .parta2_rpn_head import PartA2RPNHead from .shape_aware_head import ShapeAwareHead @@ -12,5 +13,5 @@ __all__ = [ 'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead', 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead', - 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead' + 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead' ] diff --git a/mmdet3d/models/dense_heads/fcos_mono3d_head.py b/mmdet3d/models/dense_heads/fcos_mono3d_head.py new file mode 100644 index 0000000000..ac94be43b1 --- /dev/null +++ b/mmdet3d/models/dense_heads/fcos_mono3d_head.py @@ -0,0 +1,942 @@ +import numpy as np +import torch +from mmcv.cnn import Scale, normal_init +from torch import nn as nn + +from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr +from mmdet.core import force_fp32, multi_apply +from mmdet.models.builder import HEADS, build_loss +from .anchor_free_mono3d_head import AnchorFreeMono3DHead + +INF = 1e8 + + +@HEADS.register_module() +class FCOSMono3DHead(AnchorFreeMono3DHead): + """Anchor-free head used in FCOS3D. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + regress_ranges (tuple[tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. Default: True. + center_sample_radius (float): Radius of center sampling. Default: 1.5. + norm_on_bbox (bool): If true, normalize the regression targets + with FPN strides. Default: True. + centerness_on_reg (bool): If true, position centerness on the + regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. + Default: True. + centerness_alpha: Parameter used to adjust the intensity attenuation + from the center to the periphery. Default: 2.5. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + loss_dir (dict): Config of direction classification loss. + loss_attr (dict): Config of attribute classification loss. + loss_centerness (dict): Config of centerness loss. 
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True).
+        centerness_branch (tuple[int]): Channels for centerness branch.
+            Default: (64, ).
+    """  # noqa: E501
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384),
+                                 (384, INF)),
+                 center_sampling=True,
+                 center_sample_radius=1.5,
+                 norm_on_bbox=True,
+                 centerness_on_reg=True,
+                 centerness_alpha=2.5,
+                 loss_cls=dict(
+                     type='FocalLoss',
+                     use_sigmoid=True,
+                     gamma=2.0,
+                     alpha=0.25,
+                     loss_weight=1.0),
+                 loss_bbox=dict(
+                     type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
+                 loss_dir=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_attr=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_centerness=dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+                 centerness_branch=(64, ),
+                 **kwargs):
+        self.regress_ranges = regress_ranges
+        self.center_sampling = center_sampling
+        self.center_sample_radius = center_sample_radius
+        self.norm_on_bbox = norm_on_bbox
+        self.centerness_on_reg = centerness_on_reg
+        self.centerness_alpha = centerness_alpha
+        self.centerness_branch = centerness_branch
+        super().__init__(
+            num_classes,
+            in_channels,
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_dir=loss_dir,
+            loss_attr=loss_attr,
+            norm_cfg=norm_cfg,
+            **kwargs)
+        self.loss_centerness = build_loss(loss_centerness)
+
+    def _init_layers(self):
+        """Initialize layers of the head."""
+        super()._init_layers()
+        self.conv_centerness_prev = self._init_branch(
+            conv_channels=self.centerness_branch,
+            conv_strides=(1, ) * len(self.centerness_branch))
+        self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1)
+        self.scales = nn.ModuleList([
+            nn.ModuleList([Scale(1.0) for _ in range(3)]) for _ in self.strides
+        ])  # only for offset, depth and size regression
+
+    def init_weights(self):
+        """Initialize weights of the head."""
+        super().init_weights()
+        for m in self.conv_centerness_prev:
+            if isinstance(m.conv, nn.Conv2d):
+                normal_init(m.conv, std=0.01)
+        normal_init(self.conv_centerness, std=0.01)
+
+    def forward(self, feats):
+        """Forward features from the upstream network.
+
+        Args:
+            feats (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+
+        Returns:
+            tuple:
+                cls_scores (list[Tensor]): Box scores for each scale level,
+                    each is a 4D-tensor, the channel number is
+                    num_points * num_classes.
+                bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * bbox_code_size.
+                dir_cls_preds (list[Tensor]): Box scores for direction class
+                    predictions on each scale level, each is a 4D-tensor,
+                    the channel number is num_points * 2. (bin = 2).
+                attr_preds (list[Tensor]): Attribute scores for each scale
+                    level, each is a 4D-tensor, the channel number is
+                    num_points * num_attrs.
+                centernesses (list[Tensor]): Centerness for each scale level,
+                    each is a 4D-tensor, the channel number is num_points * 1.
+        """
+        return multi_apply(self.forward_single, feats, self.scales,
+                           self.strides)
+
+    def forward_single(self, x, scale, stride):
+        """Forward features of a single scale level.
+
+        Args:
+            x (Tensor): FPN feature maps of the specified stride.
+            scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize
+                the bbox prediction.
+ stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox and direction class \ + predictions, centerness predictions of input feature maps. + """ + cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ + super().forward_single(x) + + if self.centerness_on_reg: + clone_reg_feat = reg_feat.clone() + for conv_centerness_prev_layer in self.conv_centerness_prev: + clone_reg_feat = conv_centerness_prev_layer(clone_reg_feat) + centerness = self.conv_centerness(clone_reg_feat) + else: + clone_cls_feat = cls_feat.clone() + for conv_centerness_prev_layer in self.conv_centerness_prev: + clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat) + centerness = self.conv_centerness(clone_cls_feat) + # scale the bbox_pred of different level + # only apply to offset, depth and size prediction + scale_offset, scale_depth, scale_size = scale[0:3] + + clone_bbox_pred = bbox_pred.clone() + bbox_pred[:, :2] = scale_offset(clone_bbox_pred[:, :2]).float() + bbox_pred[:, 2] = scale_depth(clone_bbox_pred[:, 2]).float() + bbox_pred[:, 3:6] = scale_size(clone_bbox_pred[:, 3:6]).float() + + bbox_pred[:, 2] = bbox_pred[:, 2].exp() + bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6 # avoid size=0 + + assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\ + 'has not been thoroughly tested for FCOS3D.' + if self.norm_on_bbox: + if not self.training: + # Note that this line is conducted only when testing + bbox_pred[:, :2] *= stride + + return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness + + @staticmethod + def add_sin_difference(boxes1, boxes2): + """Convert the rotation difference to difference in sine function. + + Args: + boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 + and the 7th dimension is rotation dimension. + boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and + the 7th dimension is rotation dimension. + + Returns: + tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \ + dimensions are changed. + """ + rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( + boxes2[..., 6:7]) + rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., + 6:7]) + boxes1 = torch.cat( + [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) + boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], + dim=-1) + return boxes1, boxes2 + + @staticmethod + def get_direction_target(reg_targets, + dir_offset=0, + num_bins=2, + one_hot=True): + """Encode direction to 0 ~ num_bins-1. + + Args: + reg_targets (torch.Tensor): Bbox regression targets. + dir_offset (int): Direction offset. + num_bins (int): Number of bins to divide 2*PI. + one_hot (bool): Whether to encode as one hot. + + Returns: + torch.Tensor: Encoded direction targets. 
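+
+        Example (a worked sketch of the binning with the default
+            ``dir_offset=0`` and ``num_bins=2``):
+            >>> reg = torch.zeros(2, 7)
+            >>> reg[:, 6] = torch.tensor([0.3, -2.8])  # yaw in radians
+            >>> FCOSMono3DHead.get_direction_target(reg, one_hot=False)
+            tensor([0, 1])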
+ """ + rot_gt = reg_targets[..., 6] + offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi) + dir_cls_targets = torch.floor(offset_rot / + (2 * np.pi / num_bins)).long() + dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) + if one_hot: + dir_targets = torch.zeros( + *list(dir_cls_targets.shape), + num_bins, + dtype=reg_targets.dtype, + device=dir_cls_targets.device) + dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) + dir_cls_targets = dir_targets + return dir_cls_targets + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds', + 'centernesses')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + centernesses, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + attr_preds (list[Tensor]): Attribute scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of + (num_gts, code_size). + gt_labels_3d (list[Tensor]): same as gt_labels + centers2d (list[Tensor]): 2D centers on the image with shape of + (num_gts, 2). + depths (list[Tensor]): Depth ground truth with shape of + (num_gts, ). + attr_labels (list[Tensor]): Attributes indices of each box. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
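+                Expected keys are ``loss_cls``, ``loss_offset``,
+                ``loss_depth``, ``loss_size``, ``loss_rotsin`` and
+                ``loss_centerness``, plus ``loss_velo``, ``loss_dir`` and
+                ``loss_attr`` when velocity, direction or attribute
+                prediction is enabled.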
+ """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) == len( + attr_preds) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ + self.get_targets( + all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds, dir_cls_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) + for bbox_pred in bbox_preds + ] + flatten_dir_cls_preds = [ + dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) + for dir_cls_pred in dir_cls_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels_3d = torch.cat(labels_3d) + flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) + flatten_centerness_targets = torch.cat(centerness_targets) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels_3d >= 0) + & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) + num_pos = len(pos_inds) + + loss_cls = self.loss_cls( + flatten_cls_scores, + flatten_labels_3d, + avg_factor=num_pos + num_imgs) # avoid num_pos is 0 + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + + if self.pred_attrs: + flatten_attr_preds = [ + attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) + for attr_pred in attr_preds + ] + flatten_attr_preds = torch.cat(flatten_attr_preds) + flatten_attr_targets = torch.cat(attr_targets) + pos_attr_preds = flatten_attr_preds[pos_inds] + + if num_pos > 0: + pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] + pos_centerness_targets = flatten_centerness_targets[pos_inds] + if self.pred_attrs: + pos_attr_targets = flatten_attr_targets[pos_inds] + bbox_weights = pos_centerness_targets.new_ones( + len(pos_centerness_targets), sum(self.group_reg_dims)) + equal_weights = pos_centerness_targets.new_ones( + pos_centerness_targets.shape) + + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + assert len(code_weight) == sum(self.group_reg_dims) + bbox_weights = bbox_weights * bbox_weights.new_tensor( + code_weight) + + if self.use_direction_classifier: + pos_dir_cls_targets = self.get_direction_target( + pos_bbox_targets_3d, self.dir_offset, one_hot=False) + + if self.diff_rad_by_sin: + pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( + pos_bbox_preds, pos_bbox_targets_3d) + + loss_offset = self.loss_bbox( + pos_bbox_preds[:, :2], + pos_bbox_targets_3d[:, :2], + weight=bbox_weights[:, :2], + avg_factor=equal_weights.sum()) + loss_depth = self.loss_bbox( + pos_bbox_preds[:, 2], + pos_bbox_targets_3d[:, 2], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + loss_size = self.loss_bbox( + pos_bbox_preds[:, 3:6], + pos_bbox_targets_3d[:, 3:6], + weight=bbox_weights[:, 3:6], + avg_factor=equal_weights.sum()) + loss_rotsin = 
self.loss_bbox(
+                pos_bbox_preds[:, 6],
+                pos_bbox_targets_3d[:, 6],
+                weight=bbox_weights[:, 6],
+                avg_factor=equal_weights.sum())
+            loss_velo = None
+            if self.pred_velo:
+                loss_velo = self.loss_bbox(
+                    pos_bbox_preds[:, 7:9],
+                    pos_bbox_targets_3d[:, 7:9],
+                    weight=bbox_weights[:, 7:9],
+                    avg_factor=equal_weights.sum())
+
+            loss_centerness = self.loss_centerness(pos_centerness,
+                                                   pos_centerness_targets)
+
+            # direction classification loss
+            loss_dir = None
+            # TODO: add more check for use_direction_classifier
+            if self.use_direction_classifier:
+                loss_dir = self.loss_dir(
+                    pos_dir_cls_preds,
+                    pos_dir_cls_targets,
+                    equal_weights,
+                    avg_factor=equal_weights.sum())
+
+            # attribute classification loss
+            loss_attr = None
+            if self.pred_attrs:
+                loss_attr = self.loss_attr(
+                    pos_attr_preds,
+                    pos_attr_targets,
+                    pos_centerness_targets,
+                    avg_factor=pos_centerness_targets.sum())
+
+        else:
+            # need absolute due to possible negative delta x/y
+            loss_offset = pos_bbox_preds[:, :2].sum()
+            loss_depth = pos_bbox_preds[:, 2].sum()
+            loss_size = pos_bbox_preds[:, 3:6].sum()
+            loss_rotsin = pos_bbox_preds[:, 6].sum()
+            loss_velo = None
+            if self.pred_velo:
+                loss_velo = pos_bbox_preds[:, 7:9].sum()
+            loss_centerness = pos_centerness.sum()
+            loss_dir = None
+            if self.use_direction_classifier:
+                loss_dir = pos_dir_cls_preds.sum()
+            loss_attr = None
+            if self.pred_attrs:
+                loss_attr = pos_attr_preds.sum()
+
+        loss_dict = dict(
+            loss_cls=loss_cls,
+            loss_offset=loss_offset,
+            loss_depth=loss_depth,
+            loss_size=loss_size,
+            loss_rotsin=loss_rotsin,
+            loss_centerness=loss_centerness)
+
+        if loss_velo is not None:
+            loss_dict['loss_velo'] = loss_velo
+
+        if loss_dir is not None:
+            loss_dict['loss_dir'] = loss_dir
+
+        if loss_attr is not None:
+            loss_dict['loss_attr'] = loss_attr
+
+        return loss_dict
+
+    @force_fp32(
+        apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds',
+                  'centernesses'))
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   dir_cls_preds,
+                   attr_preds,
+                   centernesses,
+                   img_metas,
+                   cfg=None,
+                   rescale=None):
+        """Transform network output for a batch into bbox predictions.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                Has shape (N, num_points * num_classes, H, W)
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_points * bbox_code_size, H, W)
+            dir_cls_preds (list[Tensor]): Box scores for direction class
+                predictions on each scale level, each is a 4D-tensor,
+                the channel number is num_points * 2. (bin = 2)
+            attr_preds (list[Tensor]): Attribute scores for each scale level.
+                Has shape (N, num_points * num_attrs, H, W)
+            centernesses (list[Tensor]): Centerness for each scale level with
+                shape (N, num_points * 1, H, W)
+            img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            cfg (mmcv.Config): Test / postprocessing configuration,
+                if None, test_cfg would be used
+            rescale (bool): If True, return boxes in original image space
+
+        Returns:
+            list[tuple]: Each item in the result list is a 4-tuple of
+                predicted 3D boxes, scores, labels and attributes for one
+                image, as produced by :meth:`_get_bboxes_single`.
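+
+        Example (an illustrative sketch; ``head``, ``feats`` and
+            ``img_metas`` are assumed):
+            >>> outs = head(feats)
+            >>> results = head.get_bboxes(*outs, img_metas)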
+ """ + assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ + len(centernesses) == len(attr_preds) + num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + if self.use_direction_classifier: + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() + for i in range(num_levels) + ] + else: + dir_cls_pred_list = [ + cls_scores[i][img_id].new_full( + [2, *cls_scores[i][img_id].shape[1:]], 0).detach() + for i in range(num_levels) + ] + if self.pred_attrs: + attr_pred_list = [ + attr_preds[i][img_id].detach() for i in range(num_levels) + ] + else: + attr_pred_list = [ + cls_scores[i][img_id].new_full( + [self.num_attrs, *cls_scores[i][img_id].shape[1:]], + self.attr_background_label).detach() + for i in range(num_levels) + ] + centerness_pred_list = [ + centernesses[i][img_id].detach() for i in range(num_levels) + ] + input_meta = img_metas[img_id] + det_bboxes = self._get_bboxes_single( + cls_score_list, bbox_pred_list, dir_cls_pred_list, + attr_pred_list, centerness_pred_list, mlvl_points, input_meta, + cfg, rescale) + result_list.append(det_bboxes) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + centernesses, + mlvl_points, + input_meta, + cfg, + rescale=False): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for a single scale level + Has shape (num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for a single scale + level with shape (num_points * bbox_code_size, H, W). + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on a single scale level with shape \ + (num_points * 2, H, W) + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + centernesses (list[Tensor]): Centerness for a single scale level + with shape (num_points, H, W). + mlvl_points (list[Tensor]): Box reference for a single scale level + with shape (num_total_points, 2). + input_meta (dict): Metadata of input image. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + + Returns: + tuples[Tensor]: Predicted 3D boxes, scores, labels and attributes. 
+ """ + view = np.array(input_meta['cam_intrinsic']) + scale_factor = input_meta['scale_factor'] + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_centers2d = [] + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + mlvl_attr_scores = [] + mlvl_centerness = [] + + for cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ + points in zip(cls_scores, bbox_preds, dir_cls_preds, + attr_preds, centernesses, mlvl_points): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) + attr_score = torch.max(attr_pred, dim=-1)[1] + centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() + + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, + sum(self.group_reg_dims)) + bbox_pred = bbox_pred[:, :self.bbox_code_size] + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = (scores * centerness[:, None]).max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_pred = dir_cls_pred[topk_inds, :] + centerness = centerness[topk_inds] + dir_cls_score = dir_cls_score[topk_inds] + attr_score = attr_score[topk_inds] + # change the offset to actual center predictions + bbox_pred[:, :2] = points - bbox_pred[:, :2] + if rescale: + bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor) + pred_center2d = bbox_pred[:, :3].clone() + bbox_pred[:, :3] = self.pts2Dto3D(bbox_pred[:, :3], view) + mlvl_centers2d.append(pred_center2d) + mlvl_bboxes.append(bbox_pred) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + mlvl_attr_scores.append(attr_score) + mlvl_centerness.append(centerness) + + mlvl_centers2d = torch.cat(mlvl_centers2d) + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + # change local yaw to global yaw for 3D nms + if mlvl_bboxes.shape[0] > 0: + dir_rot = limit_period(mlvl_bboxes[..., 6] - self.dir_offset, 0, + np.pi) + mlvl_bboxes[..., 6] = ( + dir_rot + self.dir_offset + + np.pi * mlvl_dir_scores.to(mlvl_bboxes.dtype)) + + cam_intrinsic = mlvl_centers2d.new_zeros((4, 4)) + cam_intrinsic[:view.shape[0], :view.shape[1]] = \ + mlvl_centers2d.new_tensor(view) + mlvl_bboxes[:, 6] = torch.atan2( + mlvl_centers2d[:, 0] - cam_intrinsic[0, 2], + cam_intrinsic[0, 0]) + mlvl_bboxes[:, 6] + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, box_dim=self.bbox_code_size, + origin=(0.5, 0.5, 0.5)).bev) + + mlvl_scores = torch.cat(mlvl_scores) + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + mlvl_attr_scores = torch.cat(mlvl_attr_scores) + mlvl_centerness = torch.cat(mlvl_centerness) + # no scale_factors in box3d_multiclass_nms + # Then we multiply it from outside + mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] + results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_nms_scores, cfg.score_thr, + cfg.max_per_img, cfg, mlvl_dir_scores, + mlvl_attr_scores) + bboxes, scores, labels, dir_scores, attrs = results + attrs = attrs.to(labels.dtype) # 
change data type to int
+        bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.bbox_code_size)
+        # Note that the predictions use origin (0.5, 0.5, 0.5) because the
+        # ground-truth centers2d are the gravity centers of objects; the
+        # center has already been transformed when computing the BEV bbox.
+        if not self.pred_attrs:
+            attrs = None
+
+        return bboxes, scores, labels, attrs
+
+    @staticmethod
+    def pts2Dto3D(points, view):
+        """
+        Args:
+            points (torch.Tensor): points in 2D images, [N, 3], \
+                3 corresponds with x, y in the image and depth.
+            view (np.ndarray): camera intrinsic, [3, 3]
+
+        Returns:
+            torch.Tensor: points in 3D space. [N, 3], \
+                3 corresponds with x, y, z in 3D space.
+        """
+        assert view.shape[0] <= 4
+        assert view.shape[1] <= 4
+        assert points.shape[1] == 3
+
+        points2D = points[:, :2]
+        depths = points[:, 2].view(-1, 1)
+        unnorm_points2D = torch.cat([points2D * depths, depths], dim=1)
+
+        viewpad = torch.eye(4, dtype=points2D.dtype, device=points2D.device)
+        viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view)
+        inv_viewpad = torch.inverse(viewpad).transpose(0, 1)
+
+        # Do operation in homogeneous coordinates.
+        nbr_points = unnorm_points2D.shape[0]
+        homo_points2D = torch.cat(
+            [unnorm_points2D,
+             points2D.new_ones((nbr_points, 1))], dim=1)
+        points3D = torch.mm(homo_points2D, inv_viewpad)[:, :3]
+
+        return points3D
+
+    def _get_points_single(self,
+                           featmap_size,
+                           stride,
+                           dtype,
+                           device,
+                           flatten=False):
+        """Get points according to feature map sizes."""
+        y, x = super()._get_points_single(featmap_size, stride, dtype, device)
+        points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride),
+                             dim=-1) + stride // 2
+        return points
+
+    def get_targets(self, points, gt_bboxes_list, gt_labels_list,
+                    gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list,
+                    depths_list, attr_labels_list):
+        """Compute regression, classification and centerness targets for
+        points in multiple images.
+
+        Args:
+            points (list[Tensor]): Points of each fpn level, each has shape
+                (num_points, 2).
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image,
+                each has shape (num_gt, 4).
+            gt_labels_list (list[Tensor]): Ground truth labels of each box,
+                each has shape (num_gt,).
+            gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each
+                image, each has shape (num_gt, bbox_code_size).
+            gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each
+                box, each has shape (num_gt,).
+            centers2d_list (list[Tensor]): Projected 3D centers onto 2D image,
+                each has shape (num_gt, 2).
+            depths_list (list[Tensor]): Depth of projected 3D centers onto 2D
+                image, each has shape (num_gt, 1).
+            attr_labels_list (list[Tensor]): Attribute labels of each box,
+                each has shape (num_gt,).
+
+        Returns:
+            tuple: Per-level lists of 3D labels, 3D bbox targets, centerness
+                targets and attribute targets.
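+
+        Example (an illustrative sketch; the per-image GT lists are assumed
+            to follow the shapes documented above):
+            >>> points = head.get_points(featmap_sizes, torch.float32, 'cpu')
+            >>> targets = head.get_targets(points, gt_bboxes, gt_labels,
+            ...     gt_bboxes_3d, gt_labels_3d, centers2d, depths,
+            ...     attr_labels)
+            >>> # targets: per-level labels_3d, bbox_targets_3d,
+            >>> # centerness_targets and attr_targets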
+ """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + if attr_labels_list is None: + attr_labels_list = [ + gt_labels.new_full(gt_labels.shape, self.attr_background_label) + for gt_labels in gt_labels_list + ] + + # get labels and bbox_targets of each image + _, _, labels_3d_list, bbox_targets_3d_list, centerness_targets_list, \ + attr_targets_list = multi_apply( + self._get_target_single, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_3d_list, + gt_labels_3d_list, + centers2d_list, + depths_list, + attr_labels_list, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_3d_list = [ + labels_3d.split(num_points, 0) for labels_3d in labels_3d_list + ] + bbox_targets_3d_list = [ + bbox_targets_3d.split(num_points, 0) + for bbox_targets_3d in bbox_targets_3d_list + ] + centerness_targets_list = [ + centerness_targets.split(num_points, 0) + for centerness_targets in centerness_targets_list + ] + attr_targets_list = [ + attr_targets.split(num_points, 0) + for attr_targets in attr_targets_list + ] + + # concat per level image + concat_lvl_labels_3d = [] + concat_lvl_bbox_targets_3d = [] + concat_lvl_centerness_targets = [] + concat_lvl_attr_targets = [] + for i in range(num_levels): + concat_lvl_labels_3d.append( + torch.cat([labels[i] for labels in labels_3d_list])) + concat_lvl_centerness_targets.append( + torch.cat([ + centerness_targets[i] + for centerness_targets in centerness_targets_list + ])) + bbox_targets_3d = torch.cat([ + bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list + ]) + concat_lvl_attr_targets.append( + torch.cat( + [attr_targets[i] for attr_targets in attr_targets_list])) + if self.norm_on_bbox: + bbox_targets_3d[:, : + 2] = bbox_targets_3d[:, :2] / self.strides[i] + concat_lvl_bbox_targets_3d.append(bbox_targets_3d) + return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ + concat_lvl_centerness_targets, concat_lvl_attr_targets + + def _get_target_single(self, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + points, regress_ranges, num_points_per_lvl): + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = gt_labels.size(0) + if not isinstance(gt_bboxes_3d, torch.Tensor): + gt_bboxes_3d = gt_bboxes_3d.tensor.to(gt_bboxes.device) + if num_gts == 0: + return gt_labels.new_full((num_points,), self.background_label), \ + gt_bboxes.new_zeros((num_points, 4)), \ + gt_labels_3d.new_full( + (num_points,), self.background_label), \ + gt_bboxes_3d.new_zeros((num_points, self.bbox_code_size)), \ + gt_bboxes_3d.new_zeros((num_points,)), \ + attr_labels.new_full( + (num_points,), self.attr_background_label) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + centers2d = centers2d[None].expand(num_points, 
num_gts, 2) + gt_bboxes_3d = gt_bboxes_3d[None].expand(num_points, num_gts, + self.bbox_code_size) + depths = depths[None, :, None].expand(num_points, num_gts, 1) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + delta_xs = (xs - centers2d[..., 0])[..., None] + delta_ys = (ys - centers2d[..., 1])[..., None] + bbox_targets_3d = torch.cat( + (delta_xs, delta_ys, depths, gt_bboxes_3d[..., 3:]), dim=-1) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + assert self.center_sampling is True, 'Setting center_sampling to '\ + 'False has not been implemented for FCOS3D.' + # condition1: inside a `center bbox` + radius = self.center_sample_radius + center_xs = centers2d[..., 0] + center_ys = centers2d[..., 1] + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + center_gts[..., 0] = center_xs - stride + center_gts[..., 1] = center_ys - stride + center_gts[..., 2] = center_xs + stride + center_gts[..., 3] = center_ys + stride + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # center-based criterion to deal with ambiguity + dists = torch.sqrt(torch.sum(bbox_targets_3d[..., :2]**2, dim=-1)) + dists[inside_gt_bbox_mask == 0] = INF + dists[inside_regress_range == 0] = INF + min_dist, min_dist_inds = dists.min(dim=1) + + labels = gt_labels[min_dist_inds] + labels_3d = gt_labels_3d[min_dist_inds] + attr_labels = attr_labels[min_dist_inds] + labels[min_dist == INF] = self.background_label # set as BG + labels_3d[min_dist == INF] = self.background_label # set as BG + attr_labels[min_dist == INF] = self.attr_background_label + + bbox_targets = bbox_targets[range(num_points), min_dist_inds] + bbox_targets_3d = bbox_targets_3d[range(num_points), min_dist_inds] + relative_dists = torch.sqrt( + torch.sum(bbox_targets_3d[..., :2]**2, + dim=-1)) / (1.414 * stride[:, 0]) + # [N, 1] / [N, 1] + centerness_targets = torch.exp(-self.centerness_alpha * relative_dists) + + return labels, bbox_targets, labels_3d, bbox_targets_3d, \ + centerness_targets, attr_labels From b16595e8f541f2073585528fe829483dfec05555 Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Sun, 18 Apr 2021 21:30:57 +0800 Subject: [PATCH 03/12] Support FCOS3D baseline on nuScenes --- configs/_base_/datasets/nus-mono3d.py | 89 +++++++++++++++++++ configs/_base_/models/fcos3d.py | 75 ++++++++++++++++ ...caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py | 75 ++++++++++++++++ .../dense_heads/anchor_free_mono3d_head.py | 3 +- .../models/dense_heads/fcos_mono3d_head.py | 3 +- 5 files changed, 243 insertions(+), 2 deletions(-) create 
mode 100644 configs/_base_/datasets/nus-mono3d.py create mode 100644 configs/_base_/models/fcos3d.py create mode 100644 configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py diff --git a/configs/_base_/datasets/nus-mono3d.py b/configs/_base_/datasets/nus-mono3d.py new file mode 100644 index 0000000000..3a87067f29 --- /dev/null +++ b/configs/_base_/datasets/nus-mono3d.py @@ -0,0 +1,89 @@ +dataset_type = 'NuScenesMonoDataset' +data_root = 'data/nuscenes/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + box_type_3d='Camera'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera')) +evaluation = dict(interval=2) diff --git a/configs/_base_/models/fcos3d.py b/configs/_base_/models/fcos3d.py new file mode 100644 index 0000000000..3a1797b262 --- /dev/null +++ b/configs/_base_/models/fcos3d.py @@ -0,0 +1,75 @@ +model = dict( + type='FCOSMono3D', + pretrained='open-mmlab://detectron2/resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs=True, + extra_convs_on_inputs=False, # use P5 + num_outs=5, 
+ relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSMono3DHead', + num_classes=10, + in_channels=256, + stacked_convs=2, + feat_channels=256, + use_direction_classifier=True, + diff_rad_by_sin=True, + pred_attrs=True, + pred_velo=True, + dir_offset=0.7854, # pi/4 + strides=[8, 16, 32, 64, 128], + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo + cls_branch=(128, 64), + reg_branch=( + (128, 64), # offset + (128, 64), # depth + (64, ), # size + (64, ), # rot + () # velo + ), + dir_branch=(64, ), + attr_branch=(64, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_on_bbox=True, + centerness_on_reg=True, + center_sampling=True, + conv_bias=True, + dcn_on_last_conv=True), + train_cfg=dict( + allowed_border=0, + code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.8, + score_thr=0.05, + min_bbox_size=0, + max_per_img=200)) diff --git a/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py b/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py new file mode 100644 index 0000000000..3a7d7c5d1f --- /dev/null +++ b/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py @@ -0,0 +1,75 @@ +_base_ = [ + '../_base_/datasets/nus-mono3d.py', '../_base_/models/fcos3d.py', + '../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py' +] +# model settings +model = dict( + backbone=dict( + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + stage_with_dcn=(False, False, True, True))) + +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + lr=0.002, paramwise_cfg=dict(bias_lr_mult=2., 
bias_decay_mult=0.))
+optimizer_config = dict(
+    _delete_=True, grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+total_epochs = 12
+evaluation = dict(interval=2)

From 4a58acf2b231149d9d6bc86b12c18bae9f19d49d Mon Sep 17 00:00:00 2001
From: Tai-Wang
Date: Sun, 18 Apr 2021 21:33:31 +0800
Subject: [PATCH 04/12] Fix an import error caused by the update of mmcv/mmdet

---
 mmdet3d/models/dense_heads/anchor_free_mono3d_head.py | 3 ++-
 mmdet3d/models/dense_heads/fcos_mono3d_head.py        | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
index cc78c42a4b..fe5f139c02 100644
--- a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
+++ b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
@@ -1,9 +1,10 @@
 import torch
 from abc import abstractmethod
 from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
+from mmcv.runner import force_fp32
 from torch import nn as nn

-from mmdet.core import force_fp32, multi_apply
+from mmdet.core import multi_apply
 from mmdet.models.builder import HEADS, build_loss
 from .base_mono3d_dense_head import BaseMono3DDenseHead

diff --git a/mmdet3d/models/dense_heads/fcos_mono3d_head.py b/mmdet3d/models/dense_heads/fcos_mono3d_head.py
index ac94be43b1..e9d4902890 100644
--- a/mmdet3d/models/dense_heads/fcos_mono3d_head.py
+++ b/mmdet3d/models/dense_heads/fcos_mono3d_head.py
@@ -1,10 +1,11 @@
 import numpy as np
 import torch
 from mmcv.cnn import Scale, normal_init
+from mmcv.runner import force_fp32
 from torch import nn as nn

 from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr
-from mmdet.core import force_fp32, multi_apply
+from mmdet.core import multi_apply
 from mmdet.models.builder import HEADS, build_loss
 from .anchor_free_mono3d_head import AnchorFreeMono3DHead

From f69e412709523422e91f198987a57dc4b7a1b932 Mon Sep 17 00:00:00 2001
From: Tai-Wang
Date: Mon, 19 Apr 2021 15:06:11 +0800
Subject: [PATCH 05/12] Change img_scale to scale_factor in MultiScaleFlipAug
 in the configs

---
configs/_base_/datasets/nus-mono3d.py                      | 2 +-
 .../fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/_base_/datasets/nus-mono3d.py b/configs/_base_/datasets/nus-mono3d.py
index 3a87067f29..6772ef5b5f 100644
--- a/configs/_base_/datasets/nus-mono3d.py
+++ b/configs/_base_/datasets/nus-mono3d.py
@@ -40,7 +40,7 @@
     dict(type='LoadImageFromFileMono3D'),
     dict(
         type='MultiScaleFlipAug',
-        img_scale=(1600, 900),
+        scale_factor=1.0,
         flip=False,
         transforms=[
             dict(type='RandomFlip3D'),

diff --git a/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py b/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py
index 3a7d7c5d1f..3b7eb99fce 100644
--- a/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py
+++ b/configs/fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py
@@ -40,7 +40,7 @@
     dict(type='LoadImageFromFileMono3D'),
     dict(
         type='MultiScaleFlipAug',
-        img_scale=(1600, 900),
+        scale_factor=1.0,
         flip=False,
         transforms=[
             dict(type='RandomFlip3D'),

From b092f0d21fdc85fefd6ea6cbf2f942d8128b3e93 Mon Sep 17 00:00:00 2001
From: Tai-Wang
Date: Mon, 19 Apr 2021 15:08:22 +0800
Subject: [PATCH 06/12] Add pred_bbox2d to the params of
 anchor_free_mono3d_head

---
 mmdet3d/models/dense_heads/anchor_free_mono3d_head.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
index fe5f139c02..27ae1ab50a 100644
--- a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
+++ b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
@@ -40,6 +40,7 @@ class AnchorFreeMono3DHead(BaseMono3DDenseHead):
         pred_attrs (bool): Whether to predict attributes. Default to False.
         num_attrs (int): The number of attributes to be predicted. Default: 9.
+        pred_bbox2d (bool): Whether to predict 2D boxes. Default to False.
         group_reg_dims (tuple[int]): The dimension of each regression target
             group. Default: (2, 1, 3, 1, 2).
         cls_branch (tuple[int]): Channels for classification branch.
@@ -93,6 +94,7 @@ def __init__(
             pred_attrs=False,
             num_attrs=9,  # For nuscenes
             pred_velo=False,
+            pred_bbox2d=False,
             group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo,
             cls_branch=(128, 64),
             reg_branch=(
@@ -131,6 +133,7 @@ def __init__(
         assert len(reg_branch) == len(group_reg_dims), 'The number of '\
             'element in reg_branch and group_reg_dims should be the same.'
self.pred_velo = pred_velo + self.pred_bbox2d = pred_bbox2d self.out_channels = [] for reg_branch_channels in reg_branch: if len(reg_branch_channels) > 0: From 8b412767221caeea124744019f4ffaef014a1dae Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Mon, 19 Apr 2021 20:46:21 +0800 Subject: [PATCH 07/12] Add unit test for fcos3d head --- tests/test_models/test_heads/test_heads.py | 82 +++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/tests/test_models/test_heads/test_heads.py b/tests/test_models/test_heads/test_heads.py index cccbd8109c..9fe27ee394 100644 --- a/tests/test_models/test_heads/test_heads.py +++ b/tests/test_models/test_heads/test_heads.py @@ -5,8 +5,8 @@ import torch from os.path import dirname, exists, join -from mmdet3d.core.bbox import (Box3DMode, DepthInstance3DBoxes, - LiDARInstance3DBoxes) +from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, + DepthInstance3DBoxes, LiDARInstance3DBoxes) from mmdet3d.models.builder import build_head from mmdet.apis import set_random_seed @@ -1044,3 +1044,81 @@ def test_shape_aware_head_getboxes(): input_metas) assert len(result_list[0][1]) > 0 # ensure not all boxes are filtered assert (result_list[0][1] > 0.3).all() + + +def test_fcos_mono3d_head(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + _setup_seed(0) + fcos3d_head_cfg = _get_head_cfg( + 'fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py') + self = build_head(fcos3d_head_cfg).cuda() + + feats = [ + torch.rand([2, 256, 116, 200], dtype=torch.float32).cuda(), + torch.rand([2, 256, 58, 100], dtype=torch.float32).cuda(), + torch.rand([2, 256, 29, 50], dtype=torch.float32).cuda(), + torch.rand([2, 256, 15, 25], dtype=torch.float32).cuda(), + torch.rand([2, 256, 8, 13], dtype=torch.float32).cuda() + ] + + # test forward + ret_dict = self(feats) + assert len(ret_dict) == 5 + assert len(ret_dict[0]) == 5 + assert ret_dict[0][0].shape == torch.Size([2, 10, 116, 200]) + + # test loss + gt_bboxes = [ + torch.rand([3, 4], dtype=torch.float32).cuda(), + torch.rand([3, 4], dtype=torch.float32).cuda(), + ] + gt_bboxes_3d = CameraInstance3DBoxes( + torch.rand([3, 9], device='cuda'), box_dim=9) + gt_labels = [torch.randint(0, 10, [3], device='cuda') for i in range(2)] + gt_labels_3d = gt_labels + centers2d = [ + torch.rand([3, 2], dtype=torch.float32).cuda(), + torch.rand([3, 2], dtype=torch.float32).cuda(), + ] + depths = [ + torch.rand([3], dtype=torch.float32).cuda(), + torch.rand([3], dtype=torch.float32).cuda(), + ] + attr_labels = [torch.randint(0, 9, [3], device='cuda') for i in range(2)] + img_metas = [ + dict( + cam_intrinsic=[[1260.8474446004698, 0.0, 807.968244525554], + [0.0, 1260.8474446004698, 495.3344268742088], + [0.0, 0.0, 1.0]], + scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32), + box_type_3d=CameraInstance3DBoxes) + ] + losses = self.loss(*ret_dict, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, img_metas) + assert losses['loss_cls'] >= 0 + assert losses['loss_offset'] >= 0 + assert losses['loss_depth'] >= 0 + assert losses['loss_size'] >= 0 + assert losses['loss_rotsin'] >= 0 + assert losses['loss_centerness'] >= 0 + assert losses['loss_velo'] >= 0 + assert losses['loss_dir'] >= 0 + assert losses['loss_attr'] >= 0 + + # test get_boxes + feats = [ + torch.rand([1, 256, 116, 200], dtype=torch.float32).cuda(), + torch.rand([1, 256, 58, 100], dtype=torch.float32).cuda(), + torch.rand([1, 256, 29, 50], dtype=torch.float32).cuda(), + 
torch.rand([1, 256, 15, 25], dtype=torch.float32).cuda(), + torch.rand([1, 256, 8, 13], dtype=torch.float32).cuda() + ] + ret_dict = self(feats) + results = self.get_bboxes(*ret_dict, img_metas) + assert len(results) == 1 + assert len(results[0]) == 4 + assert results[0][0].tensor.shape == torch.Size([200, 9]) + assert results[0][1].shape == torch.Size([200]) + assert results[0][2].shape == torch.Size([200]) + assert results[0][3].shape == torch.Size([200]) From bb50b852aa4a36c24283fdfa1118371a733d7c60 Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Tue, 20 Apr 2021 10:49:16 +0800 Subject: [PATCH 08/12] Fix a minor bug when setting img_metas in the unit test --- tests/test_models/test_heads/test_heads.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/test_models/test_heads/test_heads.py b/tests/test_models/test_heads/test_heads.py index 9fe27ee394..c2eba8d55d 100644 --- a/tests/test_models/test_heads/test_heads.py +++ b/tests/test_models/test_heads/test_heads.py @@ -1092,7 +1092,7 @@ def test_fcos_mono3d_head(): [0.0, 1260.8474446004698, 495.3344268742088], [0.0, 0.0, 1.0]], scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32), - box_type_3d=CameraInstance3DBoxes) + box_type_3d=CameraInstance3DBoxes) for i in range(2) ] losses = self.loss(*ret_dict, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, img_metas) @@ -1107,16 +1107,8 @@ def test_fcos_mono3d_head(): assert losses['loss_attr'] >= 0 # test get_boxes - feats = [ - torch.rand([1, 256, 116, 200], dtype=torch.float32).cuda(), - torch.rand([1, 256, 58, 100], dtype=torch.float32).cuda(), - torch.rand([1, 256, 29, 50], dtype=torch.float32).cuda(), - torch.rand([1, 256, 15, 25], dtype=torch.float32).cuda(), - torch.rand([1, 256, 8, 13], dtype=torch.float32).cuda() - ] - ret_dict = self(feats) results = self.get_bboxes(*ret_dict, img_metas) - assert len(results) == 1 + assert len(results) == 2 assert len(results[0]) == 4 assert results[0][0].tensor.shape == torch.Size([200, 9]) assert results[0][1].shape == torch.Size([200]) From cd31401c02cf4a89e6a3c4e04e26b9b06780a4ca Mon Sep 17 00:00:00 2001 From: Tai-Wang Date: Tue, 20 Apr 2021 11:48:24 +0800 Subject: [PATCH 09/12] Add unit test for fcos3d detector --- tests/test_models/test_detectors.py | 60 +++++++++++++++++++++- tests/test_models/test_heads/test_heads.py | 6 +-- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/tests/test_models/test_detectors.py b/tests/test_models/test_detectors.py index aa3fef8e55..011b93f873 100644 --- a/tests/test_models/test_detectors.py +++ b/tests/test_models/test_detectors.py @@ -5,7 +5,8 @@ import torch from os.path import dirname, exists, join -from mmdet3d.core.bbox import DepthInstance3DBoxes, LiDARInstance3DBoxes +from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, + LiDARInstance3DBoxes) from mmdet3d.models.builder import build_detector @@ -316,3 +317,60 @@ def test_centerpoint(): assert boxes_3d_0.tensor.shape[1] == 9 assert scores_3d_0.shape[0] >= 0 assert labels_3d_0.shape[0] >= 0 + + +def test_fcos3d(): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + + _setup_seed(0) + fcos3d_cfg = _get_detector_cfg( + 'fcos3d/fcos3d_r101_caffe_fpn_gn-head_dcn_2x8_1x_nus-mono3d.py') + self = build_detector(fcos3d_cfg).cuda() + imgs = torch.rand([1, 3, 928, 1600], dtype=torch.float32).cuda() + gt_bboxes = [torch.rand([3, 4], dtype=torch.float32).cuda()] + gt_bboxes_3d = CameraInstance3DBoxes( + torch.rand([3, 9], 
device='cuda'), box_dim=9)
+    gt_labels = [torch.randint(0, 10, [3], device='cuda')]
+    gt_labels_3d = gt_labels
+    centers2d = [torch.rand([3, 2], dtype=torch.float32).cuda()]
+    depths = [torch.rand([3], dtype=torch.float32).cuda()]
+    attr_labels = [torch.randint(0, 9, [3], device='cuda')]
+    img_metas = [
+        dict(
+            cam_intrinsic=[[1260.8474446004698, 0.0, 807.968244525554],
+                           [0.0, 1260.8474446004698, 495.3344268742088],
+                           [0.0, 0.0, 1.0]],
+            scale_factor=np.array([1., 1., 1., 1.], dtype=np.float32),
+            box_type_3d=CameraInstance3DBoxes)
+    ]
+
+    # test forward_train
+    losses = self.forward_train(imgs, img_metas, gt_bboxes, gt_labels,
+                                gt_bboxes_3d, gt_labels_3d, centers2d, depths,
+                                attr_labels)
+    assert losses['loss_cls'] >= 0
+    assert losses['loss_offset'] >= 0
+    assert losses['loss_depth'] >= 0
+    assert losses['loss_size'] >= 0
+    assert losses['loss_rotsin'] >= 0
+    assert losses['loss_centerness'] >= 0
+    assert losses['loss_velo'] >= 0
+    assert losses['loss_dir'] >= 0
+    assert losses['loss_attr'] >= 0
+
+    # test simple_test
+    results = self.simple_test(imgs, img_metas)
+    boxes_3d = results[0]['img_bbox']['boxes_3d']
+    scores_3d = results[0]['img_bbox']['scores_3d']
+    labels_3d = results[0]['img_bbox']['labels_3d']
+    attrs_3d = results[0]['img_bbox']['attrs_3d']
+    assert boxes_3d.tensor.shape[0] >= 0
+    assert boxes_3d.tensor.shape[1] == 9
+    assert scores_3d.shape[0] >= 0
+    assert labels_3d.shape[0] >= 0
+    assert attrs_3d.shape[0] >= 0
+
+
+if __name__ == '__main__':
+    test_fcos3d()

diff --git a/tests/test_models/test_heads/test_heads.py b/tests/test_models/test_heads/test_heads.py
index c2eba8d55d..7e9865ab37 100644
--- a/tests/test_models/test_heads/test_heads.py
+++ b/tests/test_models/test_heads/test_heads.py
@@ -1071,7 +1071,7 @@ def test_fcos_mono3d_head():
     # test loss
     gt_bboxes = [
         torch.rand([3, 4], dtype=torch.float32).cuda(),
-        torch.rand([3, 4], dtype=torch.float32).cuda(),
+        torch.rand([3, 4], dtype=torch.float32).cuda()
     ]
     gt_bboxes_3d = CameraInstance3DBoxes(
         torch.rand([3, 9], device='cuda'), box_dim=9)
@@ -1079,11 +1079,11 @@ def test_fcos_mono3d_head():
     gt_labels_3d = gt_labels
     centers2d = [
         torch.rand([3, 2], dtype=torch.float32).cuda(),
-        torch.rand([3, 2], dtype=torch.float32).cuda(),
+        torch.rand([3, 2], dtype=torch.float32).cuda()
     ]
     depths = [
         torch.rand([3], dtype=torch.float32).cuda(),
-        torch.rand([3], dtype=torch.float32).cuda(),
+        torch.rand([3], dtype=torch.float32).cuda()
     ]
     attr_labels = [torch.randint(0, 9, [3], device='cuda') for i in range(2)]
     img_metas = [

From 7c888dd99bc36489f4eda2a8fff9fb7692b37498 Mon Sep 17 00:00:00 2001
From: Tai-Wang
Date: Wed, 21 Apr 2021 15:05:45 +0800
Subject: [PATCH 10/12] Simplify the weight initialization logic

---
 .../models/dense_heads/anchor_free_mono3d_head.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
index 27ae1ab50a..a9f45abbb6 100644
--- a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
+++ b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
@@ -266,15 +266,10 @@ def _init_predictor(self):

     def init_weights(self):
         """Initialize weights of the head."""
-        for m in self.cls_convs:
-            if isinstance(m.conv, nn.Conv2d):
-                normal_init(m.conv, std=0.01)
-        for m in self.reg_convs:
-            if isinstance(m.conv, nn.Conv2d):
-                normal_init(m.conv, std=0.01)
-        for m in self.conv_cls_prev:
-            if isinstance(m.conv, nn.Conv2d):
-                normal_init(m.conv, std=0.01)
+        for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]:
+            for m in modules:
+                if isinstance(m.conv, nn.Conv2d):
+                    normal_init(m.conv, std=0.01)
         for conv_reg_prev in self.conv_reg_prevs:
             if conv_reg_prev is None:
                 continue

From 80fcc9d65046d2608530e0a103513803cf4a223a Mon Sep 17 00:00:00 2001
From: Tai-Wang
Date: Wed, 21 Apr 2021 15:09:09 +0800
Subject: [PATCH 11/12] Add comments to specify the reason for cloning
 features

---
 mmdet3d/models/dense_heads/anchor_free_mono3d_head.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
index a9f45abbb6..7dea2e3f07 100644
--- a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
+++ b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py
@@ -334,6 +334,7 @@ def forward_single(self, x):

         for cls_layer in self.cls_convs:
             cls_feat = cls_layer(cls_feat)
+        # clone cls_feat so the shared feature map can be reused
         clone_cls_feat = cls_feat.clone()
         for conv_cls_prev_layer in self.conv_cls_prev:
             clone_cls_feat = conv_cls_prev_layer(clone_cls_feat)
@@ -343,6 +344,7 @@ def forward_single(self, x):
             reg_feat = reg_layer(reg_feat)
         bbox_pred = []
         for i in range(len(self.group_reg_dims)):
+            # clone reg_feat so the shared feature map can be reused
             clone_reg_feat = reg_feat.clone()
             if len(self.reg_branch[i]) > 0:
                 for conv_reg_prev_layer in self.conv_reg_prevs[i]:
                     clone_reg_feat = conv_reg_prev_layer(clone_reg_feat)
@@ -359,6 +361,7 @@ def forward_single(self, x):
         attr_pred = None
         if self.pred_attrs:
+            # clone cls_feat so the shared feature map can be reused
             clone_cls_feat = cls_feat.clone()
             for conv_attr_prev_layer in self.conv_attr_prev:
                 clone_cls_feat = conv_attr_prev_layer(clone_cls_feat)

From a0575628f578b9f721fb6df9d62927192468fb61 Mon Sep 17 00:00:00 2001
From: Tai-Wang
Date: Fri, 23 Apr 2021 20:25:58 +0800
Subject: [PATCH 12/12] Update head config

---
 configs/_base_/models/fcos3d.py     | 14 +++++++-------
 tests/test_models/test_detectors.py |  4 ----
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/configs/_base_/models/fcos3d.py b/configs/_base_/models/fcos3d.py
index 3a1797b262..b0928e455e 100644
--- a/configs/_base_/models/fcos3d.py
+++ b/configs/_base_/models/fcos3d.py
@@ -32,16 +32,16 @@
         dir_offset=0.7854,  # pi/4
         strides=[8, 16, 32, 64, 128],
         group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
-        cls_branch=(128, 64),
+        cls_branch=(256, ),
         reg_branch=(
-            (128, 64),  # offset
-            (128, 64),  # depth
-            (64, ),  # size
-            (64, ),  # rot
+            (256, ),  # offset
+            (256, ),  # depth
+            (256, ),  # size
+            (256, ),  # rot
             ()  # velo
         ),
-        dir_branch=(64, ),
-        attr_branch=(64, ),
+        dir_branch=(256, ),
+        attr_branch=(256, ),
         loss_cls=dict(
             type='FocalLoss',
             use_sigmoid=True,

diff --git a/tests/test_models/test_detectors.py b/tests/test_models/test_detectors.py
index 011b93f873..1a08b20486 100644
--- a/tests/test_models/test_detectors.py
+++ b/tests/test_models/test_detectors.py
@@ -370,7 +370,3 @@ def test_fcos3d():
     assert scores_3d.shape[0] >= 0
     assert labels_3d.shape[0] >= 0
     assert attrs_3d.shape[0] >= 0
-
-
-if __name__ == '__main__':
-    test_fcos3d()
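A note on diff_rad_by_sin=True in the head config above: rather than regressing the raw yaw difference, the rotation channel of predictions and targets is replaced by the cross terms sin(a)cos(b) and cos(a)sin(b), so an L1-style loss on the encoded pair effectively penalizes sin(a - b). The snippet below is a minimal sketch of the idea on plain 1-D rotation tensors; the helper of the same name inside the head operates on full box tensors, so treat names and signatures here as illustrative only.

import math

import torch
import torch.nn.functional as F


def add_sin_difference(rot_pred, rot_target):
    # sin(a - b) = sin(a)cos(b) - cos(a)sin(b), so regressing the two
    # cross terms against each other penalizes sin(a - b) directly
    rad_pred_encoding = torch.sin(rot_pred) * torch.cos(rot_target)
    rad_target_encoding = torch.cos(rot_pred) * torch.sin(rot_target)
    return rad_pred_encoding, rad_target_encoding


pred = torch.tensor([0.3, 1.2])
target = torch.tensor([0.3, 1.2 - math.pi])  # second target flipped by pi
enc_pred, enc_target = add_sin_difference(pred, target)
print(F.smooth_l1_loss(enc_pred, enc_target))  # ~0 for both entries

Because sin() cannot distinguish a box from its pi-flipped twin (the second entry above also yields zero loss), the head additionally trains a direction classifier, which is what use_direction_classifier=True and loss_dir in the config are for.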
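Relatedly, dir_offset=0.7854 (pi/4) in the config shifts yaw angles before they are bucketed into direction bins for that classifier. Below is a rough sketch of how a two-bin direction target can be derived; limit_period mirrors the angle-wrapping utility that fcos_mono3d_head.py imports from mmdet3d.core, re-implemented here only to keep the snippet self-contained, so take the exact signatures as assumptions.

import math

import torch


def limit_period(val, offset=0.5, period=math.pi):
    # wrap angles into [-offset * period, (1 - offset) * period)
    return val - torch.floor(val / period + offset) * period


def get_direction_target(rot_gt, dir_offset=math.pi / 4, num_bins=2):
    # shift by dir_offset, wrap into [0, 2 * pi), then bucket into bins
    offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * math.pi)
    return torch.floor(offset_rot / (2 * math.pi / num_bins)).long()


rot = torch.tensor([0., math.pi / 2, -math.pi / 2, math.pi])
print(get_direction_target(rot))  # tensor([1, 0, 1, 0])

Note how pi/2 and -pi/2, which the sin encoding above cannot tell apart, land in different bins; that is exactly the ambiguity the direction branch resolves.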
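Finally, on how group_reg_dims=(2, 1, 3, 1, 2) and reg_branch cooperate: each regression group (offset, depth, size, rot, velo) runs its own small conv tower on a clone of the shared regression feature, followed by a 1x1 prediction conv, and the group outputs are concatenated into the 9-channel box code that code_weight in train_cfg weights per dimension. The miniature below is a hypothetical, stripped-down rendering of that wiring; the real head builds ConvModule towers with normalization, while this sketch uses plain convs. The .clone() calls mirror the rationale documented in PATCH 11.

import torch
from torch import nn


class GroupedRegBranches(nn.Module):
    """Hypothetical miniature of the grouped regression branches."""

    def __init__(self,
                 feat_channels=256,
                 group_reg_dims=(2, 1, 3, 1, 2),  # offset, depth, size, rot, velo
                 reg_branch=((256, ), (256, ), (256, ), (256, ), ())):
        super().__init__()
        self.towers = nn.ModuleList()
        self.preds = nn.ModuleList()
        for dims, branch in zip(group_reg_dims, reg_branch):
            layers, in_channels = [], feat_channels
            for out_channels in branch:  # optional per-group tower
                layers += [
                    nn.Conv2d(in_channels, out_channels, 3, padding=1),
                    nn.ReLU(inplace=True)
                ]
                in_channels = out_channels
            self.towers.append(nn.Sequential(*layers))
            self.preds.append(nn.Conv2d(in_channels, dims, 1))  # 1x1 pred conv

    def forward(self, reg_feat):
        outs = []
        for tower, pred in zip(self.towers, self.preds):
            # clone so every group consumes an untouched copy of the
            # shared feature map, cf. the comments added in PATCH 11
            outs.append(pred(tower(reg_feat.clone())))
        return torch.cat(outs, dim=1)  # 2 + 1 + 3 + 1 + 2 = 9 channels


branches = GroupedRegBranches()
bbox_pred = branches(torch.rand(2, 256, 29, 50))
assert bbox_pred.shape == (2, 9, 29, 50)

The empty tuple for the velo group means that branch has no tower at all, only the final 1x1 conv, which matches the reg_branch layout kept through PATCH 12.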