open-mmlab · JingweiZhang12 · May 11, 2023 · Feb 27, 2023 · Feb 28, 2023 · Mar 1, 2023
diff --git a/mmdet3d/models/decode_heads/__init__.py b/mmdet3d/models/decode_heads/__init__.py
@@ -1,11 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from .cylinder3d_head import Cylinder3DHead
+from .decode_head import Base3DDecodeHead
 from .dgcnn_head import DGCNNHead
 from .minkunet_head import MinkUNetHead
 from .paconv_head import PAConvHead
 from .pointnet2_head import PointNet2Head
 
 __all__ = [
  'PointNet2Head', 'DGCNNHead', 'PAConvHead', 'Cylinder3DHead',
- 'MinkUNetHead'
+ 'Base3DDecodeHead', 'MinkUNetHead'
 ]
diff --git a/...configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py b/...configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
@@ -1,4 +1,4 @@
-_base_ = ['mmdet3d::_base_/default_runtime.py']
+_base_ = ['../../../configs/_base_/default_runtime.py']
 custom_imports = dict(
  imports=['projects.CenterFormer.centerformer'], allow_failed_imports=False)
 

diff --git a/projects/DETR3D/configs/detr3d_r101_gridmask.py b/projects/DETR3D/configs/detr3d_r101_gridmask.py
@@ -1,6 +1,6 @@
 _base_ = [
  # 'mmdet3d::_base_/datasets/nus-3d.py',
- 'mmdet3d::_base_/default_runtime.py'
+ '../../../configs/_base_/default_runtime.py'
 ]
 
 custom_imports = dict(imports=['projects.DETR3D.detr3d'])

diff --git a/projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py b/projects/PETR/configs/petr_vovnet_gridmask_p4_800x320.py
@@ -1,6 +1,7 @@
 _base_ = [
- 'mmdet3d::_base_/datasets/nus-3d.py', 'mmdet3d::_base_/default_runtime.py',
- 'mmdet3d::_base_/schedules/cyclic-20e.py'
+ '../../../configs/_base_/datasets/nus-3d.py',
+ '../../../configs/_base_/default_runtime.py',
+ '../../../configs/_base_/schedules/cyclic-20e.py'
 ]
 backbone_norm_cfg = dict(type='LN', requires_grad=True)
 custom_imports = dict(imports=['projects.PETR.petr'])

diff --git a/projects/TPVFormer/config/tpvformer_8xb1-2x_nus-seg.py b/projects/TPVFormer/config/tpvformer_8xb1-2x_nus-seg.py
@@ -0,0 +1,317 @@
+_base_ = ['../../../configs/_base_/default_runtime.py']
+
+custom_imports = dict(
+ imports=['projects.TPVFormer.tpvformer'], allow_failed_imports=False)
+
+dataset_type = 'NuScenesSegDataset'
+data_root = 'data/nuscenes/'
+data_prefix = dict(
+ pts='samples/LIDAR_TOP',
+ pts_semantic_mask='lidarseg/v1.0-trainval',
+ CAM_FRONT='samples/CAM_FRONT',
+ CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT',
+ CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT',
+ CAM_BACK='samples/CAM_BACK',
+ CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
+ CAM_BACK_LEFT='samples/CAM_BACK_LEFT')
+
+backend_args = None
+
+train_pipeline = [
+ dict(
+ type='BEVLoadMultiViewImageFromFiles',
+ to_float32=False,
+ color_type='unchanged',
+ num_views=6,
+ backend_args=backend_args),
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=3,
+ backend_args=backend_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_seg_3d=True,
+ with_attr_label=False,
+ seg_3d_dtype='np.uint8'),
+ dict(
+ type='MultiViewWrapper',
+ transforms=dict(type='PhotoMetricDistortion3D')),
+ dict(type='SegLabelMapping'),
+ dict(
+ type='Pack3DDetInputs',
+ keys=['img', 'points', 'pts_semantic_mask'],
+ meta_keys=['lidar2img'])
+]
+
+val_pipeline = [
+ dict(
+ type='BEVLoadMultiViewImageFromFiles',
+ to_float32=False,
+ color_type='unchanged',
+ num_views=6,
+ backend_args=backend_args),
+ dict(
+ type='LoadPointsFromFile',
+ coord_type='LIDAR',
+ load_dim=5,
+ use_dim=3,
+ backend_args=backend_args),
+ dict(
+ type='LoadAnnotations3D',
+ with_bbox_3d=False,
+ with_label_3d=False,
+ with_seg_3d=True,
+ with_attr_label=False,
+ seg_3d_dtype='np.uint8'),
+ dict(type='SegLabelMapping'),
+ dict(
+ type='Pack3DDetInputs',
+ keys=['img', 'points', 'pts_semantic_mask'],
+ meta_keys=['lidar2img'])
+]
+
+test_pipeline = val_pipeline
+
+train_dataloader = dict(
+ batch_size=1,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_prefix=data_prefix,
+ ann_file='nuscenes_infos_train.pkl',
+ pipeline=train_pipeline,
+ test_mode=False))
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=4,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ data_prefix=data_prefix,
+ ann_file='nuscenes_infos_val.pkl',
+ pipeline=val_pipeline,
+ test_mode=True))
+
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='SegMetric')
+
+test_evaluator = val_evaluator
+
+vis_backends = [dict(type='LocalVisBackend')]
+visualizer = dict(
+ type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01),
+ paramwise_cfg=dict(custom_keys={
+ 'backbone': dict(lr_mult=0.1),
+ }),
+ clip_grad=dict(max_norm=35, norm_type=2),
+)
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=1e-5, by_epoch=False, begin=0, end=500),
+ dict(
+ type='CosineAnnealingLR',
+ begin=0,
+ T_max=24,
+ by_epoch=True,
+ eta_min=1e-6,
+ convert_to_iter_based=True)
+]
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1))
+
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+_dim_ = 128
+num_heads = 8
+_ffn_dim_ = _dim_ * 2
+
+tpv_h_ = 200
+tpv_w_ = 200
+tpv_z_ = 16
+scale_h = 1
+scale_w = 1
+scale_z = 1
+num_points_in_pillar = [4, 32, 32]
+num_points = [8, 64, 64]
+hybrid_attn_anchors = 16
+hybrid_attn_points = 32
+hybrid_attn_init = 0
+
+grid_shape = [tpv_h_ * scale_h, tpv_w_ * scale_w, tpv_z_ * scale_z]
+
+self_cross_layer = dict(
+ type='TPVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TPVCrossViewHybridAttention',
+ tpv_h=tpv_h_,
+ tpv_w=tpv_w_,
+ tpv_z=tpv_z_,
+ num_anchors=hybrid_attn_anchors,
+ embed_dims=_dim_,
+ num_heads=num_heads,
+ num_points=hybrid_attn_points,
+ init_mode=hybrid_attn_init,
+ dropout=0.1),
+ dict(
+ type='TPVImageCrossAttention',
+ pc_range=point_cloud_range,
+ num_cams=6,
+ dropout=0.1,
+ deformable_attention=dict(
+ type='TPVMSDeformableAttention3D',
+ embed_dims=_dim_,
+ num_heads=num_heads,
+ num_points=num_points,
+ num_z_anchors=num_points_in_pillar,
+ num_levels=4,
+ floor_sampling_offset=False,
+ tpv_h=tpv_h_,
+ tpv_w=tpv_w_,
+ tpv_z=tpv_z_),
+ embed_dims=_dim_,
+ tpv_h=tpv_h_,
+ tpv_w=tpv_w_,
+ tpv_z=tpv_z_)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'))
+
+self_layer = dict(
+ type='TPVFormerLayer',
+ attn_cfgs=[
+ dict(
+ type='TPVCrossViewHybridAttention',
+ tpv_h=tpv_h_,
+ tpv_w=tpv_w_,
+ tpv_z=tpv_z_,
+ num_anchors=hybrid_attn_anchors,
+ embed_dims=_dim_,
+ num_heads=num_heads,
+ num_points=hybrid_attn_points,
+ init_mode=hybrid_attn_init,
+ dropout=0.1)
+ ],
+ feedforward_channels=_ffn_dim_,
+ ffn_dropout=0.1,
+ operation_order=('self_attn', 'norm', 'ffn', 'norm'))
+
+model = dict(
+ type='TPVFormer',
+ data_preprocessor=dict(
+ type='TPVFormerDataPreprocessor',
+ pad_size_divisor=32,
+ mean=[103.530, 116.280, 123.675],
+ std=[1.0, 1.0, 1.0],
+ voxel=True,
+ voxel_type='cylindrical',
+ voxel_layer=dict(
+ grid_shape=grid_shape,
+ point_cloud_range=point_cloud_range,
+ max_num_points=-1,
+ max_voxels=-1,
+ ),
+ batch_augments=[
+ dict(
+ type='GridMask',
+ use_h=True,
+ use_w=True,
+ rotate=1,
+ offset=False,
+ ratio=0.5,
+ mode=1,
+ prob=0.7)
+ ]),
+ backbone=dict(
+ type='mmdet.ResNet',
+ depth=101,
+ num_stages=4,
+ out_indices=(1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN2d', requires_grad=False),
+ norm_eval=True,
+ style='caffe',
+ dcn=dict(
+ type='DCNv2', deform_groups=1, fallback_on_stride=False
+ ), # original DCNv2 prints a log when performing load_state_dict
+ stage_with_dcn=(False, False, True, True),
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='checkpoints/tpvformer_r101_dcn_fcos3d_pretrain.pth',
+ prefix='backbone.')),
+ neck=dict(
+ type='mmdet.FPN',
+ in_channels=[512, 1024, 2048],
+ out_channels=_dim_,
+ start_level=0,
+ add_extra_convs='on_output',
+ num_outs=4,
+ relu_before_extra_convs=True,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='checkpoints/tpvformer_r101_dcn_fcos3d_pretrain.pth',
+ prefix='neck.')),
+ encoder=dict(
+ type='TPVFormerEncoder',
+ tpv_h=tpv_h_,
+ tpv_w=tpv_w_,
+ tpv_z=tpv_z_,
+ num_layers=5,
+ pc_range=point_cloud_range,
+ num_points_in_pillar=num_points_in_pillar,
+ num_points_in_pillar_cross_view=[16, 16, 16],
+ return_intermediate=False,
+ transformerlayers=[
+ self_cross_layer, self_cross_layer, self_cross_layer, self_layer,
+ self_layer
+ ],
+ embed_dims=_dim_,
+ positional_encoding=dict(
+ type='TPVFormerPositionalEncoding',
+ num_feats=[48, 48, 32],
+ h=tpv_h_,
+ w=tpv_w_,
+ z=tpv_z_)),
+ decode_head=dict(
+ type='TPVFormerDecoder',
+ tpv_h=tpv_h_,
+ tpv_w=tpv_w_,
+ tpv_z=tpv_z_,
+ num_classes=17,
+ in_dims=_dim_,
+ hidden_dims=2 * _dim_,
+ out_dims=_dim_,
+ scale_h=scale_h,
+ scale_w=scale_w,
+ scale_z=scale_z,
+ loss_ce=dict(
+ type='mmdet.CrossEntropyLoss',
+ use_sigmoid=False,
+ class_weight=None,
+ avg_non_ignore=True,
+ loss_weight=1.0),
+ loss_lovasz=dict(type='LovaszLoss', loss_weight=1.0, reduction='none'),
+ lovasz_input='points',
+ ce_input='voxel',
+ ignore_index=0))
diff --git a/projects/TPVFormer/tpvformer/__init__.py b/projects/TPVFormer/tpvformer/__init__.py
@@ -0,0 +1,17 @@
+from .cross_view_hybrid_attention import TPVCrossViewHybridAttention
+from .data_preprocessor import TPVFormerDataPreprocessor
+from .image_cross_attention import TPVImageCrossAttention
+from .loading import BEVLoadMultiViewImageFromFiles, SegLabelMapping
+from .nuscenes_dataset import NuScenesSegDataset
+from .positional_encoding import TPVFormerPositionalEncoding
+from .tpvformer import TPVFormer
+from .tpvformer_encoder import TPVFormerEncoder
+from .tpvformer_head import TPVFormerDecoder
+from .tpvformer_layer import TPVFormerLayer
+
+__all__ = [
+ 'TPVCrossViewHybridAttention', 'TPVImageCrossAttention',
+ 'TPVFormerPositionalEncoding', 'TPVFormer', 'TPVFormerEncoder',
+ 'TPVFormerLayer', 'NuScenesSegDataset', 'BEVLoadMultiViewImageFromFiles',
+ 'SegLabelMapping', 'TPVFormerDecoder', 'TPVFormerDataPreprocessor'
+]