-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
2,449 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,12 @@ | ||
# Copyright (c) OpenMMLab. All rights reserved. | ||
from .cylinder3d_head import Cylinder3DHead | ||
from .decode_head import Base3DDecodeHead | ||
from .dgcnn_head import DGCNNHead | ||
from .minkunet_head import MinkUNetHead | ||
from .paconv_head import PAConvHead | ||
from .pointnet2_head import PointNet2Head | ||
|
||
__all__ = [ | ||
'PointNet2Head', 'DGCNNHead', 'PAConvHead', 'Cylinder3DHead', | ||
'MinkUNetHead' | ||
'Base3DDecodeHead', 'MinkUNetHead' | ||
] |
2 changes: 1 addition & 1 deletion
2
...configs/centerformer_voxel01_second-attn_secfpn-attn_4xb4-cyclic-20e_waymoD5-3d-3class.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,317 @@ | ||
# Inherit settings from the repository-level default runtime config. | ||
_base_ = ['../../../configs/_base_/default_runtime.py'] | ||
|
||
# Import the TPVFormer project package that provides the custom types referenced by name below; allow_failed_imports is explicitly disabled. | ||
custom_imports = dict( | ||
imports=['projects.TPVFormer.tpvformer'], allow_failed_imports=False) | ||
|
||
# Dataset class name and dataset root directory shared by the dataloaders below. | ||
dataset_type = 'NuScenesSegDataset' | ||
data_root = 'data/nuscenes/' | ||
# Sub-directory for each input: LiDAR points, lidarseg masks and six camera views. | ||
# NOTE(review): presumably resolved relative to data_root by the dataset class - confirm. | ||
data_prefix = dict( | ||
pts='samples/LIDAR_TOP', | ||
pts_semantic_mask='lidarseg/v1.0-trainval', | ||
CAM_FRONT='samples/CAM_FRONT', | ||
CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', | ||
CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', | ||
CAM_BACK='samples/CAM_BACK', | ||
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', | ||
CAM_BACK_LEFT='samples/CAM_BACK_LEFT') | ||
|
||
# Forwarded to the image and point loading transforms in both pipelines below. | ||
# NOTE(review): None presumably selects the default local file backend - confirm. | ||
backend_args = None | ||
|
||
# Training pipeline: load six camera views and the LiDAR points, load the 3D segmentation mask, | ||
# distort the views photometrically, map the segmentation labels, then pack the inputs. | ||
train_pipeline = [ | ||
dict( | ||
type='BEVLoadMultiViewImageFromFiles', | ||
to_float32=False, | ||
color_type='unchanged', | ||
num_views=6, | ||
backend_args=backend_args), | ||
# Points are loaded with load_dim=5 and reduced to use_dim=3. | ||
dict( | ||
type='LoadPointsFromFile', | ||
coord_type='LIDAR', | ||
load_dim=5, | ||
use_dim=3, | ||
backend_args=backend_args), | ||
# Only the 3D segmentation mask is requested; boxes, labels and attribute labels are disabled. | ||
dict( | ||
type='LoadAnnotations3D', | ||
with_bbox_3d=False, | ||
with_label_3d=False, | ||
with_seg_3d=True, | ||
with_attr_label=False, | ||
seg_3d_dtype='np.uint8'), | ||
# Training-only augmentation; val_pipeline has no such step. | ||
dict( | ||
type='MultiViewWrapper', | ||
transforms=dict(type='PhotoMetricDistortion3D')), | ||
dict(type='SegLabelMapping'), | ||
# Pack images, points and the semantic mask; 'lidar2img' is the only meta key listed. | ||
dict( | ||
type='Pack3DDetInputs', | ||
keys=['img', 'points', 'pts_semantic_mask'], | ||
meta_keys=['lidar2img']) | ||
] | ||
|
||
# Validation pipeline: the same steps as train_pipeline without the MultiViewWrapper / PhotoMetricDistortion3D step. | ||
val_pipeline = [ | ||
dict( | ||
type='BEVLoadMultiViewImageFromFiles', | ||
to_float32=False, | ||
color_type='unchanged', | ||
num_views=6, | ||
backend_args=backend_args), | ||
dict( | ||
type='LoadPointsFromFile', | ||
coord_type='LIDAR', | ||
load_dim=5, | ||
use_dim=3, | ||
backend_args=backend_args), | ||
# The segmentation mask is still loaded here and packed below. | ||
dict( | ||
type='LoadAnnotations3D', | ||
with_bbox_3d=False, | ||
with_label_3d=False, | ||
with_seg_3d=True, | ||
with_attr_label=False, | ||
seg_3d_dtype='np.uint8'), | ||
dict(type='SegLabelMapping'), | ||
dict( | ||
type='Pack3DDetInputs', | ||
keys=['img', 'points', 'pts_semantic_mask'], | ||
meta_keys=['lidar2img']) | ||
] | ||
|
||
# Testing reuses the validation pipeline object as-is. | ||
test_pipeline = val_pipeline | ||
|
||
# Training dataloader: batch_size=1, shuffled sampling, incomplete last batch dropped (drop_last=True). | ||
train_dataloader = dict( | ||
batch_size=1, | ||
num_workers=4, | ||
persistent_workers=True, | ||
drop_last=True, | ||
sampler=dict(type='DefaultSampler', shuffle=True), | ||
# Training split annotations with the training pipeline. | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
data_prefix=data_prefix, | ||
ann_file='nuscenes_infos_train.pkl', | ||
pipeline=train_pipeline, | ||
test_mode=False)) | ||
|
||
# Validation dataloader: batch_size=1, no shuffling, last batch kept (drop_last=False). | ||
val_dataloader = dict( | ||
batch_size=1, | ||
num_workers=4, | ||
persistent_workers=True, | ||
drop_last=False, | ||
sampler=dict(type='DefaultSampler', shuffle=False), | ||
# Validation split annotations with the validation pipeline, in test mode. | ||
dataset=dict( | ||
type=dataset_type, | ||
data_root=data_root, | ||
data_prefix=data_prefix, | ||
ann_file='nuscenes_infos_val.pkl', | ||
pipeline=val_pipeline, | ||
test_mode=True)) | ||
|
||
# Testing reuses the validation dataloader settings, so it also reads the validation annotation file. | ||
test_dataloader = val_dataloader | ||
|
||
# Validation is scored with the 'SegMetric' evaluator. | ||
val_evaluator = dict(type='SegMetric') | ||
|
||
# Testing reuses the validation evaluator settings. | ||
test_evaluator = val_evaluator | ||
|
||
# Visualizer with a single 'LocalVisBackend' backend. | ||
vis_backends = [dict(type='LocalVisBackend')] | ||
visualizer = dict( | ||
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') | ||
|
||
# Optimizer: AdamW with lr=2e-4 and weight_decay=0.01. | ||
optim_wrapper = dict( | ||
type='OptimWrapper', | ||
optimizer=dict(type='AdamW', lr=2e-4, weight_decay=0.01), | ||
# The 'backbone' custom key gets a learning-rate multiplier of 0.1. | ||
paramwise_cfg=dict(custom_keys={ | ||
'backbone': dict(lr_mult=0.1), | ||
}), | ||
# Gradient clipping with max_norm=35 and norm_type=2. | ||
clip_grad=dict(max_norm=35, norm_type=2), | ||
) | ||
|
||
# Learning-rate schedule: an iteration-based LinearLR over iterations 0-500 starting at factor 1e-5, | ||
# plus an epoch-based CosineAnnealingLR (T_max=24, eta_min=1e-6) converted to iteration-based. | ||
# T_max equals max_epochs in train_cfg below. | ||
param_scheduler = [ | ||
dict(type='LinearLR', start_factor=1e-5, by_epoch=False, begin=0, end=500), | ||
dict( | ||
type='CosineAnnealingLR', | ||
begin=0, | ||
T_max=24, | ||
by_epoch=True, | ||
eta_min=1e-6, | ||
convert_to_iter_based=True) | ||
] | ||
|
||
# Epoch-based training for 24 epochs with val_interval=1. | ||
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=24, val_interval=1) | ||
val_cfg = dict(type='ValLoop') | ||
test_cfg = dict(type='TestLoop') | ||
|
||
# Checkpoint hook with interval=1. | ||
default_hooks = dict(checkpoint=dict(type='CheckpointHook', interval=1)) | ||
|
||
# Shared hyper-parameters referenced by the layer and model definitions below. | ||
# NOTE(review): point_cloud_range presumably reads [x_min, y_min, z_min, x_max, y_max, z_max] - confirm. | ||
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] | ||
_dim_ = 128 | ||
num_heads = 8 | ||
# Feed-forward width is twice the embedding width (256 with _dim_ = 128). | ||
_ffn_dim_ = _dim_ * 2 | ||
|
||
# TPV plane sizes and the per-axis scale factors applied to them in grid_shape. | ||
tpv_h_ = 200 | ||
tpv_w_ = 200 | ||
tpv_z_ = 16 | ||
scale_h = 1 | ||
scale_w = 1 | ||
scale_z = 1 | ||
# Three-element settings used by the image cross-attention and the encoder below. | ||
# NOTE(review): presumably one entry per TPV plane - confirm. | ||
num_points_in_pillar = [4, 32, 32] | ||
num_points = [8, 64, 64] | ||
# Settings of TPVCrossViewHybridAttention (num_anchors, num_points, init_mode). | ||
hybrid_attn_anchors = 16 | ||
hybrid_attn_points = 32 | ||
hybrid_attn_init = 0 | ||
|
||
# Voxel grid shape handed to the data preprocessor: [200, 200, 16] with the values above. | ||
grid_shape = [tpv_h_ * scale_h, tpv_w_ * scale_w, tpv_z_ * scale_z] | ||
|
||
# Encoder layer with two attention modules: TPVCrossViewHybridAttention, then TPVImageCrossAttention. | ||
# Used for the first three encoder layers in the model definition below. | ||
self_cross_layer = dict( | ||
type='TPVFormerLayer', | ||
attn_cfgs=[ | ||
# First attention config (same settings as the one in self_layer). | ||
dict( | ||
type='TPVCrossViewHybridAttention', | ||
tpv_h=tpv_h_, | ||
tpv_w=tpv_w_, | ||
tpv_z=tpv_z_, | ||
num_anchors=hybrid_attn_anchors, | ||
embed_dims=_dim_, | ||
num_heads=num_heads, | ||
num_points=hybrid_attn_points, | ||
init_mode=hybrid_attn_init, | ||
dropout=0.1), | ||
# Second attention config: image cross-attention over six cameras. | ||
dict( | ||
type='TPVImageCrossAttention', | ||
pc_range=point_cloud_range, | ||
num_cams=6, | ||
dropout=0.1, | ||
# num_levels=4 equals num_outs=4 of the FPN neck configured in the model below. | ||
deformable_attention=dict( | ||
type='TPVMSDeformableAttention3D', | ||
embed_dims=_dim_, | ||
num_heads=num_heads, | ||
num_points=num_points, | ||
num_z_anchors=num_points_in_pillar, | ||
num_levels=4, | ||
floor_sampling_offset=False, | ||
tpv_h=tpv_h_, | ||
tpv_w=tpv_w_, | ||
tpv_z=tpv_z_), | ||
embed_dims=_dim_, | ||
tpv_h=tpv_h_, | ||
tpv_w=tpv_w_, | ||
tpv_z=tpv_z_) | ||
], | ||
feedforward_channels=_ffn_dim_, | ||
ffn_dropout=0.1, | ||
# Each of the three sub-operations is followed by a norm. | ||
operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')) | ||
|
||
# Encoder layer with a single TPVCrossViewHybridAttention module and no image cross-attention. | ||
# Used for the last two encoder layers in the model definition below. | ||
self_layer = dict( | ||
type='TPVFormerLayer', | ||
attn_cfgs=[ | ||
dict( | ||
type='TPVCrossViewHybridAttention', | ||
tpv_h=tpv_h_, | ||
tpv_w=tpv_w_, | ||
tpv_z=tpv_z_, | ||
num_anchors=hybrid_attn_anchors, | ||
embed_dims=_dim_, | ||
num_heads=num_heads, | ||
num_points=hybrid_attn_points, | ||
init_mode=hybrid_attn_init, | ||
dropout=0.1) | ||
], | ||
feedforward_channels=_ffn_dim_, | ||
ffn_dropout=0.1, | ||
# Same order as self_cross_layer without the 'cross_attn' step and its norm. | ||
operation_order=('self_attn', 'norm', 'ffn', 'norm')) | ||
|
||
# Full TPVFormer model: data preprocessor, ResNet-101 backbone, FPN neck, TPVFormer encoder and decode head. | ||
model = dict( | ||
type='TPVFormer', | ||
# Image normalisation (per-channel mean, unit std), padding to a multiple of 32, and point voxelisation. | ||
data_preprocessor=dict( | ||
type='TPVFormerDataPreprocessor', | ||
pad_size_divisor=32, | ||
mean=[103.530, 116.280, 123.675], | ||
std=[1.0, 1.0, 1.0], | ||
voxel=True, | ||
voxel_type='cylindrical', | ||
# NOTE(review): -1 presumably means "no limit" for max_num_points / max_voxels - confirm. | ||
voxel_layer=dict( | ||
grid_shape=grid_shape, | ||
point_cloud_range=point_cloud_range, | ||
max_num_points=-1, | ||
max_voxels=-1, | ||
), | ||
# One batch augmentation: GridMask with prob=0.7. | ||
batch_augments=[ | ||
dict( | ||
type='GridMask', | ||
use_h=True, | ||
use_w=True, | ||
rotate=1, | ||
offset=False, | ||
ratio=0.5, | ||
mode=1, | ||
prob=0.7) | ||
]), | ||
# ResNet-101 (caffe style) with out_indices (1, 2, 3), frozen_stages=1 and norm_eval=True. | ||
backbone=dict( | ||
type='mmdet.ResNet', | ||
depth=101, | ||
num_stages=4, | ||
out_indices=(1, 2, 3), | ||
frozen_stages=1, | ||
norm_cfg=dict(type='BN2d', requires_grad=False), | ||
norm_eval=True, | ||
style='caffe', | ||
# DCNv2 is enabled for the last two stages (see stage_with_dcn). | ||
dcn=dict( | ||
type='DCNv2', deform_groups=1, fallback_on_stride=False | ||
), # original DCNv2 will print log when perform load_state_dict | ||
stage_with_dcn=(False, False, True, True), | ||
# Initialised from the pretrain checkpoint using the 'backbone.' prefix. | ||
init_cfg=dict( | ||
type='Pretrained', | ||
checkpoint='checkpoints/tpvformer_r101_dcn_fcos3d_pretrain.pth', | ||
prefix='backbone.')), | ||
# FPN over the three backbone outputs, producing num_outs=4 maps with _dim_ channels each. | ||
neck=dict( | ||
type='mmdet.FPN', | ||
in_channels=[512, 1024, 2048], | ||
out_channels=_dim_, | ||
start_level=0, | ||
add_extra_convs='on_output', | ||
num_outs=4, | ||
relu_before_extra_convs=True, | ||
# Initialised from the same checkpoint file as the backbone, using the 'neck.' prefix. | ||
init_cfg=dict( | ||
type='Pretrained', | ||
checkpoint='checkpoints/tpvformer_r101_dcn_fcos3d_pretrain.pth', | ||
prefix='neck.')), | ||
# Encoder with five layers; num_layers matches the length of transformerlayers. | ||
encoder=dict( | ||
type='TPVFormerEncoder', | ||
tpv_h=tpv_h_, | ||
tpv_w=tpv_w_, | ||
tpv_z=tpv_z_, | ||
num_layers=5, | ||
pc_range=point_cloud_range, | ||
num_points_in_pillar=num_points_in_pillar, | ||
num_points_in_pillar_cross_view=[16, 16, 16], | ||
return_intermediate=False, | ||
# Three layers with image cross-attention followed by two layers without it. | ||
transformerlayers=[ | ||
self_cross_layer, self_cross_layer, self_cross_layer, self_layer, | ||
self_layer | ||
], | ||
embed_dims=_dim_, | ||
# num_feats sums to 128, the value of _dim_. | ||
# NOTE(review): presumably this must equal embed_dims - confirm. | ||
positional_encoding=dict( | ||
type='TPVFormerPositionalEncoding', | ||
num_feats=[48, 48, 32], | ||
h=tpv_h_, | ||
w=tpv_w_, | ||
z=tpv_z_)), | ||
# Decode head with num_classes=17 and ignore_index=0. | ||
decode_head=dict( | ||
type='TPVFormerDecoder', | ||
tpv_h=tpv_h_, | ||
tpv_w=tpv_w_, | ||
tpv_z=tpv_z_, | ||
num_classes=17, | ||
in_dims=_dim_, | ||
hidden_dims=2 * _dim_, | ||
out_dims=_dim_, | ||
scale_h=scale_h, | ||
scale_w=scale_w, | ||
scale_z=scale_z, | ||
# Two losses, each with loss_weight=1.0: cross-entropy and Lovasz. | ||
loss_ce=dict( | ||
type='mmdet.CrossEntropyLoss', | ||
use_sigmoid=False, | ||
class_weight=None, | ||
avg_non_ignore=True, | ||
loss_weight=1.0), | ||
loss_lovasz=dict(type='LovaszLoss', loss_weight=1.0, reduction='none'), | ||
# The Lovasz loss takes 'points' as input and the cross-entropy loss takes 'voxel'. | ||
lovasz_input='points', | ||
ce_input='voxel', | ||
ignore_index=0)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from .cross_view_hybrid_attention import TPVCrossViewHybridAttention | ||
from .data_preprocessor import TPVFormerDataPreprocessor | ||
from .image_cross_attention import TPVImageCrossAttention | ||
from .loading import BEVLoadMultiViewImageFromFiles, SegLabelMapping | ||
from .nuscenes_dataset import NuScenesSegDataset | ||
from .positional_encoding import TPVFormerPositionalEncoding | ||
from .tpvformer import TPVFormer | ||
from .tpvformer_encoder import TPVFormerEncoder | ||
from .tpvformer_head import TPVFormerDecoder | ||
from .tpvformer_layer import TPVFormerLayer | ||
|
||
# Public names of the TPVFormer project package: all eleven names imported above are listed. | ||
__all__ = [ | ||
'TPVCrossViewHybridAttention', 'TPVImageCrossAttention', | ||
'TPVFormerPositionalEncoding', 'TPVFormer', 'TPVFormerEncoder', | ||
'TPVFormerLayer', 'NuScenesSegDataset', 'BEVLoadMultiViewImageFromFiles', | ||
'SegLabelMapping', 'TPVFormerDecoder', 'TPVFormerDataPreprocessor' | ||
] |
Oops, something went wrong.