diff --git a/zoo/PolarFormer/projects/configs/polarformer/polarformer_vovnet_fastbev_aug.py b/zoo/PolarFormer/projects/configs/polarformer/polarformer_vovnet_fastbev_aug.py new file mode 100644 index 0000000..6883b4d --- /dev/null +++ b/zoo/PolarFormer/projects/configs/polarformer/polarformer_vovnet_fastbev_aug.py @@ -0,0 +1,284 @@ +_base_ = [ + '/nvme/konglingdong/models/mmdetection3d/configs/_base_/datasets/nus-3d.py', + '/nvme/konglingdong/models/mmdetection3d/configs/_base_/default_runtime.py' +] + +plugin=True +plugin_dir='projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +radius_range=[1., 65., 1.] # [start, end, interval] +grid_res = 0.8 +voxel_size = [grid_res, grid_res, grid_res] + +output_size = [256, 64, 10] # [azimuth, radius, height] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) # different from r101 +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +model = dict( + type='PolarFormer', + use_grid_mask=True, + img_backbone=dict( + type='VoVNet', + spec_name='V-99-eSE', + norm_eval=True, + frozen_stages=1, + input_ch=3, + out_features=['stage3', 'stage4', 'stage5']), + img_neck=dict( + type='FPN_TRANS', + num_encoder=0, # encoder is not used here + num_decoder=3, + num_levels=3, + radius_range=radius_range, + use_different_res=True, + use_bev_aug=True, + output_multi_scale=True, + grid_res=grid_res, + pc_range=point_cloud_range, + output_size=output_size, + fpn_cfg=dict( + in_channels=[512, 768, 1024], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=3, + relu_before_extra_convs=True), + ), + pts_bbox_head=dict( + type='PolarFormerHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + radius_range=radius_range, + code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + transformer=dict( + type='PolarTransformer', + num_feature_levels=3, + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', embed_dims=256, num_levels=3), + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='PolarTransformerDecoder', + num_layers=6, + return_intermediate=True, + pc_range=point_cloud_range, + radius_range=radius_range, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_levels=3) + ], + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range)))) + +dataset_type = 'TransNuScenesDataset' +data_root = '/nvme/share/data/sets/nuScenes/' +corruption_root = '/nvme/konglingdong/data/sets/nuScenes-c/' +anno_root = '../../data/' + + +file_client_args = dict(backend='disk') + +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'cam2lidar', 'cam_intrinsic', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow')) +] +test_pipeline = [ + dict(type='Custom_LoadMultiViewImageFromFiles', to_float32=True, corruption_root=corruption_root), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'cam2lidar', 'cam_intrinsic', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow')) + ]) +] + + +data = dict( + samples_per_gpu=1, + workers_per_gpu=6, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality)) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1), + }), + weight_decay=0.075) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from='pretrained/dd3_det_final.pth' +find_unused_parameters=True diff --git a/zoo/PolarFormer/projects/configs/robust_test/polarformer_r101.py b/zoo/PolarFormer/projects/configs/robust_test/polarformer_r101.py new file mode 100644 index 0000000..d40dd0b --- /dev/null +++ b/zoo/PolarFormer/projects/configs/robust_test/polarformer_r101.py @@ -0,0 +1,289 @@ +_base_ = [ + '/nvme/konglingdong/models/mmdetection3d/configs/_base_/datasets/nus-3d.py', + '/nvme/konglingdong/models/mmdetection3d/configs/_base_/default_runtime.py' +] + +plugin=True +plugin_dir='projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +radius_range=[1., 65., 1.] # [start, end, interval] +grid_res = 0.8 +voxel_size = [grid_res, grid_res, grid_res] + +output_size = [256, 64, 10] # [azimuth, radius, height] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) + +model = dict( + type='PolarFormer', + use_grid_mask=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='SyncBN', requires_grad=True), + norm_eval=False, + style='caffe', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # dcn is not available for BasicBlock + stage_with_dcn=(False, False, True, True)), + img_neck=dict( + type='FPN_TRANS', + num_encoder=0, # encoder is not used here + num_decoder=3, + num_levels=3, + radius_range=radius_range, + use_different_res=True, + use_bev_aug=True, + output_multi_scale=True, + grid_res=grid_res, + pc_range=point_cloud_range, + output_size=output_size, + fpn_cfg=dict( + in_channels=[512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=3, + relu_before_extra_convs=True, + norm_cfg = dict(type='SyncBN', requires_grad=True)), + ), + pts_bbox_head=dict( + type='PolarFormerHead', + num_query=900, + num_classes=10, + in_channels=256, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + radius_range=radius_range, + code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + transformer=dict( + type='PolarTransformer', + num_feature_levels=3, + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', embed_dims=256, num_levels=3), + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='PolarTransformerDecoder', + num_layers=6, + return_intermediate=True, + pc_range=point_cloud_range, + radius_range=radius_range, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + dict( + type='MultiScaleDeformableAttention', + embed_dims=256, + num_levels=3) + ], + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range)))) + +dataset_type = 'TransNuScenesDataset' +data_root = '/nvme/share/data/sets/nuScenes/' +corruption_root = '/nvme/konglingdong/data/sets/nuScenes-c/' +anno_root = '../../data/' + +file_client_args = dict(backend='disk') + +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'cam2lidar', 'cam_intrinsic', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow')) +] +test_pipeline = [ + dict(type='Custom_LoadMultiViewImageFromFiles', to_float32=True, corruption_root=corruption_root), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img'], + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'cam2lidar', 'cam_intrinsic', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow')) + ]) +] + + +data = dict( + samples_per_gpu=1, + workers_per_gpu=6, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality)) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1), + }), + weight_decay=0.075) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from='pretrained/fcos3d.pth' + +corruptions = ['CameraCrash','FrameLost','ColorQuant','MotionBlur','Brightness','LowLight','Fog','Snow'] \ No newline at end of file diff --git a/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H1.py b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H1.py new file mode 100644 index 0000000..40e1a17 --- /dev/null +++ b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H1.py @@ -0,0 +1,265 @@ +_base_ = [ + '../default_runtime.py' +] + +class_names = [ + 'car', + 'truck', + 'construction_vehicle', + 'bus', + 'trailer', + 'barrier', + 'motorcycle', + 'bicycle', + 'pedestrian', + 'traffic_cone' +] + +num_classes = len(class_names) +embed_dims = 256 +num_groups = 8 +num_decoder = 6 +model = dict( + type='Sparse4D', + use_grid_mask=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + frozen_stages=1, + norm_eval=True, + style='caffe', + with_cp=True, + out_indices=(0, 1, 2, 3), + stage_with_dcn=(False, False, True, True), + norm_cfg=dict(type='BN2d', requires_grad=False), + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), + ), + img_neck=dict( + type='FPN', + num_outs=4, + start_level=1, + out_channels=embed_dims, + add_extra_convs='on_output', + relu_before_extra_convs=True, + in_channels=[256, 512, 1024, 2048], + ), + head=dict( + type="Sparse4DHead", + num_anchor=900, + anchor_file="nuscenes_kmeans900.npy", + num_decoder=num_decoder, + embed_dims=embed_dims, + cls_threshold_to_reg=0.05, + anchor_encoder=dict( + type="SparseBox3DEncoder", + embed_dims=embed_dims, + vel_dims=3, + ), + graph_model=dict( + type="MultiheadAttention", + embed_dims=embed_dims, + num_heads=num_groups, + batch_first=True, + dropout=0.1, + ), + norm_layer=dict(type='LN', normalized_shape=embed_dims), + ffn=dict( + type="FFN", + embed_dims=embed_dims, + feedforward_channels=embed_dims * 2, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True), + ), + deformable_model=dict( + type="DeformableFeatureAggregation", + embed_dims=embed_dims, + num_groups=num_groups, + num_levels=4, + num_cams=6, + dropout=0.1, + kps_generator=dict( + type="SparseBox3DKeyPointsGenerator", + num_learnable_pts=6, + fix_scale=[ + [0, 0, 0], + [0.45, 0, 0], + [-0.45, 0, 0], + [0, 0.45, 0], + [0, -0.45, 0], + [0, 0, 0.45], + [0, 0, -0.45], + ], + ), + ), + refine_layer=dict( + type="SparseBox3DRefinementModule", + embed_dims=embed_dims, + num_cls=num_classes, + ), + # pre_norm=True, + sampler=dict( + type="SparseBox3DTarget", + cls_weight=2.0, + box_weight=0.25, + reg_weights=[2.0] * 3 + [1.0] * 7, + cls_wise_reg_weights={ + class_names.index("traffic_cone"): [ + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0 + ], + }, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0, + ), + loss_reg=dict(type='L1Loss', loss_weight=0.25), + gt_cls_key="gt_labels_3d", + gt_reg_key="gt_bboxes_3d", + decoder=dict(type="SparseBox3DDecoder"), + reg_weights=[2.0] * 3 + [1.0] * 7, + kps_generator=dict( + type="SparseBox3DKeyPointsGenerator", + fix_scale=[ + [0, 0, 0], + [0.45, 0, 0], + [-0.45, 0, 0], + [0, 0.45, 0], + [0, -0.45, 0], + [0, 0, 0.45], + [0, 0, -0.45], + ], + ), + depth_module=dict( + type="DepthReweightModule", + embed_dims=embed_dims, + ), + ), +) + +dataset_type = 'NuScenes3DDetTrackDataset' +data_root = '/nvme/share/data/sets/nuScenes/' +anno_root = '/nvme/konglingdong/models/RoboDet/data/' +corruption_root = '/nvme/konglingdong/data/sets/nuScenes-c/' +file_client_args = dict(backend='disk') + +img_crop_range = [260, 900, 0, 1600] +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False +) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type="CustomCropMultiViewImage", crop_range=img_crop_range), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False + ), + dict( + type='CircleObjectRangeFilter', + class_dist_thred=[55] * len(class_names) + ), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='NuScenesSparse4DAdaptor'), + dict( + type='Collect3D', + keys=[ + 'gt_bboxes_3d', + 'gt_labels_3d', + 'img', + "timestamp", + "projection_mat", + "image_wh", + ], + meta_keys=["timestamp", "T_global", "T_global_inv"], + ) +] +test_pipeline = [ + dict(type='Custom_LoadMultiViewImageFromFiles', to_float32=True, corruption_root=corruption_root), + dict(type="CustomCropMultiViewImage", crop_range=img_crop_range), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False + ), + dict(type='NuScenesSparse4DAdaptor'), + dict( + type='Collect3D', + keys=[ + 'img', + "timestamp", + "projection_mat", + "image_wh", + ], + meta_keys=["timestamp", "T_global", "T_global_inv"], + ) +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False +) + +data_basic_config = dict( + type=dataset_type, + data_root=data_root, + classes=class_names, + modality=input_modality, + box_type_3d='LiDAR', + version='v1.0-trainval', +) + +data = dict( + samples_per_gpu=1, + workers_per_gpu=2, + train=dict( + **data_basic_config, + ann_file=anno_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + test_mode=False, + ), + val=dict( + **data_basic_config, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, + test_mode=True, + ), + test=dict( + **data_basic_config, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, + test_mode=True, + ), +) + +vis_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False + ), + dict( + type='Collect3D', + keys=['img'], + meta_keys=["timestamp", "lidar2img"], + ) +] + +total_epochs = 24 +evaluation = dict(interval=24, pipeline=vis_pipeline) +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from='fcos3d.pth' +corruptions = ['CameraCrash','FrameLost','ColorQuant','MotionBlur','Brightness','LowLight','Fog','Snow'] diff --git a/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H4.py b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H4.py new file mode 100644 index 0000000..0ae27b8 --- /dev/null +++ b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H4.py @@ -0,0 +1,50 @@ +_base_ = [ + './sparse4d_r101_H1.py' +] + +H = 4 +max_queue_length = H - 1 + +model = dict( + head=dict( + max_queue_length=max_queue_length, + deformable_model=dict( + temporal_fusion_module=dict( + type="LinearFusionModule", + ) + ), + ) +) + +data = dict( + train=dict( + max_interval=2, + fix_interval=True, + max_time_interval=5, + seq_frame=max_queue_length, + ) +) + +''' +mAP: 0.4409 +mATE: 0.6282 +mASE: 0.2721 +mAOE: 0.3853 +mAVE: 0.2922 +mAAE: 0.1888 +NDS: 0.5438 +Eval time: 235.2s + +Per-class results: +Object Class AP ATE ASE AOE AVE AAE +car 0.633 0.432 0.146 0.064 0.225 0.183 +truck 0.364 0.685 0.201 0.087 0.262 0.207 +bus 0.432 0.770 0.215 0.096 0.589 0.238 +trailer 0.198 1.035 0.281 0.516 0.298 0.139 +construction_vehicle 0.120 0.956 0.471 1.059 0.116 0.345 +pedestrian 0.530 0.588 0.289 0.398 0.308 0.150 +motorcycle 0.458 0.600 0.254 0.439 0.363 0.222 +bicycle 0.403 0.491 0.267 0.671 0.176 0.026 +traffic_cone 0.674 0.324 0.311 nan nan nan +barrier 0.597 0.400 0.286 0.139 nan nan +''' \ No newline at end of file