diff --git a/zoo/PolarFormer/projects/configs/polarformer/polarformer_vovnet_fastbev_aug.py b/zoo/PolarFormer/projects/configs/polarformer/polarformer_vovnet_fastbev_aug.py
new file mode 100644
index 0000000..6883b4d
--- /dev/null
+++ b/zoo/PolarFormer/projects/configs/polarformer/polarformer_vovnet_fastbev_aug.py
@@ -0,0 +1,284 @@
+_base_ = [  # base configs this file inherits from: nuScenes dataset defaults and default runtime
+    '/nvme/konglingdong/models/mmdetection3d/configs/_base_/datasets/nus-3d.py',
+    '/nvme/konglingdong/models/mmdetection3d/configs/_base_/default_runtime.py'
+]  # NOTE(review): both entries are absolute paths into one user's checkout
+# Plugin settings; presumably the launcher imports the modules under plugin_dir -- confirm against the train/test script.
+plugin=True
+plugin_dir='projects/mmdet3d_plugin/'  # relative path
+
+# point_cloud_range is reused below by the neck, the decoder, the bbox coder, train_cfg and
+# ObjectRangeFilter, so changing it here changes all of them together.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm
+radius_range=[1., 65., 1.] # [start, end, interval]
+grid_res = 0.8  # shared by voxel_size and by the neck's grid_res argument
+voxel_size = [grid_res, grid_res, grid_res]  # same resolution on all three axes
+
+output_size = [256, 64, 10] # [azimuth, radius, height]
+img_norm_cfg = dict(  # unpacked into NormalizeMultiviewImage in both pipelines
+    mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) # std differs from robust_test/polarformer_r101.py, which uses [1.0, 1.0, 1.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# Modality flags: use_camera and use_external are on; lidar, radar and map are off.
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=True)
+
+model = dict(  # PolarFormer detector: VoVNet backbone, FPN_TRANS neck, PolarFormerHead
+    type='PolarFormer',
+    use_grid_mask=True,
+    img_backbone=dict(
+        type='VoVNet',
+        spec_name='V-99-eSE',
+        norm_eval=True,
+        frozen_stages=1,
+        input_ch=3,
+        out_features=['stage3', 'stage4', 'stage5']),  # three backbone outputs are requested
+    img_neck=dict(
+        type='FPN_TRANS',
+        num_encoder=0, # encoder is not used here
+        num_decoder=3,
+        num_levels=3,
+        radius_range=radius_range,
+        use_different_res=True,
+        use_bev_aug=True,
+        output_multi_scale=True,
+        grid_res=grid_res,
+        pc_range=point_cloud_range,
+        output_size=output_size,
+        fpn_cfg=dict(
+                in_channels=[512, 768, 1024],  # three entries, one per requested backbone output
+                out_channels=256,
+                start_level=1,
+                add_extra_convs='on_output',
+                num_outs=3,
+                relu_before_extra_convs=True),
+            ),
+    pts_bbox_head=dict(
+        type='PolarFormerHead',
+        num_query=900,
+        num_classes=10,  # equals len(class_names)
+        in_channels=256,  # same value as the neck's out_channels
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        radius_range=radius_range,
+        code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],  # ten weights; the last two are reduced to 0.2
+        transformer=dict(
+            type='PolarTransformer',
+            num_feature_levels=3,
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiScaleDeformableAttention', embed_dims=256, num_levels=3),
+                    feedforward_channels=1024,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+             decoder=dict(
+                type='PolarTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                pc_range=point_cloud_range,
+                radius_range=radius_range,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[  # first entry serves 'self_attn', second 'cross_attn' -- presumably in operation_order; confirm
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=256,
+                            num_heads=8,
+                            dropout=0.1),
+                        dict(
+                            type='MultiScaleDeformableAttention',
+                            embed_dims=256,
+                            num_levels=3)
+                    ],
+                    feedforward_channels=1024,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],  # wider than point_cloud_range in every entry
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),  # equals len(class_names)
+        positional_encoding=dict(
+            type='SinePositionalEncoding',
+            num_feats=128,
+            normalize=True,
+            offset=-0.5),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),  # weight 0.0; presumably disables the IoU term -- confirm
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='HungarianAssigner3D',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),  # same weight as loss_cls
+            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),  # same weight as loss_bbox
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+            pc_range=point_cloud_range))))
+
+dataset_type = 'TransNuScenesDataset'  # used as the dataset type of all three splits
+data_root = '/nvme/share/data/sets/nuScenes/'  # absolute path; data_root of every split and of db_sampler
+corruption_root = '/nvme/konglingdong/data/sets/nuScenes-c/'  # absolute path; only passed to the test pipeline's image loader
+anno_root = '../../data/'  # relative path; prefixes the val/test annotation file
+
+
+file_client_args = dict(backend='disk')
+# NOTE(review): db_sampler is not referenced anywhere else in this file (neither pipeline uses it).
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(  # the same threshold (5) for every class
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(  # per-class counts, ranging from 2 to 7
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],  # all five loaded dimensions are kept
+        file_client_args=file_client_args))
+
+train_pipeline = [  # training transforms; images come from the stock loader, which gets no corruption_root
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'],  # ground-truth boxes and labels plus images
+                           meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 
+                            'cam2lidar', 'cam_intrinsic',
+                            'depth2img', 'cam2img', 'pad_shape',
+                            'scale_factor', 'flip', 'pcd_horizontal_flip',
+                            'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+                            'img_norm_cfg', 'pcd_trans', 'sample_idx',
+                            'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
+                            'transformation_3d_flow'))
+]
+test_pipeline = [  # evaluation transforms; the custom loader additionally receives corruption_root
+    dict(type='Custom_LoadMultiViewImageFromFiles', to_float32=True, corruption_root=corruption_root),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,  # flipping is switched off
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['img'],  # images only; same meta_keys as the training pipeline
+                                   meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 
+                                    'cam2lidar', 'cam_intrinsic',
+                                    'depth2img', 'cam2img', 'pad_shape',
+                                    'scale_factor', 'flip', 'pcd_horizontal_flip',
+                                    'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+                                    'img_norm_cfg', 'pcd_trans', 'sample_idx',
+                                    'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
+                                    'transformation_3d_flow'))
+        ])
+]
+
+
+data = dict(  # per-split dataset settings; val and test are configured identically
+    samples_per_gpu=1,
+    workers_per_gpu=6,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_train.pkl',  # training annotations are read from data_root
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(
+        type=dataset_type, 
+        data_root=data_root,
+        ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',  # read from anno_root, not data_root
+        pipeline=test_pipeline, 
+        classes=class_names, 
+        modality=input_modality),
+    test=dict(
+        type=dataset_type, 
+        data_root=data_root,
+        ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',  # same annotation file as val
+        pipeline=test_pipeline, 
+        classes=class_names, 
+        modality=input_modality))
+
+optimizer = dict(  # AdamW with per-parameter-group learning-rate multipliers
+    type='AdamW', 
+    lr=2e-4,
+    paramwise_cfg=dict(
+        custom_keys={  # presumably matched against parameter names -- confirm; each listed key gets lr_mult=0.1
+            'img_backbone': dict(lr_mult=0.1),
+            'sampling_offsets': dict(lr_mult=0.1),
+            'reference_points': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.075)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))  # gradient clipping: norm_type 2, max_norm 35
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24  # also passed to the runner as max_epochs
+evaluation = dict(interval=1, pipeline=test_pipeline)  # evaluation uses test_pipeline
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+load_from='pretrained/dd3_det_final.pth'  # relative checkpoint path
+find_unused_parameters=True
diff --git a/zoo/PolarFormer/projects/configs/robust_test/polarformer_r101.py b/zoo/PolarFormer/projects/configs/robust_test/polarformer_r101.py
new file mode 100644
index 0000000..d40dd0b
--- /dev/null
+++ b/zoo/PolarFormer/projects/configs/robust_test/polarformer_r101.py
@@ -0,0 +1,289 @@
+_base_ = [  # base configs this file inherits from: nuScenes dataset defaults and default runtime
+    '/nvme/konglingdong/models/mmdetection3d/configs/_base_/datasets/nus-3d.py',
+    '/nvme/konglingdong/models/mmdetection3d/configs/_base_/default_runtime.py'
+]  # NOTE(review): both entries are absolute paths into one user's checkout
+# Plugin settings; presumably the launcher imports the modules under plugin_dir -- confirm against the train/test script.
+plugin=True
+plugin_dir='projects/mmdet3d_plugin/'  # relative path
+
+# point_cloud_range is reused below by the neck, the decoder, the bbox coder, train_cfg and
+# ObjectRangeFilter, so changing it here changes all of them together.
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- confirm
+radius_range=[1., 65., 1.] # [start, end, interval]
+grid_res = 0.8  # shared by voxel_size and by the neck's grid_res argument
+voxel_size = [grid_res, grid_res, grid_res]  # same resolution on all three axes
+
+output_size = [256, 64, 10] # [azimuth, radius, height]
+img_norm_cfg = dict(  # unpacked into NormalizeMultiviewImage in both pipelines
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)  # std of 1.0 per channel; to_rgb is off
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+# Modality flags: only use_camera is on.
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+model = dict(  # PolarFormer detector: ResNet-101 backbone, FPN_TRANS neck, PolarFormerHead
+    type='PolarFormer',
+    use_grid_mask=True,
+    img_backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(1, 2, 3),  # three of the four stages are output
+        frozen_stages=1,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),
+        norm_eval=False,
+        style='caffe',
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # dcn is not available for BasicBlock
+        stage_with_dcn=(False, False, True, True)),  # DCN flag is set for the last two stages only
+    img_neck=dict(
+        type='FPN_TRANS',
+        num_encoder=0, # encoder is not used here
+        num_decoder=3,
+        num_levels=3,
+        radius_range=radius_range,
+        use_different_res=True,
+        use_bev_aug=True,
+        output_multi_scale=True,
+        grid_res=grid_res,
+        pc_range=point_cloud_range,
+        output_size=output_size,
+        fpn_cfg=dict(
+                in_channels=[512, 1024, 2048],  # three entries, one per index in out_indices
+                out_channels=256,
+                start_level=1,
+                add_extra_convs='on_output',
+                num_outs=3,
+                relu_before_extra_convs=True,
+                norm_cfg = dict(type='SyncBN', requires_grad=True)),  # same norm_cfg as the backbone
+            ),
+    pts_bbox_head=dict(
+        type='PolarFormerHead',
+        num_query=900,
+        num_classes=10,  # equals len(class_names)
+        in_channels=256,  # same value as the neck's out_channels
+        sync_cls_avg_factor=True,
+        with_box_refine=True,
+        as_two_stage=False,
+        radius_range=radius_range,
+        code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],  # ten weights; the last two are reduced to 0.2
+        transformer=dict(
+            type='PolarTransformer',
+            num_feature_levels=3,
+            encoder=dict(
+                type='DetrTransformerEncoder',
+                num_layers=6,
+                transformerlayers=dict(
+                    type='BaseTransformerLayer',
+                    attn_cfgs=dict(
+                        type='MultiScaleDeformableAttention', embed_dims=256, num_levels=3),
+                    feedforward_channels=1024,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
+             decoder=dict(
+                type='PolarTransformerDecoder',
+                num_layers=6,
+                return_intermediate=True,
+                pc_range=point_cloud_range,
+                radius_range=radius_range,
+                transformerlayers=dict(
+                    type='DetrTransformerDecoderLayer',
+                    attn_cfgs=[  # first entry serves 'self_attn', second 'cross_attn' -- presumably in operation_order; confirm
+                        dict(
+                            type='MultiheadAttention',
+                            embed_dims=256,
+                            num_heads=8,
+                            dropout=0.1),
+                        dict(
+                            type='MultiScaleDeformableAttention',
+                            embed_dims=256,
+                            num_levels=3)
+                    ],
+                    feedforward_channels=1024,
+                    ffn_dropout=0.1,
+                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+                                     'ffn', 'norm')))),
+        bbox_coder=dict(
+            type='NMSFreeCoder',
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],  # wider than point_cloud_range in every entry
+            pc_range=point_cloud_range,
+            max_num=300,
+            voxel_size=voxel_size,
+            num_classes=10),  # equals len(class_names)
+        positional_encoding=dict(
+            type='SinePositionalEncoding',
+            num_feats=128,
+            normalize=True,
+            offset=-0.5),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
+        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),  # weight 0.0; presumably disables the IoU term -- confirm
+    # model training and testing settings
+    train_cfg=dict(pts=dict(
+        grid_size=[512, 512, 1],
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=4,
+        assigner=dict(
+            type='HungarianAssigner3D',
+            cls_cost=dict(type='FocalLossCost', weight=2.0),  # same weight as loss_cls
+            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),  # same weight as loss_bbox
+            iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
+            pc_range=point_cloud_range))))
+
+dataset_type = 'TransNuScenesDataset'  # used as the dataset type of all three splits
+data_root = '/nvme/share/data/sets/nuScenes/'  # absolute path; data_root of every split and of db_sampler
+corruption_root = '/nvme/konglingdong/data/sets/nuScenes-c/'  # absolute path; only passed to the test pipeline's image loader
+anno_root = '../../data/'  # relative path; prefixes the val/test annotation file
+
+file_client_args = dict(backend='disk')
+# NOTE(review): db_sampler is not referenced anywhere else in this file (neither pipeline uses it).
+db_sampler = dict(
+    data_root=data_root,
+    info_path=data_root + 'nuscenes_dbinfos_train.pkl',
+    rate=1.0,
+    prepare=dict(
+        filter_by_difficulty=[-1],
+        filter_by_min_points=dict(  # the same threshold (5) for every class
+            car=5,
+            truck=5,
+            bus=5,
+            trailer=5,
+            construction_vehicle=5,
+            traffic_cone=5,
+            barrier=5,
+            motorcycle=5,
+            bicycle=5,
+            pedestrian=5)),
+    classes=class_names,
+    sample_groups=dict(  # per-class counts, ranging from 2 to 7
+        car=2,
+        truck=3,
+        construction_vehicle=7,
+        bus=4,
+        trailer=6,
+        barrier=2,
+        motorcycle=6,
+        bicycle=6,
+        pedestrian=2,
+        traffic_cone=2),
+    points_loader=dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=[0, 1, 2, 3, 4],  # all five loaded dimensions are kept
+        file_client_args=file_client_args))
+
+train_pipeline = [  # training transforms; images come from the stock loader, which gets no corruption_root
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'],  # ground-truth boxes and labels plus images
+                           meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 
+                            'cam2lidar', 'cam_intrinsic',
+                            'depth2img', 'cam2img', 'pad_shape',
+                            'scale_factor', 'flip', 'pcd_horizontal_flip',
+                            'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+                            'img_norm_cfg', 'pcd_trans', 'sample_idx',
+                            'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
+                            'transformation_3d_flow'))
+]
+test_pipeline = [  # evaluation transforms; the custom loader additionally receives corruption_root
+    dict(type='Custom_LoadMultiViewImageFromFiles', to_float32=True, corruption_root=corruption_root),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(type='PadMultiViewImage', size_divisor=32),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,  # flipping is switched off
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['img'],  # images only; same meta_keys as the training pipeline
+                                   meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 
+                                    'cam2lidar', 'cam_intrinsic',
+                                    'depth2img', 'cam2img', 'pad_shape',
+                                    'scale_factor', 'flip', 'pcd_horizontal_flip',
+                                    'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
+                                    'img_norm_cfg', 'pcd_trans', 'sample_idx',
+                                    'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
+                                    'transformation_3d_flow'))
+        ])
+]
+
+
+data = dict(  # per-split dataset settings; val and test are configured identically
+    samples_per_gpu=1,
+    workers_per_gpu=6,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file=data_root + 'nuscenes_infos_train.pkl',  # training annotations are read from data_root
+        pipeline=train_pipeline,
+        classes=class_names,
+        modality=input_modality,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=dict(
+        type=dataset_type, 
+        data_root=data_root,
+        ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',  # read from anno_root, not data_root
+        pipeline=test_pipeline, 
+        classes=class_names, 
+        modality=input_modality),
+    test=dict(
+        type=dataset_type, 
+        data_root=data_root,
+        ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',  # same annotation file as val
+        pipeline=test_pipeline, 
+        classes=class_names, 
+        modality=input_modality))
+
+optimizer = dict(  # AdamW with per-parameter-group learning-rate multipliers
+    type='AdamW', 
+    lr=2e-4,
+    paramwise_cfg=dict(
+        custom_keys={  # presumably matched against parameter names -- confirm; each listed key gets lr_mult=0.1
+            'img_backbone': dict(lr_mult=0.1),
+            'sampling_offsets': dict(lr_mult=0.1),
+            'reference_points': dict(lr_mult=0.1),
+        }),
+    weight_decay=0.075)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))  # gradient clipping: norm_type 2, max_norm 35
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    min_lr_ratio=1e-3)
+total_epochs = 24  # also passed to the runner as max_epochs
+evaluation = dict(interval=1, pipeline=test_pipeline)  # evaluation uses test_pipeline
+
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+load_from='pretrained/fcos3d.pth'  # relative checkpoint path
+
+corruptions = ['CameraCrash','FrameLost','ColorQuant','MotionBlur','Brightness','LowLight','Fog','Snow']  # eight corruption names; presumably read by the robustness test script together with corruption_root -- confirm
\ No newline at end of file
diff --git a/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H1.py b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H1.py
new file mode 100644
index 0000000..40e1a17
--- /dev/null
+++ b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H1.py
@@ -0,0 +1,265 @@
+_base_ = [  # base config, given as a path relative to this file's directory
+    '../default_runtime.py'
+]
+
+class_names = [  # ten classes; the order matters because class_names.index(...) is used in the sampler config
+    'car',
+    'truck',
+    'construction_vehicle',
+    'bus',
+    'trailer',
+    'barrier',
+    'motorcycle',
+    'bicycle',
+    'pedestrian',
+    'traffic_cone'
+]
+# Shared sizes referenced throughout the model dict below.
+num_classes = len(class_names)  # 10
+embed_dims = 256
+num_groups = 8  # used as num_heads of graph_model and num_groups of deformable_model
+num_decoder = 6
+model = dict(  # Sparse4D detector: ResNet-101 backbone, FPN neck, Sparse4DHead
+    type='Sparse4D',
+    use_grid_mask=True,
+    img_backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        frozen_stages=1,
+        norm_eval=True,
+        style='caffe',
+        with_cp=True,
+        out_indices=(0, 1, 2, 3),  # all four stages are output
+        stage_with_dcn=(False, False, True, True),  # DCN flag is set for the last two stages only
+        norm_cfg=dict(type='BN2d', requires_grad=False),
+        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
+    ),
+    img_neck=dict(
+        type='FPN',
+        num_outs=4,  # same count as num_levels in deformable_model
+        start_level=1,
+        out_channels=embed_dims,
+        add_extra_convs='on_output',
+        relu_before_extra_convs=True,
+        in_channels=[256, 512, 1024, 2048],  # four entries, one per index in out_indices
+    ),
+    head=dict(
+        type="Sparse4DHead",
+        num_anchor=900,
+        anchor_file="nuscenes_kmeans900.npy",  # relative file name
+        num_decoder=num_decoder,
+        embed_dims=embed_dims,
+        cls_threshold_to_reg=0.05,
+        anchor_encoder=dict(
+            type="SparseBox3DEncoder",
+            embed_dims=embed_dims,
+            vel_dims=3,
+        ),
+        graph_model=dict(
+            type="MultiheadAttention",
+            embed_dims=embed_dims,
+            num_heads=num_groups,  # 8
+            batch_first=True,
+            dropout=0.1,
+        ),
+        norm_layer=dict(type='LN', normalized_shape=embed_dims),
+        ffn=dict(
+            type="FFN",
+            embed_dims=embed_dims,
+            feedforward_channels=embed_dims * 2,  # 512 with embed_dims = 256
+            num_fcs=2,
+            ffn_drop=0.1,
+            act_cfg=dict(type='ReLU', inplace=True),
+        ),
+        deformable_model=dict(
+            type="DeformableFeatureAggregation",
+            embed_dims=embed_dims,
+            num_groups=num_groups,
+            num_levels=4,
+            num_cams=6,
+            dropout=0.1,
+            kps_generator=dict(
+                type="SparseBox3DKeyPointsGenerator",
+                num_learnable_pts=6,
+                fix_scale=[  # seven fixed entries: the zero vector plus +/-0.45 along each of the three axes
+                    [0, 0, 0],
+                    [0.45, 0, 0],
+                    [-0.45, 0, 0],
+                    [0, 0.45, 0],
+                    [0, -0.45, 0],
+                    [0, 0, 0.45],
+                    [0, 0, -0.45],
+                ],
+            ),
+        ),
+        refine_layer=dict(
+            type="SparseBox3DRefinementModule",
+            embed_dims=embed_dims,
+            num_cls=num_classes,
+        ),
+        # pre_norm=True,
+        sampler=dict(
+            type="SparseBox3DTarget",
+            cls_weight=2.0,  # same value as loss_cls.loss_weight
+            box_weight=0.25,  # same value as loss_reg.loss_weight
+            reg_weights=[2.0] * 3 + [1.0] * 7,  # ten weights; the first three are 2.0
+            cls_wise_reg_weights={  # override for traffic_cone (index 9): entries 6 and 7 are zeroed
+                class_names.index("traffic_cone"): [
+                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0
+                ],
+            },
+        ),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=2.0,
+        ),
+        loss_reg=dict(type='L1Loss', loss_weight=0.25),
+        gt_cls_key="gt_labels_3d",  # matches a key collected by train_pipeline
+        gt_reg_key="gt_bboxes_3d",  # matches a key collected by train_pipeline
+        decoder=dict(type="SparseBox3DDecoder"),
+        reg_weights=[2.0] * 3 + [1.0] * 7,  # same expression as sampler.reg_weights
+        kps_generator=dict(  # same fix_scale as deformable_model.kps_generator, but without num_learnable_pts
+            type="SparseBox3DKeyPointsGenerator",
+            fix_scale=[
+                [0, 0, 0],
+                [0.45, 0, 0],
+                [-0.45, 0, 0],
+                [0, 0.45, 0],
+                [0, -0.45, 0],
+                [0, 0, 0.45],
+                [0, 0, -0.45],
+            ],
+        ),
+        depth_module=dict(
+            type="DepthReweightModule",
+            embed_dims=embed_dims,
+        ),
+    ),
+)
+
+dataset_type = 'NuScenes3DDetTrackDataset'  # dataset type shared by all splits via data_basic_config
+data_root = '/nvme/share/data/sets/nuScenes/'  # absolute path; data_root of every split
+anno_root = '/nvme/konglingdong/models/RoboDet/data/'  # absolute path; prefixes the ann_file of train, val and test
+corruption_root = '/nvme/konglingdong/data/sets/nuScenes-c/'  # absolute path; only passed to the test pipeline's image loader
+file_client_args = dict(backend='disk')  # not referenced elsewhere in this file
+
+img_crop_range = [260, 900, 0, 1600]  # passed to CustomCropMultiViewImage; presumably [y_min, y_max, x_min, x_max] -- confirm
+img_norm_cfg = dict(  # unpacked into NormalizeMultiviewImage in the train and test pipelines
+    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False
+)
+train_pipeline = [  # training transforms; images come from the stock loader, which gets no corruption_root
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(type="CustomCropMultiViewImage", crop_range=img_crop_range),
+    dict(type='PhotoMetricDistortionMultiViewImage'),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_attr_label=False
+    ),
+    dict(
+        type='CircleObjectRangeFilter',
+        class_dist_thred=[55] * len(class_names)  # the same threshold (55) for each of the ten classes
+    ),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='NuScenesSparse4DAdaptor'),
+    dict(
+        type='Collect3D',
+        keys=[  # ground truth plus the image, timestamp and projection inputs
+            'gt_bboxes_3d',
+            'gt_labels_3d',
+            'img',
+            "timestamp",
+            "projection_mat",
+            "image_wh",
+        ],
+        meta_keys=["timestamp", "T_global", "T_global_inv"],
+    )
+]
+test_pipeline = [  # evaluation transforms; the custom loader additionally receives corruption_root
+    dict(type='Custom_LoadMultiViewImageFromFiles', to_float32=True, corruption_root=corruption_root),
+    dict(type="CustomCropMultiViewImage", crop_range=img_crop_range),
+    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False
+    ),
+    dict(type='NuScenesSparse4DAdaptor'),
+    dict(
+        type='Collect3D',
+        keys=[  # same as the training keys minus the two ground-truth entries
+            'img',
+            "timestamp",
+            "projection_mat",
+            "image_wh",
+        ],
+        meta_keys=["timestamp", "T_global", "T_global_inv"],
+    )
+]
+
+input_modality = dict(  # only use_camera is on
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False
+)
+
+data_basic_config = dict(  # settings common to all splits; unpacked into train, val and test below
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    box_type_3d='LiDAR',
+    version='v1.0-trainval',
+)
+
+data = dict(  # per-split dataset settings; val and test are configured identically
+    samples_per_gpu=1,
+    workers_per_gpu=2,
+    train=dict(
+        **data_basic_config,
+        ann_file=anno_root + 'nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        test_mode=False,
+    ),
+    val=dict(
+        **data_basic_config,
+        ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',
+        pipeline=test_pipeline,
+        test_mode=True,
+    ),
+    test=dict(
+        **data_basic_config,
+        ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl',  # same annotation file as val
+        pipeline=test_pipeline,
+        test_mode=True,
+    ),
+)
+
+vis_pipeline = [  # reduced pipeline: stock image loader, no crop, no normalisation; referenced only by `evaluation`
+    dict(type='LoadMultiViewImageFromFiles', to_float32=True),
+    dict(
+        type='DefaultFormatBundle3D',
+        class_names=class_names,
+        with_label=False
+    ),
+    dict(
+        type='Collect3D',
+        keys=['img'],
+        meta_keys=["timestamp", "lidar2img"],
+    )
+]
+
+total_epochs = 24  # also passed to the runner as max_epochs
+evaluation = dict(interval=24, pipeline=vis_pipeline)  # interval equals total_epochs; NOTE(review): uses vis_pipeline, not test_pipeline -- confirm intended
+runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
+load_from='fcos3d.pth'  # relative checkpoint path
+corruptions = ['CameraCrash','FrameLost','ColorQuant','MotionBlur','Brightness','LowLight','Fog','Snow']  # eight corruption names; presumably read by the robustness test script together with corruption_root -- confirm
diff --git a/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H4.py b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H4.py
new file mode 100644
index 0000000..0ae27b8
--- /dev/null
+++ b/zoo/Sparse4D/projects/configs/robust_test/sparse4d_r101_H4.py
@@ -0,0 +1,50 @@
+_base_ = [  # inherits everything from the H1 config in the same directory
+    './sparse4d_r101_H1.py'
+]
+
+H = 4
+max_queue_length = H - 1  # 3 when H = 4; reused below as the head's queue length and the train set's seq_frame
+
+model = dict(  # only the keys listed here are given; presumably merged into the inherited model dict -- confirm
+    head=dict(
+        max_queue_length=max_queue_length,
+        deformable_model=dict(
+            temporal_fusion_module=dict(
+                type="LinearFusionModule",
+            )
+        ),
+    )
+)
+
+data = dict(  # extra settings for the train split only; val and test are not touched here
+    train=dict(
+        max_interval=2,
+        fix_interval=True,
+        max_time_interval=5,
+        seq_frame=max_queue_length,
+    )
+)
+
+'''
+mAP: 0.4409
+mATE: 0.6282
+mASE: 0.2721
+mAOE: 0.3853
+mAVE: 0.2922
+mAAE: 0.1888
+NDS: 0.5438
+Eval time: 235.2s
+
+Per-class results:
+Object Class	AP	ATE	ASE	AOE	AVE	AAE
+car	0.633	0.432	0.146	0.064	0.225	0.183
+truck	0.364	0.685	0.201	0.087	0.262	0.207
+bus	0.432	0.770	0.215	0.096	0.589	0.238
+trailer	0.198	1.035	0.281	0.516	0.298	0.139
+construction_vehicle	0.120	0.956	0.471	1.059	0.116	0.345
+pedestrian	0.530	0.588	0.289	0.398	0.308	0.150
+motorcycle	0.458	0.600	0.254	0.439	0.363	0.222
+bicycle	0.403	0.491	0.267	0.671	0.176	0.026
+traffic_cone	0.674	0.324	0.311	nan	nan	nan
+barrier	0.597	0.400	0.286	0.139	nan	nan
+'''
\ No newline at end of file