open-mmlab · ZwwWayne · Aug 30, 2020 · Aug 11, 2020 · Aug 16, 2020 · Aug 17, 2020
diff --git a/configs/_base_/models/h3dnet.py b/configs/_base_/models/h3dnet.py
@@ -0,0 +1,332 @@
+primitive_z_cfg = dict(  # PrimitiveHead settings for primitive_mode='z'; listed first in roi_head.primitive_list
+ type='PrimitiveHead',
+ num_dims=2,  # differs per mode in this file: 'z' -> 2, 'xy' -> 1, 'line' -> 0
+ num_classes=18,
+ primitive_mode='z',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_moudule_cfg=dict(  # NOTE(review): 'moudule' (sic) - presumably must match the consumer's keyword; confirm before renaming
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,  # the VoteHead rpn_head in this file uses 3 here
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(  # only loss_dst_weight is given; no loss_src_weight is set for this loss
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ num_point=1024,  # the rpn_head aggregation in this file uses 256
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],  # presumably [negative, positive] - TODO confirm ordering
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(  # src/dst weights are 0.5 here; the 'line' config uses 1.0
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_reg_loss=dict(  # same ChamferDistance settings as center_loss above
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ train_cfg=dict(  # nested per-head train_cfg; distinct from the module-level train_cfg defined later
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+primitive_xy_cfg = dict(  # PrimitiveHead settings for primitive_mode='xy'; listed second in roi_head.primitive_list
+ type='PrimitiveHead',
+ num_dims=1,  # differs per mode in this file: 'z' -> 2, 'xy' -> 1, 'line' -> 0
+ num_classes=18,
+ primitive_mode='xy',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_moudule_cfg=dict(  # NOTE(review): 'moudule' (sic) - presumably must match the consumer's keyword; confirm before renaming
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,  # the VoteHead rpn_head in this file uses 3 here
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(  # only loss_dst_weight is given; no loss_src_weight is set for this loss
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ num_point=1024,  # the rpn_head aggregation in this file uses 256
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],  # presumably [negative, positive] - TODO confirm ordering
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(  # src/dst weights are 0.5 here; the 'line' config uses 1.0
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_reg_loss=dict(  # same ChamferDistance settings as center_loss above
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=0.5,
+ loss_dst_weight=0.5),
+ semantic_cls_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ train_cfg=dict(  # nested per-head train_cfg; distinct from the module-level train_cfg defined later
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+primitive_line_cfg = dict(  # PrimitiveHead settings for primitive_mode='line'; listed third in roi_head.primitive_list
+ type='PrimitiveHead',
+ num_dims=0,  # differs per mode in this file: 'z' -> 2, 'xy' -> 1, 'line' -> 0
+ num_classes=18,
+ primitive_mode='line',
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ vote_moudule_cfg=dict(  # NOTE(review): 'moudule' (sic) - presumably must match the consumer's keyword; confirm before renaming
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=1,  # the VoteHead rpn_head in this file uses 3 here
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(  # only loss_dst_weight is given; no loss_src_weight is set for this loss
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ num_point=1024,  # the rpn_head aggregation in this file uses 256
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.4, 0.6],  # presumably [negative, positive] - TODO confirm ordering
+ reduction='mean',
+ loss_weight=30.0),
+ center_loss=dict(  # src/dst weights are 1.0 here; the 'z' and 'xy' configs use 0.5
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=1.0,
+ loss_dst_weight=1.0),
+ semantic_reg_loss=dict(  # same ChamferDistance settings as center_loss above
+ type='ChamferDistance',
+ mode='l1',
+ reduction='sum',
+ loss_src_weight=1.0,
+ loss_dst_weight=1.0),
+ semantic_cls_loss=dict(  # loss_weight is 2.0 here; the 'z' and 'xy' configs use 1.0
+ type='CrossEntropyLoss', reduction='sum', loss_weight=2.0),
+ train_cfg=dict(  # nested per-head train_cfg; distinct from the module-level train_cfg defined later
+ dist_thresh=0.2,
+ var_thresh=1e-2,
+ lower_thresh=1e-6,
+ num_point=100,
+ num_point_line=10,
+ line_thresh=0.2))
+
+proposal_module_cfg = dict(  # passed as proposal_module_cfg to the H3DBboxHead inside model.roi_head
+ suface_matching_cfg=dict(  # NOTE(review): 'suface' (sic) - presumably must match the consumer's keyword; confirm before renaming
+ num_point=256 * 6,  # evaluates to 1536; presumably 6 surfaces per proposal (num_proposal=256) - TODO confirm
+ radius=0.5,
+ num_sample=32,
+ mlp_channels=[128 + 6, 128, 64, 32],  # first entry evaluates to 134
+ use_xyz=True,
+ normalize_xyz=True),
+ line_matching_cfg=dict(
+ num_point=256 * 12,  # evaluates to 3072; presumably 12 lines per proposal (num_proposal=256) - TODO confirm
+ radius=0.5,
+ num_sample=32,
+ mlp_channels=[128 + 12, 128, 64, 32],  # first entry evaluates to 140
+ use_xyz=True,
+ normalize_xyz=True),
+ primitive_refine_channels=[128, 128, 128],
+ upper_thresh=100.0,
+ surface_thresh=0.5,
+ line_thresh=0.5,
+ train_cfg=dict(  # nested train_cfg; distinct from the module-level train_cfg defined later
+ far_threshold=0.6,
+ near_threshold=0.3,
+ mask_surface_threshold=0.3,
+ label_surface_threshold=0.3,
+ mask_line_threshold=0.3,
+ label_line_threshold=0.3),
+ cues_objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.3, 0.7],  # presumably [negative, positive] - TODO confirm ordering
+ reduction='mean',
+ loss_weight=5.0),
+ cues_semantic_loss=dict(  # identical settings to cues_objectness_loss above
+ type='CrossEntropyLoss',
+ class_weight=[0.3, 0.7],
+ reduction='mean',
+ loss_weight=5.0),
+ proposal_objectness_loss=dict(  # reduction is 'none' here, unlike the two 'mean' cue losses above
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],
+ reduction='none',
+ loss_weight=5.0),
+ primitive_center_loss=dict(
+ type='MSELoss', reduction='none', loss_weight=1.0))
+
+model = dict(  # top-level H3DNet model config: backbone + rpn_head (VoteHead) + roi_head (H3DRoIHead)
+ type='H3DNet',
+ backbone=dict(
+ type='MultiBackbone',
+ num_streams=4,  # matches the four entries in suffixes below
+ suffixes=['net0', 'net1', 'net2', 'net3'],
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01),
+ act_cfg=dict(type='ReLU'),
+ backbones=dict(  # a single PointNet2SASSG dict; presumably reused for every stream - TODO confirm
+ type='PointNet2SASSG',
+ in_channels=4,
+ num_points=(2048, 1024, 512, 256),
+ radius=(0.2, 0.4, 0.8, 1.2),
+ num_samples=(64, 32, 16, 16),
+ sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+ (128, 128, 256)),
+ fp_channels=((256, 256), (256, 256)),
+ norm_cfg=dict(type='BN2d'),
+ pool_mod='max')),
+ rpn_head=dict(
+ type='VoteHead',
+ vote_moudule_cfg=dict(  # NOTE(review): 'moudule' (sic) - presumably must match the consumer's keyword; confirm before renaming
+ in_channels=256,
+ vote_per_seed=1,
+ gt_per_seed=3,  # the PrimitiveHead configs earlier in this file use 1
+ conv_channels=(256, 256),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ norm_feats=True,
+ vote_loss=dict(  # only loss_dst_weight is given; no loss_src_weight is set for this loss
+ type='ChamferDistance',
+ mode='l1',
+ reduction='none',
+ loss_dst_weight=10.0)),
+ vote_aggregation_cfg=dict(
+ num_point=256,  # same value as bbox_head.num_proposal below
+ radius=0.3,
+ num_sample=16,
+ mlp_channels=[256, 128, 128, 128],
+ use_xyz=True,
+ normalize_xyz=True),
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],  # presumably [negative, positive] - TODO confirm ordering
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(  # 'l2' mode here; the PrimitiveHead configs use 'l1'
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+ roi_head=dict(
+ type='H3DRoIHead',
+ primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg],  # the three PrimitiveHead dicts defined earlier in this file
+ bbox_head=dict(
+ type='H3DBboxHead',
+ gt_per_seed=3,
+ num_proposal=256,
+ proposal_module_cfg=proposal_module_cfg,  # module-level dict defined earlier in this file
+ feat_channels=(128, 128),
+ conv_cfg=dict(type='Conv1d'),
+ norm_cfg=dict(type='BN1d'),
+ objectness_loss=dict(
+ type='CrossEntropyLoss',
+ class_weight=[0.2, 0.8],  # presumably [negative, positive] - TODO confirm ordering
+ reduction='sum',
+ loss_weight=5.0),
+ center_loss=dict(
+ type='ChamferDistance',
+ mode='l2',
+ reduction='sum',
+ loss_src_weight=10.0,
+ loss_dst_weight=10.0),
+ dir_class_loss=dict(  # classification loss weights are 0.1 in bbox_head vs 1.0 in rpn_head
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ dir_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ size_class_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
+ size_res_loss=dict(
+ type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
+ semantic_loss=dict(
+ type='CrossEntropyLoss', reduction='sum', loss_weight=0.1))))
+
+# model training and testing settings
+train_cfg = dict(  # module-level training settings with rpn / rpn_proposal / rcnn sections
+ rpn=dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),  # same values as the rcnn entry below
+ rpn_proposal=dict(use_nms=False),
+ rcnn=dict(pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'))
+
+test_cfg = dict(  # module-level testing settings with rpn / rcnn sections
+ rpn=dict(
+ sample_mod='seed',  # train_cfg in this file uses 'vote' instead
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True,
+ use_nms=False),  # only the rpn section sets use_nms; rcnn below has no such key
+ rcnn=dict(
+ sample_mod='seed',
+ nms_thr=0.25,
+ score_thr=0.05,
+ per_class_proposal=True))
diff --git a/configs/h3dnet/README.md b/configs/h3dnet/README.md
@@ -0,0 +1,19 @@
+# H3DNet: 3D Object Detection Using Hybrid Geometric Primitives
+
+## Introduction
+We implement H3DNet and provide the results and checkpoints on the ScanNet dataset.
+```
+@inproceedings{zhang2020h3dnet,
+ author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing},
+ title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives},
+ booktitle = {Proceedings of the European Conference on Computer Vision},
+ year = {2020}
+}
+```
+
+## Results
+
+### ScanNet
+| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 | Download |
+| :---------: | :-----: | :------: | :------------: | :----: | :----: | :------: |
+| [MultiBackbone](./h3dnet_scannet-3d-18class.py) | 3x | 7.9 | | 66.43 | 48.01 | [model](https://openmmlab.oss-accelerate.aliyuncs.com/mmdetection3d/v0.1.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth) &#124; [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmdetection3d/v0.1.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20200620_230238.log.json) |