diff --git a/configs/hand/mobilenet_v2/README.md b/configs/hand/mobilenet_v2/README.md
new file mode 100644
index 0000000000..ecf6ff38df
--- /dev/null
+++ b/configs/hand/mobilenet_v2/README.md
@@ -0,0 +1,28 @@
+# Mobilenetv2: Inverted residuals and linear bottlenecks
+
+## Introduction
+```
+@inproceedings{sandler2018mobilenetv2,
+  title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+  author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+  pages={4510--4520},
+  year={2018}
+}
+```
+
+## Results and models
+
+### 2D Hand Pose Estimation
+
+#### Results on OneHand10K val set
+
+| Arch | Input Size | PCK@0.2 | AUC | EPE | ckpt | log |
+| :--- | :--------: | :------: | :------: | :------: |:------: |:------: |
+| [pose_mobilenet_v2](/configs/hand/mobilenet_v2/onehand10k/mobilenetv2_onehand10k_256x256.py) | 256x256 | 0.984 | 0.526 | 29.52 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_onehand10k_256x256-55d34d7d_20201218.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_onehand10k_256x256_20201218.log.json) |
+
+#### Results on CMU Panoptic (MPII+NZSL val set)
+
+| Arch | Input Size | PCKh@0.7 | AUC | EPE | ckpt | log |
+| :--- | :--------: | :------: | :------: | :------: |:------: |:------: |
+| [pose_mobilenet_v2](/configs/hand/mobilenet_v2/panoptic/mobilenetv2_panoptic_256x256.py) | 256x256 | 0.998 | 0.684 | 10.09 | [ckpt](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_panoptic_256x256-b9ec9b68_20201218.pth) | [log](https://download.openmmlab.com/mmpose/top_down/mobilenetv2/mobilenetv2_panoptic_256x256_20201218.log.json) |
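As a minimal sketch of pulling one of the checkpoints above into a model for offline inspection (assuming an mmpose + mmcv installation; `Config.fromfile`, `build_posenet`, and `load_checkpoint` are the standard mmcv/mmpose helpers, and the config path and checkpoint URL are copied from the OneHand10K row of the table):

```
# Minimal loading sketch; not part of this PR, for illustration only.
from mmcv import Config
from mmcv.runner import load_checkpoint

from mmpose.models import build_posenet

cfg = Config.fromfile(
    'configs/hand/mobilenet_v2/onehand10k/mobilenetv2_onehand10k_256x256.py')
model = build_posenet(cfg.model)
load_checkpoint(
    model,
    'https://download.openmmlab.com/mmpose/top_down/mobilenetv2/'
    'mobilenetv2_onehand10k_256x256-55d34d7d_20201218.pth',
    map_location='cpu')
model.eval()  # 21-channel heatmap head on a MobileNetV2 backbone
```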
diff --git a/configs/hand/mobilenet_v2/onehand10k/mobilenetv2_onehand10k_256x256.py b/configs/hand/mobilenet_v2/onehand10k/mobilenetv2_onehand10k_256x256.py
new file mode 100644
index 0000000000..d319e2e284
--- /dev/null
+++ b/configs/hand/mobilenet_v2/onehand10k/mobilenetv2_onehand10k_256x256.py
@@ -0,0 +1,130 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(
+    interval=10, metric=['PCK', 'AUC', 'EPE'], key_indicator='AUC')
+
+optimizer = dict(
+    type='Adam',
+    lr=5e-4,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[170, 200])
+total_epochs = 210
+log_config = dict(
+    interval=10,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+
+channel_cfg = dict(
+    num_output_channels=21,
+    dataset_joints=21,
+    dataset_channel=[
+        [
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+            19, 20
+        ],
+    ],
+    inference_channel=[
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+        20
+    ])
+
+# model settings
+model = dict(
+    type='TopDown',
+    pretrained='mmcls://mobilenet_v2',
+    backbone=dict(type='MobileNetV2', widen_factor=1., out_indices=(7, )),
+    keypoint_head=dict(
+        type='TopDownSimpleHead',
+        in_channels=1280,
+        out_channels=channel_cfg['num_output_channels'],
+    ),
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=True,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11),
+    loss_pose=dict(type='JointsMSELoss', use_target_weight=True))
+
+data_cfg = dict(
+    image_size=[256, 256],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'],
+)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownRandomFlip', flip_prob=0.5),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=20, scale_factor=0.3),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTarget', sigma=2),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'flip_pairs'
+        ]),
+]
+
+val_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=['image_file', 'center', 'scale', 'rotation', 'flip_pairs']),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/onehand10k'
+data = dict(
+    samples_per_gpu=64,
+    workers_per_gpu=2,
+    train=dict(
+        type='OneHand10KDataset',
+        ann_file=f'{data_root}/annotations/onehand10k_train.json',
+        img_prefix=f'{data_root}/',
+        data_cfg=data_cfg,
+        pipeline=train_pipeline),
+    val=dict(
+        type='OneHand10KDataset',
+        ann_file=f'{data_root}/annotations/onehand10k_test.json',
+        img_prefix=f'{data_root}/',
+        data_cfg=data_cfg,
+        pipeline=val_pipeline),
+    test=dict(
+        type='OneHand10KDataset',
+        ann_file=f'{data_root}/annotations/onehand10k_test.json',
+        img_prefix=f'{data_root}/',
+        data_cfg=data_cfg,
+        pipeline=test_pipeline),
+)
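In both configs the supervision is a per-joint Gaussian heatmap: with `image_size=[256, 256]` and `heatmap_size=[64, 64]`, keypoint coordinates are mapped to the 4x-downsampled heatmap grid and rendered with `sigma=2`. The sketch below is an illustrative re-implementation of that idea, not mmpose's exact `TopDownGenerateTarget` code (which additionally truncates the Gaussian to a local window and derives `target_weight` from joint visibility):

```
import numpy as np

def render_gaussian(heatmap_size=(64, 64), keypoint=(32.0, 24.0), sigma=2):
    # One joint -> one heatmap with a unit-height Gaussian at the keypoint.
    w, h = heatmap_size
    xs = np.arange(w, dtype=np.float32)[None, :]
    ys = np.arange(h, dtype=np.float32)[:, None]
    x0, y0 = keypoint
    return np.exp(-((xs - x0) ** 2 + (ys - y0) ** 2) / (2 * sigma ** 2))

# 21 hand joints -> a (21, 64, 64) target tensor for JointsMSELoss.
joints = np.random.uniform(0, 256, size=(21, 2))  # pixels in the 256x256 crop
target = np.stack([render_gaussian(keypoint=tuple(xy / 4)) for xy in joints])
```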
diff --git a/configs/hand/mobilenet_v2/panoptic/mobilenetv2_panoptic_256x256.py b/configs/hand/mobilenet_v2/panoptic/mobilenetv2_panoptic_256x256.py
new file mode 100644
index 0000000000..08c2ffe2bc
--- /dev/null
+++ b/configs/hand/mobilenet_v2/panoptic/mobilenetv2_panoptic_256x256.py
@@ -0,0 +1,130 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(
+    interval=10, metric=['PCKh', 'AUC', 'EPE'], key_indicator='AUC')
+
+optimizer = dict(
+    type='Adam',
+    lr=5e-4,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[170, 200])
+total_epochs = 210
+log_config = dict(
+    interval=10,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+
+channel_cfg = dict(
+    num_output_channels=21,
+    dataset_joints=21,
+    dataset_channel=[
+        [
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+            19, 20
+        ],
+    ],
+    inference_channel=[
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+        20
+    ])
+
+# model settings
+model = dict(
+    type='TopDown',
+    pretrained='mmcls://mobilenet_v2',
+    backbone=dict(type='MobileNetV2', widen_factor=1., out_indices=(7, )),
+    keypoint_head=dict(
+        type='TopDownSimpleHead',
+        in_channels=1280,
+        out_channels=channel_cfg['num_output_channels'],
+    ),
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=True,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11),
+    loss_pose=dict(type='JointsMSELoss', use_target_weight=True))
+
+data_cfg = dict(
+    image_size=[256, 256],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'],
+)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownRandomFlip', flip_prob=0.5),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=20, scale_factor=0.3),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTarget', sigma=2),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'flip_pairs'
+        ]),
+]
+
+val_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=['image_file', 'center', 'scale', 'rotation', 'flip_pairs']),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/panoptic'
+data = dict(
+    samples_per_gpu=64,
+    workers_per_gpu=2,
+    train=dict(
+        type='PanopticDataset',
+        ann_file=f'{data_root}/annotations/panoptic_train.json',
+        img_prefix=f'{data_root}/',
+        data_cfg=data_cfg,
+        pipeline=train_pipeline),
+    val=dict(
+        type='PanopticDataset',
+        ann_file=f'{data_root}/annotations/panoptic_test.json',
+        img_prefix=f'{data_root}/',
+        data_cfg=data_cfg,
+        pipeline=val_pipeline),
+    test=dict(
+        type='PanopticDataset',
+        ann_file=f'{data_root}/annotations/panoptic_test.json',
+        img_prefix=f'{data_root}/',
+        data_cfg=data_cfg,
+        pipeline=test_pipeline),
+)
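The two configs differ only in the dataset type, the annotation paths, and the distance metric (PCK for OneHand10K, PCKh for Panoptic). As a rough reference for the numbers in the README tables, the reported metrics boil down to the simplified definitions below; this is a sketch of the idea, not mmpose's evaluation code (which, e.g., normalizes PCKh by head size, PCK by bounding-box size, and computes AUC by integrating PCK over a range of thresholds):

```
import numpy as np

def keypoint_epe(pred, gt, visible):
    # EPE: mean Euclidean distance (pixels) over visible joints.
    dist = np.linalg.norm(pred - gt, axis=-1)
    return dist[visible].mean()

def keypoint_pck(pred, gt, visible, thr, normalize):
    # PCK: fraction of visible joints whose normalized error is below `thr`
    # (thr=0.2 with bbox-size normalization gives the PCK@0.2 column).
    dist = np.linalg.norm(pred - gt, axis=-1) / normalize
    return (dist[visible] < thr).mean()

# Toy usage: pred, gt are (N, 21, 2); visible is an (N, 21) boolean mask.
pred = np.random.rand(4, 21, 2) * 64
gt = pred + np.random.randn(4, 21, 2)
mask = np.ones((4, 21), dtype=bool)
print(keypoint_epe(pred, gt, mask), keypoint_pck(pred, gt, mask, 0.2, 64))
```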