diff --git a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py index 08966d7ffe..7a3a4be366 100644 --- a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py +++ b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py @@ -198,14 +198,3 @@ data_cfg=data_cfg, pipeline=val_pipeline), ) - -loss = dict( - type='MultiLossFactory', - num_stages=2, - ae_loss_type='exp', - with_ae_loss=[True, False], - push_loss_factor=[0.001, 0.001], - pull_loss_factor=[0.001, 0.001], - with_heatmaps_loss=[True, True], - heatmaps_loss_factor=[1.0, 1.0], -) diff --git a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py index 6ce95f2594..192005a6cc 100755 --- a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py +++ b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py @@ -199,14 +199,3 @@ data_cfg=data_cfg, pipeline=val_pipeline), ) - -loss = dict( - type='MultiLossFactory', - num_stages=2, - ae_loss_type='exp', - with_ae_loss=[True, False], - push_loss_factor=[0.001, 0.001], - pull_loss_factor=[0.001, 0.001], - with_heatmaps_loss=[True, True], - heatmaps_loss_factor=[1.0, 1.0], -) diff --git a/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py b/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py index 575b8b4b12..3a53b8cd7d 100755 --- a/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py +++ b/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py @@ -199,14 +199,3 @@ data_cfg=data_cfg, pipeline=val_pipeline), ) - -loss = dict( - type='MultiLossFactory', - num_stages=2, - ae_loss_type='exp', - with_ae_loss=[True, False], - push_loss_factor=[0.001, 0.001], - pull_loss_factor=[0.001, 0.001], - with_heatmaps_loss=[True, True], - heatmaps_loss_factor=[1.0, 1.0], -) diff --git a/configs/bottom_up/hrnet/README.md b/configs/bottom_up/hrnet/README.md new file mode 100644 index 0000000000..7a844590ac --- /dev/null +++ b/configs/bottom_up/hrnet/README.md @@ -0,0 +1,35 @@ +# Associative Embedding (AE) + HRNet + +## Introduction +``` +@inproceedings{newell2017associative, + title={Associative embedding: End-to-end learning for joint detection and grouping}, + author={Newell, Alejandro and Huang, Zhiao and Deng, Jia}, + booktitle={Advances in neural information processing systems}, + pages={2277--2287}, + year={2017} +} +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +## Results and models + +### Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: | +| [HRNet-w32](/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py) | 512x512 | 0.654 | 0.863 | 0.720 | 0.710 | 0.892 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512_20200816.log.json) | +| [HRNet-w48](/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py) | 512x512 | 0.665 | 0.860 | 0.727 | 0.716 | 
0.889 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512-cf72fcdf_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512_20200816.log.json) | + +### Results on COCO val2017 with multi-scale test with scales [2, 1, 0.5]. + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: | +| [HRNet-w32](/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py) | 512x512 | 0.698 | 0.877 | 0.760 | 0.748 | 0.907 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512_20200816.log.json) | +| [HRNet-w48](/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py) | 512x512 | 0.712 | 0.880 | 0.771 | 0.757 | 0.909 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512-cf72fcdf_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512_20200816.log.json) | diff --git a/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py new file mode 100644 index 0000000000..7907bd0d9c --- /dev/null +++ b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py @@ -0,0 +1,196 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/hrnet_w32-36af842e.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + ), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=32, + num_joints=17, + num_deconv_layers=0, + tag_per_joint=True, + with_ae_loss=[True], + extra=dict(final_conv_kernel=1, )), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + 
num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/hrnet/coco/hrnet_w32_coco_640x640.py b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_640x640.py new file mode 100644 index 0000000000..ba63fde5c1 --- /dev/null +++ b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_640x640.py @@ -0,0 +1,196 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=640, + base_size=320, + base_sigma=2, + heatmap_size=[160], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + 
inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/hrnet_w32-36af842e.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))), + ), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=32, + num_joints=17, + num_deconv_layers=0, + tag_per_joint=True, + with_ae_loss=[True], + extra=dict(final_conv_kernel=1, )), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py new file mode 100644 index 0000000000..7dc03c0e2b --- /dev/null 
+++ b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py @@ -0,0 +1,196 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/hrnet_w48-8ef0771d.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + ), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=48, + num_joints=17, + num_deconv_layers=0, + tag_per_joint=True, + with_ae_loss=[True], + extra=dict(final_conv_kernel=1, )), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + 
type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/hrnet/coco/hrnet_w48_coco_640x640.py b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_640x640.py new file mode 100644 index 0000000000..e881ca7ee0 --- /dev/null +++ b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_640x640.py @@ -0,0 +1,196 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=640, + base_size=320, + base_sigma=2, + heatmap_size=[160], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/hrnet_w48-8ef0771d.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384))), + ), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=48, + num_joints=17, + num_deconv_layers=0, + tag_per_joint=True, + with_ae_loss=[True], + extra=dict(final_conv_kernel=1, )), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + 
loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=8, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/mobilenet/README.md b/configs/bottom_up/mobilenet/README.md new file mode 100644 index 0000000000..d93f308da1 --- /dev/null +++ b/configs/bottom_up/mobilenet/README.md @@ -0,0 +1,33 @@ +# Associative Embedding (AE) + Mobilenetv2 + +## Introduction +``` +@inproceedings{newell2017associative, + title={Associative embedding: End-to-end learning for joint detection and grouping}, + author={Newell, Alejandro and Huang, Zhiao and Deng, Jia}, + booktitle={Advances in neural information processing systems}, + pages={2277--2287}, + year={2017} +} +@inproceedings{sandler2018mobilenetv2, + title={Mobilenetv2: Inverted residuals and linear bottlenecks}, + author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={4510--4520}, + year={2018} +} +``` + +## Results and models + +### Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: | +| [pose_mobilenetv2](/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py) | 512x512 | 0.380 | 0.671 | 0.368 | 0.473 | 0.741 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512-4d96e309_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512_20200816.log.json) | + +### Results on COCO val2017 with multi-scale test 
with scales [2, 1, 0.5]. + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: | +| [pose_mobilenetv2](/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py) | 512x512 | 0.442 | 0.696 | 0.422 | 0.517 | 0.766 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512-4d96e309_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512_20200816.log.json) | diff --git a/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py b/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py new file mode 100644 index 0000000000..04237e4011 --- /dev/null +++ b/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py @@ -0,0 +1,167 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/' + 'mobilenet_v2_batch256_20200708-3b2dc3af.pth', + backbone=dict(type='MobileNetV2', widen_factor=1., out_indices=(7, )), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=1280, + num_joints=17, + tag_per_joint=True, + with_ae_loss=[True]), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [
dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/resnet/README.md b/configs/bottom_up/resnet/README.md new file mode 100644 index 0000000000..b80e17aca2 --- /dev/null +++ b/configs/bottom_up/resnet/README.md @@ -0,0 +1,35 @@ +# Associative Embedding (AE) + ResNet + +## Introduction +``` +@inproceedings{newell2017associative, + title={Associative embedding: End-to-end learning for joint detection and grouping}, + author={Newell, Alejandro and Huang, Zhiao and Deng, Jia}, + booktitle={Advances in neural information processing systems}, + pages={2277--2287}, + year={2017} +} +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` + +## Results and models + +### Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: | +| [pose_resnet_50](/configs/bottom_up/resnet/coco/res50_coco_512x512.py) | 512x512 | 0.466 | 0.742 | 0.479 | 0.552 | 0.797 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512-5521bead_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512_20200816.log.json) | +| [pose_resnet_101](/configs/bottom_up/resnet/coco/res101_coco_512x512.py) | 512x512 | 0.554 | 0.807 | 0.599 | 0.622 | 0.841 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512-e0c95157_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512_20200816.log.json) | + +### Results on COCO val2017 with multi-scale test with scales [2, 1, 0.5]. 
+ +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: | +| [pose_resnet_50](/configs/bottom_up/resnet/coco/res50_coco_512x512.py) | 512x512 | 0.503 | 0.765 | 0.521 | 0.591 | 0.821 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512-5521bead_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512_20200816.log.json) | +| [pose_resnet_101](/configs/bottom_up/resnet/coco/res101_coco_512x512.py) | 512x512 | 0.603 | 0.831 | 0.641 | 0.668 | 0.870 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512-e0c95157_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512_20200816.log.json) | diff --git a/configs/bottom_up/resnet/coco/res101_coco_512x512.py b/configs/bottom_up/resnet/coco/res101_coco_512x512.py new file mode 100644 index 0000000000..9be26fa556 --- /dev/null +++ b/configs/bottom_up/resnet/coco/res101_coco_512x512.py @@ -0,0 +1,166 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/resnet101-5d3b4d8f.pth', + backbone=dict(type='ResNet', depth=101), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=2048, + num_joints=17, + tag_per_joint=True, + with_ae_loss=[True]), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 
0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/resnet/coco/res101_coco_640x640.py b/configs/bottom_up/resnet/coco/res101_coco_640x640.py new file mode 100644 index 0000000000..9001715ff2 --- /dev/null +++ b/configs/bottom_up/resnet/coco/res101_coco_640x640.py @@ -0,0 +1,166 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=640, + base_size=320, + base_sigma=2, + heatmap_size=[160], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/resnet101-5d3b4d8f.pth', + backbone=dict(type='ResNet', depth=101), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=2048, + num_joints=17, + tag_per_joint=True, + with_ae_loss=[True]), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + 
num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/resnet/coco/res152_coco_512x512.py b/configs/bottom_up/resnet/coco/res152_coco_512x512.py new file mode 100644 index 0000000000..42bca05e43 --- /dev/null +++ b/configs/bottom_up/resnet/coco/res152_coco_512x512.py @@ -0,0 +1,166 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/resnet152-b121ed2d.pth', + backbone=dict(type='ResNet', depth=152), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=2048, + num_joints=17, + tag_per_joint=True, + with_ae_loss=[True]), + train_cfg=dict( + 
num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/resnet/coco/res152_coco_640x640.py b/configs/bottom_up/resnet/coco/res152_coco_640x640.py new file mode 100644 index 0000000000..c144940400 --- /dev/null +++ b/configs/bottom_up/resnet/coco/res152_coco_640x640.py @@ -0,0 +1,166 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=640, + base_size=320, + base_sigma=2, + heatmap_size=[160], + 
num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/resnet152-b121ed2d.pth', + backbone=dict(type='ResNet', depth=152), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=2048, + num_joints=17, + tag_per_joint=True, + with_ae_loss=[True]), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=16, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/resnet/coco/res50_coco_512x512.py b/configs/bottom_up/resnet/coco/res50_coco_512x512.py new file mode 100644 index 0000000000..463fbf7b26 --- /dev/null +++ b/configs/bottom_up/resnet/coco/res50_coco_512x512.py @@ -0,0 +1,166 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + 
warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/resnet50-19c8e357.pth', + backbone=dict(type='ResNet', depth=50), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=2048, + num_joints=17, + tag_per_joint=True, + with_ae_loss=[True]), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/bottom_up/resnet/coco/res50_coco_640x640.py b/configs/bottom_up/resnet/coco/res50_coco_640x640.py new file mode 100644 index 0000000000..c2ccb86598 --- 
/dev/null +++ b/configs/bottom_up/resnet/coco/res50_coco_640x640.py @@ -0,0 +1,166 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=100, metric='mAP') + +optimizer = dict( + type='Adam', + lr=0.0015, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=640, + base_size=320, + base_sigma=2, + heatmap_size=[160], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='BottomUp', + pretrained='models/pytorch/imagenet/resnet50-19c8e357.pth', + backbone=dict(type='ResNet', depth=50), + keypoint_head=dict( + type='BottomUpSimpleHead', + in_channels=2048, + num_joints=17, + tag_per_joint=True, + with_ae_loss=[True]), + train_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + img_size=data_cfg['image_size']), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + scale_factor=[1], + with_heatmaps=[True], + with_ae=[True], + project2image=True, + nms_kernel=5, + nms_padding=2, + tag_per_joint=True, + detection_threshold=0.1, + tag_threshold=1, + use_detection_val=True, + ignore_too_much=False, + adjust=True, + refine=True, + flip_test=True), + loss_pose=dict( + type='MultiLossFactory', + num_joints=17, + num_stages=1, + ae_loss_type='exp', + with_ae_loss=[True], + push_loss_factor=[0.001], + pull_loss_factor=[0.001], + with_heatmaps_loss=[True], + heatmaps_loss_factor=[1.0], + ), +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='BottomUpGenerateTarget', + sigma=2, + max_num_people=30, + ), + dict( + type='Collect', + keys=['img', 'joints', 'targets', 'masks'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=24, + workers_per_gpu=1, + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='BottomUpCocoDataset', + 
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/mmpose/models/keypoint_heads/bottom_up_simple_head.py b/mmpose/models/keypoint_heads/bottom_up_simple_head.py index 5e03768cd1..69154c33ac 100644 --- a/mmpose/models/keypoint_heads/bottom_up_simple_head.py +++ b/mmpose/models/keypoint_heads/bottom_up_simple_head.py @@ -1,5 +1,6 @@ import torch.nn as nn -from mmcv.cnn import build_conv_layer, normal_init +from mmcv.cnn import (build_conv_layer, build_upsample_layer, constant_init, + normal_init) from ..registry import HEADS @@ -11,6 +12,12 @@ class BottomUpSimpleHead(nn.Module): Args: in_channels (int): Number of input channels. num_joints (int): Number of joints. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should be >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length should equal num_deconv_layers. + num_deconv_kernels (list|tuple): Kernel sizes. tag_per_joint (bool): If tag_per_joint is True, the dimension of tags equals to num_joints, else the dimension of tags is 1. Default: True @@ -20,6 +27,9 @@ def __init__(self, in_channels, num_joints, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + tag_per_joint=True, with_ae_loss=None, extra=None): @@ -35,6 +45,18 @@ def __init__(self, if extra is not None and not isinstance(extra, dict): raise TypeError('extra should be dict or None.') + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should be >= 0.') + if extra is not None and 'final_conv_kernel' in extra: assert extra['final_conv_kernel'] in [1, 3] if extra['final_conv_kernel'] == 3: @@ -46,9 +68,10 @@ def __init__(self, kernel_size = 1 padding = 0 - self.final_layers = build_conv_layer( + self.final_layer = build_conv_layer( cfg=dict(type='Conv2d'), - in_channels=in_channels, + in_channels=num_deconv_filters[-1] + if num_deconv_layers > 0 else in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, @@ -59,12 +82,67 @@ def forward(self, x): if isinstance(x, list): x = x[0] final_outputs = [] - y = self.final_layers(x) + x = self.deconv_layers(x) + y = self.final_layer(x) final_outputs.append(y) return final_outputs + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding,
+ bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + def _get_deconv_cfg(self, deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + def init_weights(self): """Initialize model weights.""" - for m in self.final_layers.modules(): + for name, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): if isinstance(m, nn.Conv2d): normal_init(m, std=0.001, bias=0) diff --git a/tests/test_model/test_bottom_up_forward.py b/tests/test_model/test_bottom_up_forward.py index 61d1d027a5..6207104ccc 100644 --- a/tests/test_model/test_bottom_up_forward.py +++ b/tests/test_model/test_bottom_up_forward.py @@ -13,8 +13,10 @@ def test_bottomup_forward(): type='BottomUpSimpleHead', in_channels=512, num_joints=17, + num_deconv_layers=0, + tag_per_joint=True, with_ae_loss=[True], - extra={'final_conv_kernel': 3}), + extra=dict(final_conv_kernel=1, )), train_cfg=dict(), test_cfg=dict( num_joints=17, diff --git a/tests/test_model/test_bottom_up_head.py b/tests/test_model/test_bottom_up_head.py index e96abd31ba..dc2b95b4fe 100644 --- a/tests/test_model/test_bottom_up_head.py +++ b/tests/test_model/test_bottom_up_head.py @@ -25,22 +25,23 @@ def test_bottom_up_simple_head(): with_ae_loss=[True], extra={'final_conv_kernel': 3}) head.init_weights() - assert head.final_layers.padding == (1, 1) + assert head.final_layer.padding == (1, 1) head = BottomUpSimpleHead( in_channels=512, num_joints=17, with_ae_loss=[True], extra={'final_conv_kernel': 1}) head.init_weights() - assert head.final_layers.padding == (0, 0) + assert head.final_layer.padding == (0, 0) head = BottomUpSimpleHead( in_channels=512, num_joints=17, with_ae_loss=[True]) head.init_weights() - assert head.final_layers.padding == (0, 0) + assert head.final_layer.padding == (0, 0) # test with_ae_loss head = BottomUpSimpleHead( in_channels=512, num_joints=17, + num_deconv_layers=0, with_ae_loss=[True], extra={'final_conv_kernel': 3}) head.init_weights() @@ -51,6 +52,7 @@ def test_bottom_up_simple_head(): head = BottomUpSimpleHead( in_channels=512, num_joints=17, + num_deconv_layers=0, with_ae_loss=[False], extra={'final_conv_kernel': 3}) head.init_weights() @@ -62,6 +64,7 @@ def test_bottom_up_simple_head(): head = BottomUpSimpleHead( in_channels=512, num_joints=17, + num_deconv_layers=0, tag_per_joint=False, with_ae_loss=[False], extra={'final_conv_kernel': 3}) @@ -73,6 +76,7 @@ def test_bottom_up_simple_head(): head = BottomUpSimpleHead( in_channels=512, num_joints=17, + num_deconv_layers=0, tag_per_joint=False, with_ae_loss=[True], extra={'final_conv_kernel': 3}) @@ -84,6 +88,7 @@ def test_bottom_up_simple_head(): head = BottomUpSimpleHead( in_channels=512, num_joints=17, + num_deconv_layers=0, tag_per_joint=False, with_ae_loss=[True], extra={'final_conv_kernel': 3})
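A quick sanity check on the `(kernel, padding, output_padding)` triples returned by `_get_deconv_cfg`: with `stride=2`, each supported kernel size (4, 3, or 2) yields an exact 2x spatial upsampling per deconv layer, following the transposed-convolution size formula `out = (in - 1) * stride - 2 * padding + kernel + output_padding`. Below is a minimal standalone PyTorch sketch of that check (not part of this patch; the channel count and feature-map size are arbitrary):

```python
import torch
import torch.nn as nn

# (padding, output_padding) per supported kernel size, mirroring _get_deconv_cfg.
deconv_cfgs = {4: (1, 0), 3: (1, 1), 2: (0, 0)}

x = torch.randn(1, 32, 128, 128)  # arbitrary N x C x H x W feature map
for kernel, (padding, output_padding) in deconv_cfgs.items():
    deconv = nn.ConvTranspose2d(
        in_channels=32,
        out_channels=32,
        kernel_size=kernel,
        stride=2,
        padding=padding,
        output_padding=output_padding,
        bias=False)
    # out = (128 - 1) * 2 - 2 * padding + kernel + output_padding = 256
    assert deconv(x).shape[-2:] == (256, 256)
```

This is also why the configs above differ in `num_deconv_layers`: the HRNet configs set `num_deconv_layers=0` because the backbone already outputs features at 1/4 of the input size, matching `heatmap_size` directly, while the ResNet and MobileNetV2 configs keep the default of three deconv layers to upsample the 1/32-resolution backbone output by 2^3 to the same 1/4 scale.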