diff --git a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py
index 08966d7ffe..7a3a4be366 100644
--- a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py
+++ b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_512x512.py
@@ -198,14 +198,3 @@
data_cfg=data_cfg,
pipeline=val_pipeline),
)
-
-loss = dict(
- type='MultiLossFactory',
- num_stages=2,
- ae_loss_type='exp',
- with_ae_loss=[True, False],
- push_loss_factor=[0.001, 0.001],
- pull_loss_factor=[0.001, 0.001],
- with_heatmaps_loss=[True, True],
- heatmaps_loss_factor=[1.0, 1.0],
-)
diff --git a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py
index 6ce95f2594..192005a6cc 100755
--- a/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py
+++ b/configs/bottom_up/higherhrnet/coco/higher_hrnet32_coco_640x640.py
@@ -199,14 +199,3 @@
data_cfg=data_cfg,
pipeline=val_pipeline),
)
-
-loss = dict(
- type='MultiLossFactory',
- num_stages=2,
- ae_loss_type='exp',
- with_ae_loss=[True, False],
- push_loss_factor=[0.001, 0.001],
- pull_loss_factor=[0.001, 0.001],
- with_heatmaps_loss=[True, True],
- heatmaps_loss_factor=[1.0, 1.0],
-)
diff --git a/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py b/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py
index 575b8b4b12..3a53b8cd7d 100755
--- a/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py
+++ b/configs/bottom_up/higherhrnet/coco/higher_hrnet48_coco_512x512.py
@@ -199,14 +199,3 @@
data_cfg=data_cfg,
pipeline=val_pipeline),
)
-
-loss = dict(
- type='MultiLossFactory',
- num_stages=2,
- ae_loss_type='exp',
- with_ae_loss=[True, False],
- push_loss_factor=[0.001, 0.001],
- pull_loss_factor=[0.001, 0.001],
- with_heatmaps_loss=[True, True],
- heatmaps_loss_factor=[1.0, 1.0],
-)
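
The `loss` dict deleted from these three HigherHRNet configs was orphaned top-level configuration: the loss is built from the model config, so it belongs inside `model` as `loss_pose`, which is how the new configs below define it. A hypothetical sketch of the relocated two-stage block, reusing the field names of the removed dict plus the `num_joints` key that the new configs pass to `MultiLossFactory`:

```python
# Hypothetical sketch: the removed top-level `loss` dict, relocated into the
# model config as `loss_pose` (two stages, as HigherHRNet supervises two
# heatmap scales; `num_joints=17` follows the new single-stage configs below).
loss_pose = dict(
    type='MultiLossFactory',
    num_joints=17,
    num_stages=2,
    ae_loss_type='exp',
    with_ae_loss=[True, False],      # AE grouping loss on the first scale only
    push_loss_factor=[0.001, 0.001],
    pull_loss_factor=[0.001, 0.001],
    with_heatmaps_loss=[True, True],
    heatmaps_loss_factor=[1.0, 1.0],
)
```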
diff --git a/configs/bottom_up/hrnet/README.md b/configs/bottom_up/hrnet/README.md
new file mode 100644
index 0000000000..7a844590ac
--- /dev/null
+++ b/configs/bottom_up/hrnet/README.md
@@ -0,0 +1,35 @@
+# Associative Embedding (AE) + HRNet
+
+## Introduction
+```
+@inproceedings{newell2017associative,
+ title={Associative embedding: End-to-end learning for joint detection and grouping},
+ author={Newell, Alejandro and Huang, Zhiao and Deng, Jia},
+ booktitle={Advances in neural information processing systems},
+ pages={2277--2287},
+ year={2017}
+}
+@inproceedings{sun2019deep,
+ title={Deep high-resolution representation learning for human pose estimation},
+ author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={5693--5703},
+ year={2019}
+}
+```
+
+## Results and models
+
+### Results on COCO val2017 without multi-scale test
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
+| [HRNet-w32](/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py) | 512x512 | 0.654 | 0.863 | 0.720 | 0.710 | 0.892 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512_20200816.log.json) |
+| [HRNet-w48](/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py) | 512x512 | 0.665 | 0.860 | 0.727 | 0.716 | 0.889 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512-cf72fcdf_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512_20200816.log.json) |
+
+### Results on COCO val2017 with multi-scale test (scales = [2, 1, 0.5])
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
+| [HRNet-w32](/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py) | 512x512 | 0.698 | 0.877 | 0.760 | 0.748 | 0.907 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512-bcb8c247_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w32_coco_512x512_20200816.log.json) |
+| [HRNet-w48](/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py) | 512x512 | 0.712 | 0.880 | 0.771 | 0.757 | 0.909 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512-cf72fcdf_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/hrnet_w48_coco_512x512_20200816.log.json) |
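
As a quick sanity check that a newly added config parses, it can be loaded with mmcv's standard config loader (a minimal sketch, assuming mmcv is installed and the repository root is the working directory):

```python
from mmcv import Config  # mmcv's standard config loader

cfg = Config.fromfile('configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py')
print(cfg.model.keypoint_head.type)  # 'BottomUpSimpleHead'
print(cfg.data_cfg.image_size)       # 512
```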
diff --git a/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py
new file mode 100644
index 0000000000..7907bd0d9c
--- /dev/null
+++ b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_512x512.py
@@ -0,0 +1,196 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=512,
+ base_size=256,
+ base_sigma=2,
+ heatmap_size=[128],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/hrnet_w32-36af842e.pth',
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ ),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=32,
+ num_joints=17,
+ num_deconv_layers=0,
+ tag_per_joint=True,
+ with_ae_loss=[True],
+ extra=dict(final_conv_kernel=1, )),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=24,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
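
Note how `heatmap_size=[128]` and `num_deconv_layers=0` fit together in this config: HRNet keeps a high-resolution branch at 1/4 of the input resolution, so the head predicts directly at 512 / 4 = 128 with no upsampling. A one-line check (the stride-4 figure is an assumption from the HRNet design, not stated in the config itself):

```python
image_size, hrnet_output_stride = 512, 4  # stride 4: HRNet's high-res branch
assert image_size // hrnet_output_stride == 128  # matches heatmap_size=[128]
```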
diff --git a/configs/bottom_up/hrnet/coco/hrnet_w32_coco_640x640.py b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_640x640.py
new file mode 100644
index 0000000000..ba63fde5c1
--- /dev/null
+++ b/configs/bottom_up/hrnet/coco/hrnet_w32_coco_640x640.py
@@ -0,0 +1,196 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=640,
+ base_size=320,
+ base_sigma=2,
+ heatmap_size=[160],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/hrnet_w32-36af842e.pth',
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(32, 64)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(32, 64, 128)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(32, 64, 128, 256))),
+ ),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=32,
+ num_joints=17,
+ num_deconv_layers=0,
+ tag_per_joint=True,
+ with_ae_loss=[True],
+ extra=dict(final_conv_kernel=1, )),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py
new file mode 100644
index 0000000000..7dc03c0e2b
--- /dev/null
+++ b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_512x512.py
@@ -0,0 +1,196 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=512,
+ base_size=256,
+ base_sigma=2,
+ heatmap_size=[128],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/hrnet_w48-8ef0771d.pth',
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ ),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=48,
+ num_joints=17,
+ num_deconv_layers=0,
+ tag_per_joint=True,
+ with_ae_loss=[True],
+ extra=dict(final_conv_kernel=1, )),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/configs/bottom_up/hrnet/coco/hrnet_w48_coco_640x640.py b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_640x640.py
new file mode 100644
index 0000000000..e881ca7ee0
--- /dev/null
+++ b/configs/bottom_up/hrnet/coco/hrnet_w48_coco_640x640.py
@@ -0,0 +1,196 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=640,
+ base_size=320,
+ base_sigma=2,
+ heatmap_size=[160],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/hrnet_w48-8ef0771d.pth',
+ backbone=dict(
+ type='HRNet',
+ in_channels=3,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(48, 96)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(48, 96, 192)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(48, 96, 192, 384))),
+ ),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=48,
+ num_joints=17,
+ num_deconv_layers=0,
+ tag_per_joint=True,
+ with_ae_loss=[True],
+ extra=dict(final_conv_kernel=1, )),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/configs/bottom_up/mobilenet/README.md b/configs/bottom_up/mobilenet/README.md
new file mode 100644
index 0000000000..d93f308da1
--- /dev/null
+++ b/configs/bottom_up/mobilenet/README.md
@@ -0,0 +1,33 @@
+# Associative Embedding (AE) + MobileNetV2
+
+## Introduction
+```
+@inproceedings{newell2017associative,
+ title={Associative embedding: End-to-end learning for joint detection and grouping},
+ author={Newell, Alejandro and Huang, Zhiao and Deng, Jia},
+ booktitle={Advances in neural information processing systems},
+ pages={2277--2287},
+ year={2017}
+}
+@inproceedings{sandler2018mobilenetv2,
+ title={Mobilenetv2: Inverted residuals and linear bottlenecks},
+ author={Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={4510--4520},
+ year={2018}
+}
+```
+
+## Results and models
+
+### Results on COCO val2017 without multi-scale test
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
+| [pose_mobilenetv2](/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py) | 512x512 | 0.380 | 0.671 | 0.368 | 0.473 | 0.741 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512-4d96e309_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512_20200816.log.json) |
+
+### Results on COCO val2017 with multi-scale test (scales = [2, 1, 0.5])
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
+| [pose_mobilenetv2](/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py) | 512x512 | 0.442 | 0.696 | 0.422 | 0.517 | 0.766 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512-4d96e309_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/mobilenetv2_coco_512x512_20200816.log.json) |
diff --git a/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py b/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py
new file mode 100644
index 0000000000..04237e4011
--- /dev/null
+++ b/configs/bottom_up/mobilenet/coco/mobilenetv2_coco_512x512.py
@@ -0,0 +1,167 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=512,
+ base_size=256,
+ base_sigma=2,
+ heatmap_size=[128],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/'
+ 'mobilenet_v2_batch256_20200708-3b2dc3af.pth',
+ backbone=dict(type='MobileNetV2', widen_factor=1., out_indices=(7, )),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=1280,
+ num_joints=17,
+ tag_per_joint=True,
+ with_ae_loss=[True]),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=24,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
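
The `in_channels=1280` in this head matches the 1280-channel final conv stage of MobileNetV2 that `out_indices=(7, )` selects. A hedged cross-check against torchvision's independent MobileNetV2 implementation (used only to confirm the channel count, not the mmpose backbone itself):

```python
import torch
from torchvision.models import mobilenet_v2

# torchvision's reference MobileNetV2 also ends in a 1280-channel stage at
# stride 32, consistent with in_channels=1280 in the head above.
feats = mobilenet_v2().features(torch.randn(1, 3, 512, 512))
print(feats.shape)  # torch.Size([1, 1280, 16, 16])
```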
diff --git a/configs/bottom_up/resnet/README.md b/configs/bottom_up/resnet/README.md
new file mode 100644
index 0000000000..b80e17aca2
--- /dev/null
+++ b/configs/bottom_up/resnet/README.md
@@ -0,0 +1,35 @@
+# Associative Embedding (AE) + ResNet
+
+## Introduction
+```
+@inproceedings{newell2017associative,
+ title={Associative embedding: End-to-end learning for joint detection and grouping},
+ author={Newell, Alejandro and Huang, Zhiao and Deng, Jia},
+ booktitle={Advances in neural information processing systems},
+ pages={2277--2287},
+ year={2017}
+}
+@inproceedings{he2016deep,
+ title={Deep residual learning for image recognition},
+ author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={770--778},
+ year={2016}
+}
+```
+
+## Results and models
+
+### Results on COCO val2017 without multi-scale test
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
+| [pose_resnet_50](/configs/bottom_up/resnet/coco/res50_coco_512x512.py) | 512x512 | 0.466 | 0.742 | 0.479 | 0.552 | 0.797 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512-5521bead_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512_20200816.log.json) |
+| [pose_resnet_101](/configs/bottom_up/resnet/coco/res101_coco_512x512.py) | 512x512 | 0.554 | 0.807 | 0.599 | 0.622 | 0.841 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512-e0c95157_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512_20200816.log.json) |
+
+### Results on COCO val2017 with multi-scale test (scales = [2, 1, 0.5])
+
+| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log |
+| :----------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: |
+| [pose_resnet_50](/configs/bottom_up/resnet/coco/res50_coco_512x512.py) | 512x512 | 0.503 | 0.765 | 0.521 | 0.591 | 0.821 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512-5521bead_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res50_coco_512x512_20200816.log.json) |
+| [pose_resnet_101](/configs/bottom_up/resnet/coco/res101_coco_512x512.py) | 512x512 | 0.603 | 0.831 | 0.641 | 0.668 | 0.870 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512-e0c95157_20200816.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/bottom_up/res101_coco_512x512_20200816.log.json) |
diff --git a/configs/bottom_up/resnet/coco/res101_coco_512x512.py b/configs/bottom_up/resnet/coco/res101_coco_512x512.py
new file mode 100644
index 0000000000..9be26fa556
--- /dev/null
+++ b/configs/bottom_up/resnet/coco/res101_coco_512x512.py
@@ -0,0 +1,166 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=512,
+ base_size=256,
+ base_sigma=2,
+ heatmap_size=[128],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/resnet101-5d3b4d8f.pth',
+ backbone=dict(type='ResNet', depth=101),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=2048,
+ num_joints=17,
+ tag_per_joint=True,
+ with_ae_loss=[True]),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/configs/bottom_up/resnet/coco/res101_coco_640x640.py b/configs/bottom_up/resnet/coco/res101_coco_640x640.py
new file mode 100644
index 0000000000..9001715ff2
--- /dev/null
+++ b/configs/bottom_up/resnet/coco/res101_coco_640x640.py
@@ -0,0 +1,166 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=640,
+ base_size=320,
+ base_sigma=2,
+ heatmap_size=[160],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/resnet101-5d3b4d8f.pth',
+ backbone=dict(type='ResNet', depth=101),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=2048,
+ num_joints=17,
+ tag_per_joint=True,
+ with_ae_loss=[True]),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/configs/bottom_up/resnet/coco/res152_coco_512x512.py b/configs/bottom_up/resnet/coco/res152_coco_512x512.py
new file mode 100644
index 0000000000..42bca05e43
--- /dev/null
+++ b/configs/bottom_up/resnet/coco/res152_coco_512x512.py
@@ -0,0 +1,166 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=512,
+ base_size=256,
+ base_sigma=2,
+ heatmap_size=[128],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/resnet152-b121ed2d.pth',
+ backbone=dict(type='ResNet', depth=152),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=2048,
+ num_joints=17,
+ tag_per_joint=True,
+ with_ae_loss=[True]),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/configs/bottom_up/resnet/coco/res152_coco_640x640.py b/configs/bottom_up/resnet/coco/res152_coco_640x640.py
new file mode 100644
index 0000000000..c144940400
--- /dev/null
+++ b/configs/bottom_up/resnet/coco/res152_coco_640x640.py
@@ -0,0 +1,166 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=640,
+ base_size=320,
+ base_sigma=2,
+ heatmap_size=[160],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/resnet152-b121ed2d.pth',
+ backbone=dict(type='ResNet', depth=152),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=2048,
+ num_joints=17,
+ tag_per_joint=True,
+ with_ae_loss=[True]),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=16,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/configs/bottom_up/resnet/coco/res50_coco_512x512.py b/configs/bottom_up/resnet/coco/res50_coco_512x512.py
new file mode 100644
index 0000000000..463fbf7b26
--- /dev/null
+++ b/configs/bottom_up/resnet/coco/res50_coco_512x512.py
@@ -0,0 +1,166 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=512,
+ base_size=256,
+ base_sigma=2,
+ heatmap_size=[128],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/resnet50-19c8e357.pth',
+ backbone=dict(type='ResNet', depth=50),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=2048,
+ num_joints=17,
+ tag_per_joint=True,
+ with_ae_loss=[True]),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=24,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
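
Unlike the HRNet configs, the ResNet configs leave `num_deconv_layers` at the head's new default of 3 (see `bottom_up_simple_head.py` below): ResNet-50 ends at stride 32, and three stride-2 deconv layers recover the stride-4 heatmap resolution. A quick arithmetic sketch:

```python
feature_size = 512 // 32    # ResNet-50 output: stride 32 -> 16x16
for _ in range(3):          # default num_deconv_layers=3, each with stride=2
    feature_size *= 2
assert feature_size == 128  # matches heatmap_size=[128]
```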
diff --git a/configs/bottom_up/resnet/coco/res50_coco_640x640.py b/configs/bottom_up/resnet/coco/res50_coco_640x640.py
new file mode 100644
index 0000000000..c2ccb86598
--- /dev/null
+++ b/configs/bottom_up/resnet/coco/res50_coco_640x640.py
@@ -0,0 +1,166 @@
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=10)
+evaluation = dict(interval=100, metric='mAP')
+
+optimizer = dict(
+ type='Adam',
+ lr=0.0015,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+ policy='step',
+ warmup='linear',
+ warmup_iters=500,
+ warmup_ratio=0.001,
+ step=[200, 260])
+total_epochs = 300
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook'),
+ # dict(type='TensorboardLoggerHook')
+ ])
+
+channel_cfg = dict(
+ dataset_joints=17,
+ dataset_channel=[
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ ],
+ inference_channel=[
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ ])
+
+data_cfg = dict(
+ image_size=640,
+ base_size=320,
+ base_sigma=2,
+ heatmap_size=[160],
+ num_joints=channel_cfg['dataset_joints'],
+ dataset_channel=channel_cfg['dataset_channel'],
+ inference_channel=channel_cfg['inference_channel'],
+ num_scales=1,
+ scale_aware_sigma=False,
+)
+
+# model settings
+model = dict(
+ type='BottomUp',
+ pretrained='models/pytorch/imagenet/resnet50-19c8e357.pth',
+ backbone=dict(type='ResNet', depth=50),
+ keypoint_head=dict(
+ type='BottomUpSimpleHead',
+ in_channels=2048,
+ num_joints=17,
+ tag_per_joint=True,
+ with_ae_loss=[True]),
+ train_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ img_size=data_cfg['image_size']),
+ test_cfg=dict(
+ num_joints=channel_cfg['dataset_joints'],
+ max_num_people=30,
+ scale_factor=[1],
+ with_heatmaps=[True],
+ with_ae=[True],
+ project2image=True,
+ nms_kernel=5,
+ nms_padding=2,
+ tag_per_joint=True,
+ detection_threshold=0.1,
+ tag_threshold=1,
+ use_detection_val=True,
+ ignore_too_much=False,
+ adjust=True,
+ refine=True,
+ flip_test=True),
+ loss_pose=dict(
+ type='MultiLossFactory',
+ num_joints=17,
+ num_stages=1,
+ ae_loss_type='exp',
+ with_ae_loss=[True],
+ push_loss_factor=[0.001],
+ pull_loss_factor=[0.001],
+ with_heatmaps_loss=[True],
+ heatmaps_loss_factor=[1.0],
+ ),
+)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='BottomUpRandomAffine',
+ rot_factor=30,
+ scale_factor=[0.75, 1.5],
+ scale_type='short',
+ trans_factor=40),
+ dict(type='BottomUpRandomFlip', flip_prob=0.5),
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ dict(
+ type='BottomUpGenerateTarget',
+ sigma=2,
+ max_num_people=30,
+ ),
+ dict(
+ type='Collect',
+ keys=['img', 'joints', 'targets', 'masks'],
+ meta_keys=[]),
+]
+
+val_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='BottomUpGetImgSize', test_scale_factor=[1]),
+ dict(
+ type='BottomUpResizeAlign',
+ transforms=[
+ dict(type='ToTensor'),
+ dict(
+ type='NormalizeTensor',
+ mean=[0.485, 0.456, 0.406],
+ std=[0.229, 0.224, 0.225]),
+ ]),
+ dict(
+ type='Collect',
+ keys=[
+ 'img',
+ ],
+ meta_keys=[
+ 'image_file', 'aug_data', 'test_scale_factor', 'base_size',
+ 'center', 'scale', 'flip_index'
+ ]),
+]
+
+test_pipeline = val_pipeline
+
+data_root = 'data/coco'
+data = dict(
+ samples_per_gpu=24,
+ workers_per_gpu=1,
+ train=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+ img_prefix=f'{data_root}/train2017/',
+ data_cfg=data_cfg,
+ pipeline=train_pipeline),
+ val=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+ test=dict(
+ type='BottomUpCocoDataset',
+ ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+ img_prefix=f'{data_root}/val2017/',
+ data_cfg=data_cfg,
+ pipeline=val_pipeline),
+)
diff --git a/mmpose/models/keypoint_heads/bottom_up_simple_head.py b/mmpose/models/keypoint_heads/bottom_up_simple_head.py
index 5e03768cd1..69154c33ac 100644
--- a/mmpose/models/keypoint_heads/bottom_up_simple_head.py
+++ b/mmpose/models/keypoint_heads/bottom_up_simple_head.py
@@ -1,5 +1,6 @@
import torch.nn as nn
-from mmcv.cnn import build_conv_layer, normal_init
+from mmcv.cnn import (build_conv_layer, build_upsample_layer, constant_init,
+ normal_init)
from ..registry import HEADS
@@ -11,6 +12,12 @@ class BottomUpSimpleHead(nn.Module):
Args:
in_channels (int): Number of input channels.
num_joints (int): Number of joints.
+ num_deconv_layers (int): Number of deconv layers.
+ num_deconv_layers should be >= 0. Note that 0 means
+ no deconv layers.
+ num_deconv_filters (list|tuple): Number of filters.
+ If num_deconv_layers > 0, the length of num_deconv_filters
+ should equal num_deconv_layers.
+ num_deconv_kernels (list|tuple): Kernel sizes.
+ If num_deconv_layers > 0, the length of num_deconv_kernels
+ should equal num_deconv_layers.
tag_per_joint (bool): If tag_per_joint is True,
the dimension of tags equals to num_joints,
else the dimension of tags is 1. Default: True
@@ -20,6 +27,9 @@ class BottomUpSimpleHead(nn.Module):
def __init__(self,
in_channels,
num_joints,
+ num_deconv_layers=3,
+ num_deconv_filters=(256, 256, 256),
+ num_deconv_kernels=(4, 4, 4),
tag_per_joint=True,
with_ae_loss=None,
extra=None):
@@ -35,6 +45,18 @@ def __init__(self,
if extra is not None and not isinstance(extra, dict):
raise TypeError('extra should be dict or None.')
+ if num_deconv_layers > 0:
+ self.deconv_layers = self._make_deconv_layer(
+ num_deconv_layers,
+ num_deconv_filters,
+ num_deconv_kernels,
+ )
+ elif num_deconv_layers == 0:
+ self.deconv_layers = nn.Identity()
+ else:
+ raise ValueError(
+ f'num_deconv_layers ({num_deconv_layers}) should be >= 0.')
+
if extra is not None and 'final_conv_kernel' in extra:
assert extra['final_conv_kernel'] in [1, 3]
if extra['final_conv_kernel'] == 3:
@@ -46,9 +68,10 @@ def __init__(self,
kernel_size = 1
padding = 0
- self.final_layers = build_conv_layer(
+ self.final_layer = build_conv_layer(
cfg=dict(type='Conv2d'),
- in_channels=in_channels,
+ in_channels=num_deconv_filters[-1]
+ if num_deconv_layers > 0 else in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=1,
@@ -59,12 +82,67 @@ def forward(self, x):
if isinstance(x, list):
x = x[0]
final_outputs = []
- y = self.final_layers(x)
+ x = self.deconv_layers(x)
+ y = self.final_layer(x)
final_outputs.append(y)
return final_outputs
+ def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+ """Make deconv layers."""
+ if num_layers != len(num_filters):
+ error_msg = f'num_layers({num_layers}) ' \
+ f'!= length of num_filters({len(num_filters)})'
+ raise ValueError(error_msg)
+ if num_layers != len(num_kernels):
+ error_msg = f'num_layers({num_layers}) ' \
+ f'!= length of num_kernels({len(num_kernels)})'
+ raise ValueError(error_msg)
+
+ layers = []
+ for i in range(num_layers):
+ kernel, padding, output_padding = \
+ self._get_deconv_cfg(num_kernels[i])
+
+ planes = num_filters[i]
+ layers.append(
+ build_upsample_layer(
+ dict(type='deconv'),
+ in_channels=self.in_channels,
+ out_channels=planes,
+ kernel_size=kernel,
+ stride=2,
+ padding=padding,
+ output_padding=output_padding,
+ bias=False))
+ layers.append(nn.BatchNorm2d(planes))
+ layers.append(nn.ReLU(inplace=True))
+ self.in_channels = planes
+
+ return nn.Sequential(*layers)
+
+ def _get_deconv_cfg(self, deconv_kernel):
+ """Get configurations for deconv layers."""
+ if deconv_kernel == 4:
+ padding = 1
+ output_padding = 0
+ elif deconv_kernel == 3:
+ padding = 1
+ output_padding = 1
+ elif deconv_kernel == 2:
+ padding = 0
+ output_padding = 0
+ else:
+ raise ValueError(f'Not supported num_kernels ({deconv_kernel}).')
+
+ return deconv_kernel, padding, output_padding
+
def init_weights(self):
"""Initialize model weights."""
- for m in self.final_layers.modules():
+ for m in self.deconv_layers.modules():
+ if isinstance(m, nn.ConvTranspose2d):
+ normal_init(m, std=0.001)
+ elif isinstance(m, nn.BatchNorm2d):
+ constant_init(m, 1)
+ for m in self.final_layer.modules():
if isinstance(m, nn.Conv2d):
normal_init(m, std=0.001, bias=0)
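
A sanity check on `_get_deconv_cfg`: with stride 2, every supported kernel size yields exact 2x upsampling, since `ConvTranspose2d` computes out = (in - 1) * stride - 2 * padding + kernel + output_padding. A runnable sketch over the three (kernel, padding, output_padding) triples the method returns:

```python
import torch
import torch.nn as nn

for kernel, padding, output_padding in [(4, 1, 0), (3, 1, 1), (2, 0, 0)]:
    deconv = nn.ConvTranspose2d(
        8, 8, kernel, stride=2, padding=padding,
        output_padding=output_padding, bias=False)
    out = deconv(torch.randn(1, 8, 16, 16))
    assert out.shape[-2:] == (32, 32)  # exact doubling for every kernel
```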
diff --git a/tests/test_model/test_bottom_up_forward.py b/tests/test_model/test_bottom_up_forward.py
index 61d1d027a5..6207104ccc 100644
--- a/tests/test_model/test_bottom_up_forward.py
+++ b/tests/test_model/test_bottom_up_forward.py
@@ -13,8 +13,10 @@ def test_bottomup_forward():
type='BottomUpSimpleHead',
in_channels=512,
num_joints=17,
+ num_deconv_layers=0,
+ tag_per_joint=True,
with_ae_loss=[True],
- extra={'final_conv_kernel': 3}),
+ extra=dict(final_conv_kernel=1, )),
train_cfg=dict(),
test_cfg=dict(
num_joints=17,
diff --git a/tests/test_model/test_bottom_up_head.py b/tests/test_model/test_bottom_up_head.py
index e96abd31ba..dc2b95b4fe 100644
--- a/tests/test_model/test_bottom_up_head.py
+++ b/tests/test_model/test_bottom_up_head.py
@@ -25,22 +25,23 @@ def test_bottom_up_simple_head():
with_ae_loss=[True],
extra={'final_conv_kernel': 3})
head.init_weights()
- assert head.final_layers.padding == (1, 1)
+ assert head.final_layer.padding == (1, 1)
head = BottomUpSimpleHead(
in_channels=512,
num_joints=17,
with_ae_loss=[True],
extra={'final_conv_kernel': 1})
head.init_weights()
- assert head.final_layers.padding == (0, 0)
+ assert head.final_layer.padding == (0, 0)
head = BottomUpSimpleHead(
in_channels=512, num_joints=17, with_ae_loss=[True])
head.init_weights()
- assert head.final_layers.padding == (0, 0)
+ assert head.final_layer.padding == (0, 0)
# test with_ae_loss
head = BottomUpSimpleHead(
in_channels=512,
num_joints=17,
+ num_deconv_layers=0,
with_ae_loss=[True],
extra={'final_conv_kernel': 3})
head.init_weights()
@@ -51,6 +52,7 @@ def test_bottom_up_simple_head():
head = BottomUpSimpleHead(
in_channels=512,
num_joints=17,
+ num_deconv_layers=0,
with_ae_loss=[False],
extra={'final_conv_kernel': 3})
head.init_weights()
@@ -62,6 +64,7 @@ def test_bottom_up_simple_head():
head = BottomUpSimpleHead(
in_channels=512,
num_joints=17,
+ num_deconv_layers=0,
tag_per_joint=False,
with_ae_loss=[False],
extra={'final_conv_kernel': 3})
@@ -73,6 +76,7 @@ def test_bottom_up_simple_head():
head = BottomUpSimpleHead(
in_channels=512,
num_joints=17,
+ num_deconv_layers=0,
tag_per_joint=False,
with_ae_loss=[True],
extra={'final_conv_kernel': 3})
@@ -84,6 +88,7 @@ def test_bottom_up_simple_head():
head = BottomUpSimpleHead(
in_channels=512,
num_joints=17,
+ num_deconv_layers=0,
tag_per_joint=False,
with_ae_loss=[True],
extra={'final_conv_kernel': 3})
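
The test variations above toggle `tag_per_joint` and `with_ae_loss`, which determine the final conv's `out_channels` in head code not shown in this diff. A hedged reconstruction of that rule, stated as an assumption rather than the library's confirmed behavior:

```python
def expected_out_channels(num_joints, tag_per_joint, with_ae_loss):
    """Assumed rule: heatmaps plus per-joint tags, or one shared tag map."""
    if with_ae_loss[0]:
        return num_joints * 2 if tag_per_joint else num_joints + 1
    return num_joints

assert expected_out_channels(17, tag_per_joint=True, with_ae_loss=[True]) == 34
assert expected_out_channels(17, tag_per_joint=False, with_ae_loss=[True]) == 18
assert expected_out_channels(17, tag_per_joint=False, with_ae_loss=[False]) == 17
```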