diff --git a/configs/top_down/resnet/README.md b/configs/top_down/resnet/README.md index 003ba6c2ef..9e4b3f27a3 100644 --- a/configs/top_down/resnet/README.md +++ b/configs/top_down/resnet/README.md @@ -25,6 +25,22 @@ | [pose_resnet_152](/configs/top_down/resnet/coco/res152_coco_384x288.py) | 384x288 | 0.750 | 0.908 | 0.821 | 0.800 | 0.942 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res152_coco_384x288-3860d4c9_20200709.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res152_coco_384x288_20200709.log.json) | + +### Results on OCHuman test dataset with ground-truth bounding boxes + +Following the common setting, the models are trained on the COCO train set and evaluated on the OCHuman dataset. + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------- | :-----------: | :------: | :------: | :------: | :------: | :------: |:------: |:------: | +| [pose_resnet_50](/configs/top_down/resnet/coco/res50_coco_256x192.py) | 256x192 | 0.546 | 0.726 | 0.593 | 0.592 | 0.755 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res50_coco_256x192-ec54d7f3_20200709.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res50_coco_256x192_20200709.log.json) | +| [pose_resnet_50](/configs/top_down/resnet/coco/res50_coco_384x288.py) | 384x288 | 0.539 | 0.723 | 0.574 | 0.588 | 0.756 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res50_coco_384x288-e6f795e9_20200709.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res50_coco_384x288_20200709.log.json) | +| [pose_resnet_101](/configs/top_down/resnet/coco/res101_coco_256x192.py) | 256x192 | 0.559 | 0.724 | 0.606 | 0.605 | 0.751 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res101_coco_256x192-6e6babf0_20200708.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res101_coco_256x192_20200708.log.json) | +| [pose_resnet_101](/configs/top_down/resnet/coco/res101_coco_384x288.py) | 384x288 | 0.571 | 0.715 | 0.615 | 0.615 | 0.748 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res101_coco_384x288-8c71bdc9_20200709.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res101_coco_384x288_20200709.log.json) | +| [pose_resnet_152](/configs/top_down/resnet/coco/res152_coco_256x192.py) | 256x192 | 0.570 | 0.725 | 0.617 | 0.616 | 0.754 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res152_coco_256x192-f6e307c2_20200709.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res152_coco_256x192_20200709.log.json) | +| [pose_resnet_152](/configs/top_down/resnet/coco/res152_coco_384x288.py) | 384x288 | 0.582 | 0.723 | 0.627 | 0.627 | 0.752 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res152_coco_384x288-3860d4c9_20200709.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/top_down/resnet/res152_coco_384x288_20200709.log.json) | + + + ### Results on MPII val set.
| Arch | Input Size | Mean | Mean@0.1 | ckpt | log | diff --git a/configs/top_down/resnet/ochuman/res101_ochuman_256x192.py b/configs/top_down/resnet/ochuman/res101_ochuman_256x192.py new file mode 100755 index 0000000000..8862b809b8 --- /dev/null +++ b/configs/top_down/resnet/ochuman/res101_ochuman_256x192.py @@ -0,0 +1,146 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/resnet101-5d3b4d8f.pth', + backbone=dict(type='ResNet', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/ochuman' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file='data/coco/annotations/person_keypoints_train2017.json', + img_prefix='data/coco//train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownOCHumanDataset', + 
ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_test_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/top_down/resnet/ochuman/res101_ochuman_384x288.py b/configs/top_down/resnet/ochuman/res101_ochuman_384x288.py new file mode 100755 index 0000000000..6addb9d1ab --- /dev/null +++ b/configs/top_down/resnet/ochuman/res101_ochuman_384x288.py @@ -0,0 +1,146 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/resnet101-5d3b4d8f.pth', + backbone=dict(type='ResNet', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 
'data/ochuman' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file='data/coco/annotations/person_keypoints_train2017.json', + img_prefix='data/coco//train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_test_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/top_down/resnet/ochuman/res152_ochuman_256x192.py b/configs/top_down/resnet/ochuman/res152_ochuman_256x192.py new file mode 100755 index 0000000000..094651f47a --- /dev/null +++ b/configs/top_down/resnet/ochuman/res152_ochuman_256x192.py @@ -0,0 +1,146 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/resnet152-b121ed2d.pth', + backbone=dict(type='ResNet', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + 
dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/ochuman' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file='data/coco/annotations/person_keypoints_train2017.json', + img_prefix='data/coco//train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_test_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/top_down/resnet/ochuman/res152_ochuman_384x288.py b/configs/top_down/resnet/ochuman/res152_ochuman_384x288.py new file mode 100755 index 0000000000..a63c33d182 --- /dev/null +++ b/configs/top_down/resnet/ochuman/res152_ochuman_384x288.py @@ -0,0 +1,146 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/resnet152-b121ed2d.pth', + backbone=dict(type='ResNet', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + 
dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/ochuman' +data = dict( + samples_per_gpu=48, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file='data/coco/annotations/person_keypoints_train2017.json', + img_prefix='data/coco//train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_test_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/top_down/resnet/ochuman/res50_ochuman_256x192.py b/configs/top_down/resnet/ochuman/res50_ochuman_256x192.py new file mode 100644 index 0000000000..5d14360ef1 --- /dev/null +++ b/configs/top_down/resnet/ochuman/res50_ochuman_256x192.py @@ -0,0 +1,146 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=1, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/resnet50-19c8e357.pth', + backbone=dict(type='ResNet', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + 
type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/ochuman' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file='data/coco/annotations/person_keypoints_train2017.json', + img_prefix='data/coco//train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_test_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/configs/top_down/resnet/ochuman/res50_ochuman_384x288.py b/configs/top_down/resnet/ochuman/res50_ochuman_384x288.py new file mode 100755 index 0000000000..ddd6cf69dc --- /dev/null +++ b/configs/top_down/resnet/ochuman/res50_ochuman_384x288.py @@ -0,0 +1,146 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/resnet50-19c8e357.pth', + backbone=dict(type='ResNet', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + 
nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/ochuman' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file='data/coco/annotations/person_keypoints_train2017.json', + img_prefix='data/coco//train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_val_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownOCHumanDataset', + ann_file=f'{data_root}/annotations/' + 'ochuman_coco_format_test_range_0.00_1.00.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) diff --git a/docs/getting_started.md b/docs/getting_started.md index 871d4e8bd0..7485b0e6f5 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -64,6 +64,29 @@ mmpose ``` +**For OCHuman data**, please download the images and annotations from [OCHuman](https://github.com/liruilong940607/OCHumanApi). +Move them under $MMPOSE/data, and make them look like this: + +``` +mmpose +├── mmpose +├── docs +├── tests +├── tools +├── configs +`── data + │── ochuman + │-- annotations + │ │-- ochuman_coco_format_val_range_0.00_1.00.json + │ │-- ochuman_coco_format_test_range_0.00_1.00.json + │-- images + │-- 000001.jpg + │-- 000002.jpg + │-- 000003.jpg + │-- ... + +``` + **For MPII-TRB data**, please download from [MPII Human Pose Dataset](http://human-pose.mpi-inf.mpg.de/). Please download the annotation files from [mpii_trb_annotations](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmpose/datasets/mpii_trb_annotations.tar).
Extract them under {MMPose}/data, and make them look like this: diff --git a/mmpose/datasets/__init__.py b/mmpose/datasets/__init__.py index bb4f926cb9..aba648c7d1 100644 --- a/mmpose/datasets/__init__.py +++ b/mmpose/datasets/__init__.py @@ -1,13 +1,14 @@ from .builder import build_dataloader, build_dataset from .datasets import (BottomUpCocoDataset, TopDownCocoDataset, TopDownMpiiDataset, TopDownMpiiTrbDataset, - TopDownOneHand10KDataset) + TopDownOCHumanDataset, TopDownOneHand10KDataset) from .pipelines import Compose from .registry import DATASETS, PIPELINES from .samplers import DistributedSampler __all__ = [ 'TopDownCocoDataset', 'BottomUpCocoDataset', 'TopDownMpiiTrbDataset', - 'TopDownOneHand10KDataset', 'TopDownMpiiDataset', 'build_dataloader', - 'build_dataset', 'Compose', 'DistributedSampler', 'DATASETS', 'PIPELINES' + 'TopDownOneHand10KDataset', 'TopDownMpiiDataset', 'TopDownOCHumanDataset', + 'build_dataloader', 'build_dataset', 'Compose', 'DistributedSampler', + 'DATASETS', 'PIPELINES' ] diff --git a/mmpose/datasets/datasets/__init__.py b/mmpose/datasets/datasets/__init__.py index 645f8005a5..50dc58578d 100644 --- a/mmpose/datasets/datasets/__init__.py +++ b/mmpose/datasets/datasets/__init__.py @@ -1,8 +1,10 @@ from .bottom_up import BottomUpCocoDataset from .top_down import (TopDownCocoDataset, TopDownMpiiDataset, - TopDownMpiiTrbDataset, TopDownOneHand10KDataset) + TopDownMpiiTrbDataset, TopDownOCHumanDataset, + TopDownOneHand10KDataset) __all__ = [ 'TopDownCocoDataset', 'BottomUpCocoDataset', 'TopDownMpiiDataset', - 'TopDownMpiiTrbDataset', 'TopDownOneHand10KDataset' + 'TopDownMpiiTrbDataset', 'TopDownOneHand10KDataset', + 'TopDownOCHumanDataset' ] diff --git a/mmpose/datasets/datasets/top_down/__init__.py b/mmpose/datasets/datasets/top_down/__init__.py index fd9fc77ee8..72d6c6b6d1 100644 --- a/mmpose/datasets/datasets/top_down/__init__.py +++ b/mmpose/datasets/datasets/top_down/__init__.py @@ -1,9 +1,10 @@ from .topdown_coco_dataset import TopDownCocoDataset from .topdown_mpii_dataset import TopDownMpiiDataset from .topdown_mpii_trb_dataset import TopDownMpiiTrbDataset +from .topdown_ochuman_dataset import TopDownOCHumanDataset from .topdown_onehand10k_dataset import TopDownOneHand10KDataset __all__ = [ 'TopDownCocoDataset', 'TopDownMpiiTrbDataset', 'TopDownMpiiDataset', - 'TopDownOneHand10KDataset' + 'TopDownOneHand10KDataset', 'TopDownOCHumanDataset' ] diff --git a/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py b/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py index 8d5873f9d3..eddbdedec5 100644 --- a/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_coco_dataset.py @@ -16,6 +16,10 @@ class TopDownCocoDataset(TopDownBaseDataset): """CocoDataset dataset for top-down pose estimation. + `Microsoft COCO: Common Objects in Context' ECCV'2014 + More details can be found in the `paper + `_ . + The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. diff --git a/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py index ec7381e052..dc6ea1fa85 100644 --- a/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_mpii_dataset.py @@ -13,6 +13,10 @@ class TopDownMpiiDataset(TopDownBaseDataset): """MPII Dataset for top-down pose estimation. 
+ `2D Human Pose Estimation: New Benchmark and State of the Art Analysis' + CVPR'2014. More details can be found in the `paper + `_ . + The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. diff --git a/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py b/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py index 53286ef19d..f0c4260873 100644 --- a/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_mpii_trb_dataset.py @@ -14,6 +14,10 @@ class TopDownMpiiTrbDataset(TopDownBaseDataset): """MPII-TRB Dataset dataset for top-down pose estimation. + `TRB: A Novel Triplet Representation for Understanding 2D Human Body' + ICCV'2019. More details can be found in the `paper + `_ . + The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information. diff --git a/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py b/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py new file mode 100644 index 0000000000..5f3b55eeb3 --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_ochuman_dataset.py @@ -0,0 +1,270 @@ +import os +from collections import OrderedDict, defaultdict + +import numpy as np +from pycocotools.coco import COCO + +from ....core.post_processing import oks_nms, soft_oks_nms +from ...registry import DATASETS +from .topdown_coco_dataset import TopDownCocoDataset + + +def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + Returns: + id2name (dict): mapping image id to name. + name2id (dict): mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + +@DATASETS.register_module() +class TopDownOCHumanDataset(TopDownCocoDataset): + """OCHuman dataset for top-down pose estimation. + + `Pose2Seg: Detection Free Human Instance Segmentation' CVPR'2019 + More details can be found in the `paper + `_ . + + The "Occluded Human (OCHuman)" dataset contains 8110 heavily occluded + human instances within 4731 images. OCHuman is designed for validation + and testing only. To evaluate on OCHuman, the model should be trained + on the COCO train set and then tested on OCHuman to measure its + robustness to occlusion. + + OCHuman keypoint indexes (same as COCO):: + + 0: 'nose', + 1: 'left_eye', + 2: 'right_eye', + 3: 'left_ear', + 4: 'right_ear', + 5: 'left_shoulder', + 6: 'right_shoulder', + 7: 'left_elbow', + 8: 'right_elbow', + 9: 'left_wrist', + 10: 'right_wrist', + 11: 'left_hip', + 12: 'right_hip', + 13: 'left_knee', + 14: 'right_knee', + 15: 'left_ankle', + 16: 'right_ankle' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + test_mode (bool): Store True when building test or + validation dataset. Default: False.
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + test_mode=False): + super(TopDownCocoDataset, self).__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.image_thr = data_cfg['image_thr'] + + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + self.bbox_thr = data_cfg['bbox_thr'] + + self.ann_info['flip_pairs'] = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], + [11, 12], [13, 14], [15, 16]] + + self.ann_info['upper_body_ids'] = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + self.ann_info['lower_body_ids'] = (11, 12, 13, 14, 15, 16) + + self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array( + [ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, + 1.2, 1.5, 1.5 + ], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + cats = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict(zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict([(self._class_to_coco_ind[cls], + self._class_to_ind[cls]) + for cls in self.classes[1:]]) + self.image_set_index = self.coco.getImgIds() + self.num_images = len(self.image_set_index) + self.id2name, self.name2id = _get_mapping_id_name(self.coco.imgs) + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, index): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + index: coco image id + Returns: + db entry + """ + im_ann = self.coco.loadImgs(index)[0] + width = im_ann['width'] + height = im_ann['height'] + num_joints = self.ann_info['num_joints'] + + ann_ids = self.coco.getAnnIds(imgIds=index, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if obj['area'] > 0 and x2 >= x1 and y2 >= y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + rec = [] + for obj in objs: + if max(obj['keypoints']) == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[index]) + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': 'ochuman', + 'bbox_score': 1 + }) + + return rec + + def evaluate(self, outputs, res_folder, metric='mAP', **kwargs): + """Evaluate coco keypoint results. 
The pose prediction results will be + saved in `${res_folder}/result_keypoints.json`. + + Note: + num_keypoints: K + + Args: + outputs (list(preds, boxes, image_path)) + :preds (np.ndarray[1,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + :boxes (np.ndarray[1,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_path (list[str]): For example, [ '/', 'v','a', 'l', + '2', '0', '1', '7', '/', '0', '0', '0', '0', '0', + '0', '3', '9', '7', '1', '3', '3', '.', 'j', 'p', 'g'] + res_folder (str): Path of directory to save the results. + metric (str): Metric to be performed. Defaults: 'mAP'. + + Returns: + name_value (dict): Evaluation results for evaluation metric. + """ + assert metric == 'mAP' + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = defaultdict(list) + for preds, boxes, image_path in outputs: + str_image_path = ''.join(image_path) + image_id = self.name2id[os.path.basename(str_image_path)] + + kpts[image_id].append({ + 'keypoints': preds[0], + 'center': boxes[0][0:2], + 'scale': boxes[0][2:4], + 'area': boxes[0][4], + 'score': boxes[0][5], + 'image_id': image_id, + }) + + # rescoring and oks nms + num_joints = self.ann_info['num_joints'] + vis_thr = self.vis_thr + oks_thr = self.oks_thr + oks_nmsed_kpts = [] + for img in kpts.keys(): + img_kpts = kpts[img] + for n_p in img_kpts: + box_score = n_p['score'] + kpt_score = 0 + valid_num = 0 + for n_jt in range(0, num_joints): + t_s = n_p['keypoints'][n_jt][2] + if t_s > vis_thr: + kpt_score = kpt_score + t_s + valid_num = valid_num + 1 + if valid_num != 0: + kpt_score = kpt_score / valid_num + # rescoring + n_p['score'] = kpt_score * box_score + + if self.soft_nms: + keep = soft_oks_nms( + [img_kpts[i] for i in range(len(img_kpts))], oks_thr) + else: + keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))], + oks_thr) + + if len(keep) == 0: + oks_nmsed_kpts.append(img_kpts) + else: + oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep]) + + self._write_coco_keypoint_results(oks_nmsed_kpts, res_file) + + info_str = self._do_python_keypoint_eval(res_file) + name_value = OrderedDict(info_str) + + return name_value diff --git a/mmpose/datasets/datasets/top_down/topdown_onehand10k_dataset.py b/mmpose/datasets/datasets/top_down/topdown_onehand10k_dataset.py index f7549283d2..7e83bc8113 100644 --- a/mmpose/datasets/datasets/top_down/topdown_onehand10k_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_onehand10k_dataset.py @@ -14,6 +14,11 @@ class TopDownOneHand10KDataset(TopDownBaseDataset): """OneHand10K dataset for top-down hand pose estimation. + `Mask-pose Cascaded CNN for 2D Hand Pose Estimation from + Single Color Images' TCSVT'2019 + More details can be found in the `paper + `_ . + The dataset loads raw features and apply specified transforms to return a dict containing the image tensors and other information.
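Reviewer note (not part of the patch): a minimal sketch of how the new dataset registration could be smoke-tested, assuming `data/ochuman` is prepared as described in `docs/getting_started.md` above and that one of the new configs from this PR is used.

```python
# Sketch: build the OCHuman val split from one of the new configs and pull a sample.
# Assumes data/ochuman/ is laid out as described in docs/getting_started.md.
from mmcv import Config

from mmpose.datasets import build_dataset

cfg = Config.fromfile('configs/top_down/resnet/ochuman/res50_ochuman_256x192.py')

# test_mode=True mirrors how evaluation datasets are usually built for testing.
dataset = build_dataset(cfg.data.val, dict(test_mode=True))
print(f'{len(dataset)} OCHuman person instances loaded')

sample = dataset[0]
print(sample['img'].shape)  # expected roughly (3, 256, 192) after the val pipeline
```

Full evaluation of the released COCO-trained checkpoints against these configs would then go through the usual MMPose test script (for example `python tools/test.py <config> <checkpoint> --eval mAP`, assuming the standard test-script options), which should reproduce the AP/AR numbers added to the README table above.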