From 41db302dd0bc13ff730c479b354d8724d2c985c5 Mon Sep 17 00:00:00 2001
From: jin-s13
Date: Thu, 23 Jul 2020 17:10:02 +0800
Subject: [PATCH] add backbones

---
 .../alexnet/coco/alexnet_coco_256x192.py | 144 ++++++++
 .../coco/mobilenetv2_coco_256x192.py | 145 ++++++++
 .../coco/mobilenetv2_coco_384x288.py | 145 ++++++++
 .../coco/resnetv1d101_coco_256x192.py | 145 ++++++++
 .../coco/resnetv1d101_coco_384x288.py | 145 ++++++++
 .../coco/resnetv1d152_coco_256x192.py | 145 ++++++++
 .../coco/resnetv1d152_coco_384x288.py | 145 ++++++++
 .../coco/resnetv1d50_coco_256x192.py | 145 ++++++++
 .../coco/resnetv1d50_coco_384x288.py | 145 ++++++++
 .../resnext/coco/resnext101_coco_256x192.py | 145 ++++++++
 .../resnext/coco/resnext101_coco_384x288.py | 145 ++++++++
 .../resnext/coco/resnext152_coco_256x192.py | 145 ++++++++
 .../resnext/coco/resnext152_coco_384x288.py | 145 ++++++++
 .../resnext/coco/resnext50_coco_256x192.py | 145 ++++++++
 .../resnext/coco/resnext50_coco_384x288.py | 145 ++++++++
 .../seresnet/coco/seresnet101_coco_256x192.py | 145 ++++++++
 .../seresnet/coco/seresnet101_coco_384x288.py | 145 ++++++++
 .../seresnet/coco/seresnet152_coco_256x192.py | 144 ++++++++
 .../seresnet/coco/seresnet152_coco_384x288.py | 144 ++++++++
 .../seresnet/coco/seresnet50_coco_256x192.py | 145 ++++++++
 .../seresnet/coco/seresnet50_coco_384x288.py | 145 ++++++++
 .../coco/shufflenetv1_coco_256x192.py | 145 ++++++++
 mmpose/models/backbones/__init__.py | 26 +-
 mmpose/models/backbones/alexnet.py | 55 +++
 mmpose/models/backbones/base_backbone.py | 7 +-
 mmpose/models/backbones/hourglass.py | 4 +-
 mmpose/models/backbones/hrnet.py | 8 +-
 mmpose/models/backbones/mobilenet_v2.py | 269 +++++++++++++++
 mmpose/models/backbones/mobilenet_v3.py | 185 ++++++++++
 mmpose/models/backbones/regnet.py | 312 +++++++++++++++++
 mmpose/models/backbones/resnet.py | 15 +-
 mmpose/models/backbones/resnext.py | 147 ++++++++
 mmpose/models/backbones/scnet.py | 4 +-
 mmpose/models/backbones/seresnet.py | 124 +++++++
 mmpose/models/backbones/seresnext.py | 153 +++++++++
 mmpose/models/backbones/shufflenet_v1.py | 322 ++++++++++++++++++
 mmpose/models/backbones/shufflenet_v2.py | 295 ++++++++++++++++
 mmpose/models/backbones/utils/__init__.py | 10 +
 .../models/backbones/utils/channel_shuffle.py | 28 ++
 .../backbones/utils/inverted_residual.py | 117 +++++++
 .../models/backbones/utils/make_divisible.py | 24 ++
 mmpose/models/backbones/utils/se_layer.py | 53 +++
 mmpose/models/backbones/utils/utils.py | 50 +++
 mmpose/models/losses/mse_loss.py | 4 +-
 tests/test_backbones/test_alexnet.py | 20 ++
 tests/test_backbones/test_backbones_utils.py | 116 +++++++
 tests/test_backbones/test_mobilenet_v2.py | 256 ++++++++++++++
 tests/test_backbones/test_mobilenet_v3.py | 168 +++++++++
 tests/test_backbones/test_regnet.py | 91 +++++
 tests/test_backbones/test_resnext.py | 59 ++++
 tests/test_backbones/test_seresnet.py | 242 +++++++++++++
 tests/test_backbones/test_seresnext.py | 72 ++++
 tests/test_backbones/test_shufflenet_v1.py | 244 +++++++++++++
 tests/test_backbones/test_shufflenet_v2.py | 203 +++++++++++
 54 files changed, 6848 insertions(+), 22 deletions(-)
 create mode 100755 configs/top_down/alexnet/coco/alexnet_coco_256x192.py
 create mode 100755 configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_256x192.py
 create mode 100755 configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_384x288.py
 create mode 100755 configs/top_down/resnetv1d/coco/resnetv1d101_coco_256x192.py
 create mode 100755 configs/top_down/resnetv1d/coco/resnetv1d101_coco_384x288.py
 create mode 100755
configs/top_down/resnetv1d/coco/resnetv1d152_coco_256x192.py create mode 100755 configs/top_down/resnetv1d/coco/resnetv1d152_coco_384x288.py create mode 100644 configs/top_down/resnetv1d/coco/resnetv1d50_coco_256x192.py create mode 100755 configs/top_down/resnetv1d/coco/resnetv1d50_coco_384x288.py create mode 100755 configs/top_down/resnext/coco/resnext101_coco_256x192.py create mode 100755 configs/top_down/resnext/coco/resnext101_coco_384x288.py create mode 100755 configs/top_down/resnext/coco/resnext152_coco_256x192.py create mode 100755 configs/top_down/resnext/coco/resnext152_coco_384x288.py create mode 100644 configs/top_down/resnext/coco/resnext50_coco_256x192.py create mode 100755 configs/top_down/resnext/coco/resnext50_coco_384x288.py create mode 100755 configs/top_down/seresnet/coco/seresnet101_coco_256x192.py create mode 100755 configs/top_down/seresnet/coco/seresnet101_coco_384x288.py create mode 100755 configs/top_down/seresnet/coco/seresnet152_coco_256x192.py create mode 100755 configs/top_down/seresnet/coco/seresnet152_coco_384x288.py create mode 100644 configs/top_down/seresnet/coco/seresnet50_coco_256x192.py create mode 100755 configs/top_down/seresnet/coco/seresnet50_coco_384x288.py create mode 100755 configs/top_down/shufflenet_v1/coco/shufflenetv1_coco_256x192.py create mode 100644 mmpose/models/backbones/alexnet.py create mode 100644 mmpose/models/backbones/mobilenet_v2.py create mode 100644 mmpose/models/backbones/mobilenet_v3.py create mode 100644 mmpose/models/backbones/regnet.py create mode 100644 mmpose/models/backbones/resnext.py create mode 100644 mmpose/models/backbones/seresnet.py create mode 100644 mmpose/models/backbones/seresnext.py create mode 100644 mmpose/models/backbones/shufflenet_v1.py create mode 100644 mmpose/models/backbones/shufflenet_v2.py create mode 100644 mmpose/models/backbones/utils/__init__.py create mode 100644 mmpose/models/backbones/utils/channel_shuffle.py create mode 100644 mmpose/models/backbones/utils/inverted_residual.py create mode 100644 mmpose/models/backbones/utils/make_divisible.py create mode 100644 mmpose/models/backbones/utils/se_layer.py create mode 100644 mmpose/models/backbones/utils/utils.py create mode 100644 tests/test_backbones/test_alexnet.py create mode 100644 tests/test_backbones/test_backbones_utils.py create mode 100644 tests/test_backbones/test_mobilenet_v2.py create mode 100644 tests/test_backbones/test_mobilenet_v3.py create mode 100644 tests/test_backbones/test_regnet.py create mode 100644 tests/test_backbones/test_resnext.py create mode 100644 tests/test_backbones/test_seresnet.py create mode 100644 tests/test_backbones/test_seresnext.py create mode 100644 tests/test_backbones/test_shufflenet_v1.py create mode 100644 tests/test_backbones/test_shufflenet_v2.py diff --git a/configs/top_down/alexnet/coco/alexnet_coco_256x192.py b/configs/top_down/alexnet/coco/alexnet_coco_256x192.py new file mode 100755 index 0000000000..9de0f18d62 --- /dev/null +++ b/configs/top_down/alexnet/coco/alexnet_coco_256x192.py @@ -0,0 +1,144 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=10, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + 
dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict(type='AlexNet', num_classes=-1), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=256, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[40, 56], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_256x192.py b/configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_256x192.py new file mode 100755 index 0000000000..b50dc55798 --- /dev/null +++ b/configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=10, metric='mAP') + +optimizer = dict( + type='Adam', + 
lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'mobilenet_v2_batch256_20200708-3b2dc3af.pth', + backbone=dict(type='MobileNetV2', widen_factor=1., out_indices=(7, )), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=1280, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_384x288.py b/configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_384x288.py new file mode 100755 index 0000000000..0f1b397c97 
--- /dev/null +++ b/configs/top_down/mobilenet_v2/coco/mobilenetv2_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=10, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'mobilenet_v2_batch256_20200708-3b2dc3af.pth', + backbone=dict(type='MobileNetV2', widen_factor=1., out_indices=(7, )), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=1280, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + 
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnetv1d/coco/resnetv1d101_coco_256x192.py b/configs/top_down/resnetv1d/coco/resnetv1d101_coco_256x192.py new file mode 100755 index 0000000000..9100824db3 --- /dev/null +++ b/configs/top_down/resnetv1d/coco/resnetv1d101_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnetv1d101_batch256_20200708-9cb302ef.pth', + backbone=dict(type='ResNetV1d', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + 
img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnetv1d/coco/resnetv1d101_coco_384x288.py b/configs/top_down/resnetv1d/coco/resnetv1d101_coco_384x288.py new file mode 100755 index 0000000000..a307708d0e --- /dev/null +++ b/configs/top_down/resnetv1d/coco/resnetv1d101_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnetv1d101_batch256_20200708-9cb302ef.pth', + backbone=dict(type='ResNetV1d', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 
'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnetv1d/coco/resnetv1d152_coco_256x192.py b/configs/top_down/resnetv1d/coco/resnetv1d152_coco_256x192.py new file mode 100755 index 0000000000..8da0fb0027 --- /dev/null +++ b/configs/top_down/resnetv1d/coco/resnetv1d152_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnetv1d152_batch256_20200708-e79cb6a2.pth', + backbone=dict(type='ResNetV1d', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 
'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnetv1d/coco/resnetv1d152_coco_384x288.py b/configs/top_down/resnetv1d/coco/resnetv1d152_coco_384x288.py new file mode 100755 index 0000000000..de7c21f603 --- /dev/null +++ b/configs/top_down/resnetv1d/coco/resnetv1d152_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnetv1d152_batch256_20200708-e79cb6a2.pth', + backbone=dict(type='ResNetV1d', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + 
dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=48, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnetv1d/coco/resnetv1d50_coco_256x192.py b/configs/top_down/resnetv1d/coco/resnetv1d50_coco_256x192.py new file mode 100644 index 0000000000..4a8da1fb16 --- /dev/null +++ b/configs/top_down/resnetv1d/coco/resnetv1d50_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=1, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnetv1d50_batch256_20200708-1ad0ce94.pth', + backbone=dict(type='ResNetV1d', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + 
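# Illustrative sketch (not part of the patch): every data_cfg in these
# configs pairs image_size with a heatmap_size at 1/4 resolution, e.g.
# [192, 256] -> [48, 64] and [288, 384] -> [72, 96]; the AlexNet config is
# the one exception ([40, 56]) because of that backbone's different
# downsampling. The helper name below is hypothetical:
def heatmap_size_for(image_size, stride=4):
    # Integer-divide each spatial dimension by the output stride.
    return [s // stride for s in image_size]

assert heatmap_size_for([192, 256]) == [48, 64]
assert heatmap_size_for([288, 384]) == [72, 96]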
+train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnetv1d/coco/resnetv1d50_coco_384x288.py b/configs/top_down/resnetv1d/coco/resnetv1d50_coco_384x288.py new file mode 100755 index 0000000000..4935a1f233 --- /dev/null +++ b/configs/top_down/resnetv1d/coco/resnetv1d50_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnetv1d50_batch256_20200708-1ad0ce94.pth', + backbone=dict(type='ResNetV1d', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + 
dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnext/coco/resnext101_coco_256x192.py b/configs/top_down/resnext/coco/resnext101_coco_256x192.py new file mode 100755 index 0000000000..35dd70f6d0 --- /dev/null +++ b/configs/top_down/resnext/coco/resnext101_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnext101_32x4d_batch256_20200708-87f2d1c9.pth', + backbone=dict(type='ResNeXt', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + 
shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnext/coco/resnext101_coco_384x288.py b/configs/top_down/resnext/coco/resnext101_coco_384x288.py new file mode 100755 index 0000000000..5cfe81fe0e --- /dev/null +++ b/configs/top_down/resnext/coco/resnext101_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' 
+ 'resnext101_32x4d_batch256_20200708-87f2d1c9.pth', + backbone=dict(type='ResNeXt', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnext/coco/resnext152_coco_256x192.py b/configs/top_down/resnext/coco/resnext152_coco_256x192.py new file mode 100755 index 0000000000..ef20c7338f --- /dev/null +++ b/configs/top_down/resnext/coco/resnext152_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + 
num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnext152_32x4d_batch256_20200708-aab5034c.pth', + backbone=dict(type='ResNeXt', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnext/coco/resnext152_coco_384x288.py b/configs/top_down/resnext/coco/resnext152_coco_384x288.py new file mode 100755 index 0000000000..76f384f94c --- /dev/null +++ b/configs/top_down/resnext/coco/resnext152_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) 
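# Illustrative sketch (not part of the patch): the lr_config shared by all
# of these configs steps the learning rate down at epochs 170 and 200 over
# a 210-epoch run, after a 500-iter linear warmup. Assuming mmcv's default
# 10x step decay and ignoring the warmup phase:
def lr_at(epoch, base_lr=5e-4, steps=(170, 200), gamma=0.1):
    # Apply one decay factor per milestone already passed.
    return base_lr * gamma ** sum(epoch >= s for s in steps)

assert lr_at(0) == 5e-4
assert abs(lr_at(175) - 5e-5) < 1e-12
assert abs(lr_at(205) - 5e-6) < 1e-12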
+# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnext152_32x4d_batch256_20200708-aab5034c.pth', + backbone=dict(type='ResNeXt', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=48, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnext/coco/resnext50_coco_256x192.py b/configs/top_down/resnext/coco/resnext50_coco_256x192.py new file mode 100644 index 0000000000..dba4fbf85f --- /dev/null +++ b/configs/top_down/resnext/coco/resnext50_coco_256x192.py @@ -0,0 +1,145 
@@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=1, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnext50_32x4d_batch256_20200708-c07adbb7.pth', + backbone=dict(type='ResNeXt', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, 
+ pipeline=valid_pipeline), +) diff --git a/configs/top_down/resnext/coco/resnext50_coco_384x288.py b/configs/top_down/resnext/coco/resnext50_coco_384x288.py new file mode 100755 index 0000000000..19e70e8c52 --- /dev/null +++ b/configs/top_down/resnext/coco/resnext50_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'resnext50_32x4d_batch256_20200708-c07adbb7.pth', + backbone=dict(type='ResNeXt', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + 
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/seresnet/coco/seresnet101_coco_256x192.py b/configs/top_down/seresnet/coco/seresnet101_coco_256x192.py new file mode 100755 index 0000000000..42447c1750 --- /dev/null +++ b/configs/top_down/seresnet/coco/seresnet101_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'se-resnet101_batch256_20200708-038a4d04.pth', + backbone=dict(type='SEResNet', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' 
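+# Note: heatmap_size is image_size / 4 in each dimension ([192, 256] ->
+# [48, 64]), and the Gaussian target sigma above follows the input
+# resolution: the 256x192 configs in this patch use sigma=2, while the
+# 384x288 configs use sigma=3.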
+data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/seresnet/coco/seresnet101_coco_384x288.py b/configs/top_down/seresnet/coco/seresnet101_coco_384x288.py new file mode 100755 index 0000000000..2143596e39 --- /dev/null +++ b/configs/top_down/seresnet/coco/seresnet101_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'se-resnet101_batch256_20200708-038a4d04.pth', + backbone=dict(type='SEResNet', depth=101), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), 
+ dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/seresnet/coco/seresnet152_coco_256x192.py b/configs/top_down/seresnet/coco/seresnet152_coco_256x192.py new file mode 100755 index 0000000000..fd05fb7c86 --- /dev/null +++ b/configs/top_down/seresnet/coco/seresnet152_coco_256x192.py @@ -0,0 +1,144 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict(type='SEResNet', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + 
meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/seresnet/coco/seresnet152_coco_384x288.py b/configs/top_down/seresnet/coco/seresnet152_coco_384x288.py new file mode 100755 index 0000000000..5bef85e7fd --- /dev/null +++ b/configs/top_down/seresnet/coco/seresnet152_coco_384x288.py @@ -0,0 +1,144 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict(type='SEResNet', depth=152), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + 
dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=48, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/seresnet/coco/seresnet50_coco_256x192.py b/configs/top_down/seresnet/coco/seresnet50_coco_256x192.py new file mode 100644 index 0000000000..68f1939d8e --- /dev/null +++ b/configs/top_down/seresnet/coco/seresnet50_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=1, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'se-resnet50_batch256_20200708-657b3c36.pth', + backbone=dict(type='SEResNet', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=True, + image_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + 
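+# Note: this config sets use_gt_bbox=True, so evaluation crops should be
+# taken from the ground-truth person boxes and the detector results in
+# bbox_file above are then not read; the configs that set use_gt_bbox=False
+# evaluate on the detected boxes instead.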
+train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/seresnet/coco/seresnet50_coco_384x288.py b/configs/top_down/seresnet/coco/seresnet50_coco_384x288.py new file mode 100755 index 0000000000..b31d0490db --- /dev/null +++ b/configs/top_down/seresnet/coco/seresnet50_coco_384x288.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=5, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'se-resnet50_batch256_20200708-657b3c36.pth', + backbone=dict(type='SEResNet', depth=50), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=2048, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[288, 384], + heatmap_size=[72, 96], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + 
dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=3), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/configs/top_down/shufflenet_v1/coco/shufflenetv1_coco_256x192.py b/configs/top_down/shufflenet_v1/coco/shufflenetv1_coco_256x192.py new file mode 100755 index 0000000000..c91beed4c0 --- /dev/null +++ b/configs/top_down/shufflenet_v1/coco/shufflenetv1_coco_256x192.py @@ -0,0 +1,145 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=10, metric='mAP') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained='models/pytorch/imagenet/' + 'shufflenet_v1_batch1024_20200708-7a087432.pth', + backbone=dict(type='ShuffleNetV1', groups=3), + keypoint_head=dict( + type='TopDownSimpleHead', + in_channels=960, + out_channels=channel_cfg['num_output_channels'], + ), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + 
post_process=True, + shift_heatmap=True, + unbiased_decoding=False, + modulate_kernel=11), + loss_pose=dict(type='JointsMSELoss', use_target_weight=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + bbox_thr=1.0, + use_gt_bbox=False, + image_thr=0.0, + bbox_file='data/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=2, + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=valid_pipeline), +) diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index 3231e4ae4c..c2bf545fe2 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -1,6 +1,30 @@ +from .alexnet import AlexNet from .hourglass import HourglassNet from .hrnet import HRNet +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v3 import MobileNetV3 +from .regnet import RegNet from .resnet import ResNet, ResNetV1d +from .resnext import ResNeXt from .scnet import SCNet +from .seresnet import SEResNet +from .seresnext import SEResNeXt +from .shufflenet_v1 import ShuffleNetV1 +from .shufflenet_v2 import ShuffleNetV2 -__all__ = ['HourglassNet', 'HRNet', 'ResNet', 'ResNetV1d', 'SCNet'] +__all__ = [ + 'AlexNet', + 'HourglassNet', + 'HRNet', + 'MobileNetV2', + 'MobileNetV3', + 'RegNet', + 'ResNet', + 'ResNetV1d', + 'ResNeXt', + 'SCNet', + 'SEResNet', + 'SEResNeXt', + 'ShuffleNetV1', + 'ShuffleNetV2', +] diff --git a/mmpose/models/backbones/alexnet.py b/mmpose/models/backbones/alexnet.py new file mode 100644 index 0000000000..c4bf0b905e --- /dev/null +++ b/mmpose/models/backbones/alexnet.py @@ -0,0 +1,55 @@ 
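+# A minimal usage sketch (an editor's assumption, not part of the original
+# file; it assumes AlexNet is re-exported from mmpose.models as the
+# __init__.py change above suggests): with the default num_classes=-1 the
+# backbone acts as a feature extractor and returns the raw conv feature map,
+# e.g. for a 224x224 input:
+#
+#     >>> import torch
+#     >>> from mmpose.models import AlexNet
+#     >>> model = AlexNet()
+#     >>> feat = model(torch.rand(1, 3, 224, 224))
+#     >>> tuple(feat.shape)
+#     (1, 256, 6, 6)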
+import torch.nn as nn
+
+from ..builder import BACKBONES
+from .base_backbone import BaseBackbone
+
+
+@BACKBONES.register_module()
+class AlexNet(BaseBackbone):
+    """AlexNet backbone.
+
+    The input for AlexNet is a 224x224 RGB image.
+
+    Args:
+        num_classes (int): Number of classes for classification.
+            The default value is -1, which uses the backbone as
+            a feature extractor without the top classifier.
+    """
+
+    def __init__(self, num_classes=-1):
+        super().__init__()
+        self.num_classes = num_classes
+        self.features = nn.Sequential(
+            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2),
+            nn.Conv2d(64, 192, kernel_size=5, padding=2),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2),
+            nn.Conv2d(192, 384, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(384, 256, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(256, 256, kernel_size=3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.MaxPool2d(kernel_size=3, stride=2),
+        )
+        if self.num_classes > 0:
+            self.classifier = nn.Sequential(
+                nn.Dropout(),
+                nn.Linear(256 * 6 * 6, 4096),
+                nn.ReLU(inplace=True),
+                nn.Dropout(),
+                nn.Linear(4096, 4096),
+                nn.ReLU(inplace=True),
+                nn.Linear(4096, num_classes),
+            )
+
+    def forward(self, x):
+        x = self.features(x)
+        if self.num_classes > 0:
+            x = x.view(x.size(0), 256 * 6 * 6)
+            x = self.classifier(x)
+
+        return x
diff --git a/mmpose/models/backbones/base_backbone.py b/mmpose/models/backbones/base_backbone.py
index f1d21382a4..b4b6317ffe 100644
--- a/mmpose/models/backbones/base_backbone.py
+++ b/mmpose/models/backbones/base_backbone.py
@@ -2,7 +2,8 @@
 from abc import ABCMeta, abstractmethod
 
 import torch.nn as nn
-from mmcv.runner import load_checkpoint
+
+from .utils import load_checkpoint
 
 
 class BaseBackbone(nn.Module, metaclass=ABCMeta):
@@ -13,7 +14,7 @@ class BaseBackbone(nn.Module, metaclass=ABCMeta):
     """
 
     def __init__(self):
-        super(BaseBackbone, self).__init__()
+        super().__init__()
 
     def init_weights(self, pretrained=None):
        """Init backbone weights.
@@ -50,4 +51,4 @@ def train(self, mode=True): Args: mode (bool): Whether it is train_mode or test_mode """ - super(BaseBackbone, self).train(mode) + super().train(mode) diff --git a/mmpose/models/backbones/hourglass.py b/mmpose/models/backbones/hourglass.py index e9324eaf06..76bd0df8b6 100644 --- a/mmpose/models/backbones/hourglass.py +++ b/mmpose/models/backbones/hourglass.py @@ -1,12 +1,12 @@ import torch.nn as nn from mmcv.cnn import ConvModule, constant_init, normal_init -from mmcv.runner import load_checkpoint from torch.nn.modules.batchnorm import _BatchNorm from mmpose.utils import get_root_logger from ..registry import BACKBONES from .base_backbone import BaseBackbone from .resnet import BasicBlock, ResLayer +from .utils import load_checkpoint class HourglassModule(nn.Module): @@ -118,7 +118,7 @@ def __init__(self, stage_blocks=(2, 2, 2, 2, 2, 4), feat_channel=256, norm_cfg=dict(type='BN', requires_grad=True)): - super(HourglassNet, self).__init__() + super().__init__() self.num_stacks = num_stacks assert self.num_stacks >= 1 diff --git a/mmpose/models/backbones/hrnet.py b/mmpose/models/backbones/hrnet.py index c0738ae3d1..4b2cf3401c 100644 --- a/mmpose/models/backbones/hrnet.py +++ b/mmpose/models/backbones/hrnet.py @@ -1,12 +1,12 @@ import torch.nn as nn from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, normal_init) -from mmcv.runner import load_checkpoint from torch.nn.modules.batchnorm import _BatchNorm from mmpose.utils import get_root_logger from ..registry import BACKBONES from .resnet import BasicBlock, Bottleneck, get_expansion +from .utils import load_checkpoint class HRModule(nn.Module): @@ -26,7 +26,7 @@ def __init__(self, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN')): - super(HRModule, self).__init__() + super().__init__() self._check_branches(num_branches, num_blocks, in_channels, num_channels) @@ -265,7 +265,7 @@ def __init__(self, norm_eval=False, with_cp=False, zero_init_residual=False): - super(HRNet, self).__init__() + super().__init__() self.extra = extra self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg @@ -537,7 +537,7 @@ def forward(self, x): def train(self, mode=True): """Convert the model into training mode.""" - super(HRNet, self).train(mode) + super().train(mode) if mode and self.norm_eval: for m in self.modules(): if isinstance(m, _BatchNorm): diff --git a/mmpose/models/backbones/mobilenet_v2.py b/mmpose/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000000..740fd1eb12 --- /dev/null +++ b/mmpose/models/backbones/mobilenet_v2.py @@ -0,0 +1,269 @@ +import logging + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..registry import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint, make_divisible + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. 
+            Default: dict(type='ReLU6').
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 stride,
+                 expand_ratio,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU6'),
+                 with_cp=False):
+        super().__init__()
+        self.stride = stride
+        assert stride in [1, 2], f'stride must be in [1, 2]. ' \
+            f'But received {stride}.'
+        self.with_cp = with_cp
+        self.use_res_connect = self.stride == 1 and in_channels == out_channels
+        hidden_dim = int(round(in_channels * expand_ratio))
+
+        layers = []
+        if expand_ratio != 1:
+            layers.append(
+                ConvModule(
+                    in_channels=in_channels,
+                    out_channels=hidden_dim,
+                    kernel_size=1,
+                    conv_cfg=conv_cfg,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+        layers.extend([
+            ConvModule(
+                in_channels=hidden_dim,
+                out_channels=hidden_dim,
+                kernel_size=3,
+                stride=stride,
+                padding=1,
+                groups=hidden_dim,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg),
+            ConvModule(
+                in_channels=hidden_dim,
+                out_channels=out_channels,
+                kernel_size=1,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=None)
+        ])
+        self.conv = nn.Sequential(*layers)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            if self.use_res_connect:
+                return x + self.conv(x)
+            else:
+                return self.conv(x)
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        return out
+
+
+@BACKBONES.register_module()
+class MobileNetV2(BaseBackbone):
+    """MobileNetV2 backbone.
+
+    Args:
+        widen_factor (float): Width multiplier, multiply number of
+            channels in each layer by this amount. Default: 1.0.
+        out_indices (None or Sequence[int]): Output from which stages.
+            Default: (7, ).
+        frozen_stages (int): Stages to be frozen (all param fixed).
+            Default: -1, which means not freezing any parameters.
+        conv_cfg (dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU6').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+    """
+
+    # Parameters to build layers. 4 parameters are needed to construct a
+    # layer, from left to right: expand_ratio, channel, num_blocks, stride.
+    arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2],
+                     [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2],
+                     [6, 320, 1, 1]]
+
+    def __init__(self,
+                 widen_factor=1.,
+                 out_indices=(7, ),
+                 frozen_stages=-1,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU6'),
+                 norm_eval=False,
+                 with_cp=False):
+        super().__init__()
+        self.widen_factor = widen_factor
+        for index in out_indices:
+            if index not in range(0, 8):
+                raise ValueError('the item in out_indices must be in '
+                                 f'range(0, 8). But received {index}')
+
+        if frozen_stages not in range(-1, 8):
+            raise ValueError('frozen_stages must be in range(-1, 8). '
+                             f'But received {frozen_stages}')
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.act_cfg = act_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        self.in_channels = make_divisible(32 * widen_factor, 8)
+
+        self.conv1 = ConvModule(
+            in_channels=3,
+            out_channels=self.in_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+        self.layers = []
+
+        for i, layer_cfg in enumerate(self.arch_settings):
+            expand_ratio, channel, num_blocks, stride = layer_cfg
+            out_channels = make_divisible(channel * widen_factor, 8)
+            inverted_res_layer = self.make_layer(
+                out_channels=out_channels,
+                num_blocks=num_blocks,
+                stride=stride,
+                expand_ratio=expand_ratio)
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, inverted_res_layer)
+            self.layers.append(layer_name)
+
+        if widen_factor > 1.0:
+            self.out_channel = int(1280 * widen_factor)
+        else:
+            self.out_channel = 1280
+
+        layer = ConvModule(
+            in_channels=self.in_channels,
+            out_channels=self.out_channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+        self.add_module('conv2', layer)
+        self.layers.append('conv2')
+
+    def make_layer(self, out_channels, num_blocks, stride, expand_ratio):
+        """Stack InvertedResidual blocks to build a layer for MobileNetV2.
+
+        Args:
+            out_channels (int): out_channels of block.
+            num_blocks (int): Number of blocks.
+            stride (int): Stride of the first block. Default: 1
+            expand_ratio (int): Expand the number of channels of the
+                hidden layer in InvertedResidual by this ratio. Default: 6.
+        """
+        layers = []
+        for i in range(num_blocks):
+            if i >= 1:
+                stride = 1
+            layers.append(
+                InvertedResidual(
+                    self.in_channels,
+                    out_channels,
+                    stride,
+                    expand_ratio=expand_ratio,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    with_cp=self.with_cp))
+            self.in_channels = out_channels
+
+        return nn.Sequential(*layers)
+
+    def init_weights(self, pretrained=None):
+        if isinstance(pretrained, str):
+            logger = logging.getLogger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        elif pretrained is None:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m)
+                elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
+                    constant_init(m, 1)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        x = self.conv1(x)
+
+        outs = []
+        for i, layer_name in enumerate(self.layers):
+            layer = getattr(self, layer_name)
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        if len(outs) == 1:
+            return outs[0]
+        else:
+            return tuple(outs)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            for param in self.conv1.parameters():
+                param.requires_grad = False
+            for i in range(1, self.frozen_stages + 1):
+                layer = getattr(self, f'layer{i}')
+                layer.eval()
+                for param in layer.parameters():
+                    param.requires_grad = False
+
+    def train(self, mode=True):
+        super().train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmpose/models/backbones/mobilenet_v3.py b/mmpose/models/backbones/mobilenet_v3.py
new file mode 100644
index 0000000000..8a9c3552ea
--- /dev/null
+++ b/mmpose/models/backbones/mobilenet_v3.py
@@ -0,0 +1,185 @@
+import logging
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule, constant_init, kaiming_init
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..registry import BACKBONES
+from .base_backbone import BaseBackbone
+from .utils import InvertedResidual, load_checkpoint
+
+
+@BACKBONES.register_module()
+class MobileNetV3(BaseBackbone):
+    """MobileNetV3 backbone.
+
+    Args:
+        arch (str): Architecture of MobileNetV3, from {small, big}.
+            Default: small.
+        conv_cfg (dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        out_indices (None or Sequence[int]): Output from which stages.
+            Default: (10, ), which means output tensors from final stage.
+        frozen_stages (int): Stages to be frozen (all param fixed).
+            Default: -1, which means not freezing any parameters.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save
+            some memory while slowing down the training speed.
+            Default: False.
+    """
+    # Parameters to build each block:
+    #     [kernel size, mid channels, out channels, with_se, act type, stride]
+    arch_settings = {
+        'small': [[3, 16, 16, True, 'ReLU', 2],
+                  [3, 72, 24, False, 'ReLU', 2],
+                  [3, 88, 24, False, 'ReLU', 1],
+                  [5, 96, 40, True, 'HSwish', 2],
+                  [5, 240, 40, True, 'HSwish', 1],
+                  [5, 240, 40, True, 'HSwish', 1],
+                  [5, 120, 48, True, 'HSwish', 1],
+                  [5, 144, 48, True, 'HSwish', 1],
+                  [5, 288, 96, True, 'HSwish', 2],
+                  [5, 576, 96, True, 'HSwish', 1],
+                  [5, 576, 96, True, 'HSwish', 1]],
+        'big': [[3, 16, 16, False, 'ReLU', 1],
+                [3, 64, 24, False, 'ReLU', 2],
+                [3, 72, 24, False, 'ReLU', 1],
+                [5, 72, 40, True, 'ReLU', 2],
+                [5, 120, 40, True, 'ReLU', 1],
+                [5, 120, 40, True, 'ReLU', 1],
+                [3, 240, 80, False, 'HSwish', 2],
+                [3, 200, 80, False, 'HSwish', 1],
+                [3, 184, 80, False, 'HSwish', 1],
+                [3, 184, 80, False, 'HSwish', 1],
+                [3, 480, 112, True, 'HSwish', 1],
+                [3, 672, 112, True, 'HSwish', 1],
+                [5, 672, 160, True, 'HSwish', 1],
+                [5, 672, 160, True, 'HSwish', 2],
+                [5, 960, 160, True, 'HSwish', 1]]
+    }  # yapf: disable
+
+    def __init__(self,
+                 arch='small',
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 out_indices=(10, ),
+                 frozen_stages=-1,
+                 norm_eval=False,
+                 with_cp=False):
+        super().__init__()
+        assert arch in self.arch_settings
+        for index in out_indices:
+            if index not in range(0, len(self.arch_settings[arch])):
+                raise ValueError('the item in out_indices must be in '
+                                 f'range(0, {len(self.arch_settings[arch])}). '
+                                 f'But received {index}')
+
+        if frozen_stages not in range(-1, len(self.arch_settings[arch])):
+            raise ValueError('frozen_stages must be in range(-1, '
+                             f'{len(self.arch_settings[arch])}). '
+                             f'But received {frozen_stages}')
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.arch = arch
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+
+        self.in_channels = 16
+        self.conv1 = ConvModule(
+            in_channels=3,
+            out_channels=self.in_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=dict(type='HSwish'))
+
+        self.layers = self._make_layer()
+        self.feat_dim = self.arch_settings[arch][-1][2]
+
+    def _make_layer(self):
+        layers = []
+        layer_setting = self.arch_settings[self.arch]
+        for i, params in enumerate(layer_setting):
+            (kernel_size, mid_channels, out_channels, with_se, act,
+             stride) = params
+            if with_se:
+                se_cfg = dict(
+                    channels=mid_channels,
+                    ratio=4,
+                    act_cfg=(dict(type='ReLU'), dict(type='HSigmoid')))
+            else:
+                se_cfg = None
+
+            layer = InvertedResidual(
+                in_channels=self.in_channels,
+                out_channels=out_channels,
+                mid_channels=mid_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                se_cfg=se_cfg,
+                with_expand_conv=True,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                act_cfg=dict(type=act),
+                with_cp=self.with_cp)
+            self.in_channels = out_channels
+            layer_name = 'layer{}'.format(i + 1)
+            self.add_module(layer_name, layer)
+            layers.append(layer_name)
+        return layers
+
+    def init_weights(self, pretrained=None):
+        if isinstance(pretrained, str):
+            logger = logging.getLogger()
+            load_checkpoint(self, pretrained, strict=False, logger=logger)
+        elif pretrained is None:
+            for m in self.modules():
+                if isinstance(m, nn.Conv2d):
+                    kaiming_init(m)
+                elif isinstance(m, nn.BatchNorm2d):
+                    constant_init(m, 1)
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        x = self.conv1(x)
+
+        outs = []
+        for i, layer_name in enumerate(self.layers):
+            layer = getattr(self, layer_name)
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        if len(outs) == 1:
+            return outs[0]
+        else:
+            return tuple(outs)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            for param in self.conv1.parameters():
+                param.requires_grad = False
+            for i in range(1, self.frozen_stages + 1):
+                layer = getattr(self, f'layer{i}')
+                layer.eval()
+                for param in layer.parameters():
+                    param.requires_grad = False
+
+    def train(self, mode=True):
+        super().train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, _BatchNorm):
+                    m.eval()
diff --git a/mmpose/models/backbones/regnet.py b/mmpose/models/backbones/regnet.py
new file mode 100644
index 0000000000..1c093f2548
--- /dev/null
+++ b/mmpose/models/backbones/regnet.py
@@ -0,0 +1,312 @@
+import numpy as np
+import torch.nn as nn
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from ..registry import BACKBONES
+from .resnet import ResNet
+from .resnext import Bottleneck
+
+
+@BACKBONES.register_module()
+class RegNet(ResNet):
+    """RegNet backbone.
+
+    More details can be found in the
+    `paper <https://arxiv.org/abs/2003.13678>`_ .
+
+    Args:
+        arch (dict): The parameters of RegNet:
+            - w0 (int): initial width
+            - wa (float): slope of width
+            - wm (float): quantization parameter to quantize the width
+            - depth (int): depth of the backbone
+            - group_w (int): width of group
+            - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        base_channels (int): Base channels after stem layer.
+ in_channels (int): Number of input image channels. Default: 3. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. Default: "pytorch". + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Default: -1. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import RegNet + >>> import torch + >>> self = RegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + arch_settings = { + 'regnetx_400mf': + dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), + 'regnetx_800mf': + dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), + 'regnetx_1.6gf': + dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), + 'regnetx_3.2gf': + dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), + 'regnetx_4.0gf': + dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), + 'regnetx_6.4gf': + dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), + 'regnetx_8.0gf': + dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), + 'regnetx_12gf': + dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), + } + + def __init__(self, + arch, + in_channels=3, + stem_channels=32, + base_channels=32, + strides=(2, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True): + super(ResNet, self).__init__() + + # Generate RegNet parameters first + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'"arch": "{arch}" is not one of the' \ + ' arch_settings' + arch = self.arch_settings[arch] + elif not isinstance(arch, dict): + raise TypeError('Expect "arch" to be either a string ' + f'or a dict, got {type(arch)}') + + widths, num_stages = self.generate_regnet( + arch['w0'], + arch['wa'], + arch['wm'], + arch['depth'], + ) + # Convert to per stage format + stage_widths, stage_blocks = self.get_stages_from_blocks(widths) + # Generate group widths and bot muls + group_widths = [arch['group_w'] for _ in range(num_stages)] + self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] + # Adjust the compatibility of stage_widths and group_widths + stage_widths, group_widths = self.adjust_width_group( + stage_widths, self.bottleneck_ratio, group_widths) + + # Group params by stage + self.stage_widths = stage_widths + 
self.group_widths = group_widths
+        self.depth = sum(stage_blocks)
+        self.stem_channels = stem_channels
+        self.base_channels = base_channels
+        self.num_stages = num_stages
+        assert num_stages >= 1 and num_stages <= 4
+        self.strides = strides
+        self.dilations = dilations
+        assert len(strides) == len(dilations) == num_stages
+        self.out_indices = out_indices
+        assert max(out_indices) < num_stages
+        self.style = style
+        self.deep_stem = deep_stem
+        if self.deep_stem:
+            raise NotImplementedError(
+                'deep_stem has not been implemented for RegNet')
+        self.avg_down = avg_down
+        self.frozen_stages = frozen_stages
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.with_cp = with_cp
+        self.norm_eval = norm_eval
+        self.zero_init_residual = zero_init_residual
+        self.stage_blocks = stage_blocks[:num_stages]
+
+        self._make_stem_layer(in_channels, stem_channels)
+
+        _in_channels = stem_channels
+        self.res_layers = []
+        for i, num_blocks in enumerate(self.stage_blocks):
+            stride = self.strides[i]
+            dilation = self.dilations[i]
+            group_width = self.group_widths[i]
+            width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i]))
+            stage_groups = width // group_width
+
+            res_layer = self.make_res_layer(
+                block=Bottleneck,
+                num_blocks=num_blocks,
+                in_channels=_in_channels,
+                out_channels=self.stage_widths[i],
+                expansion=1,
+                stride=stride,
+                dilation=dilation,
+                style=self.style,
+                avg_down=self.avg_down,
+                with_cp=self.with_cp,
+                conv_cfg=self.conv_cfg,
+                norm_cfg=self.norm_cfg,
+                base_channels=self.stage_widths[i],
+                groups=stage_groups,
+                width_per_group=group_width)
+            _in_channels = self.stage_widths[i]
+            layer_name = f'layer{i + 1}'
+            self.add_module(layer_name, res_layer)
+            self.res_layers.append(layer_name)
+
+        self._freeze_stages()
+
+        self.feat_dim = stage_widths[-1]
+
+    def _make_stem_layer(self, in_channels, base_channels):
+        self.conv1 = build_conv_layer(
+            self.conv_cfg,
+            in_channels,
+            base_channels,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            bias=False)
+        self.norm1_name, norm1 = build_norm_layer(
+            self.norm_cfg, base_channels, postfix=1)
+        self.add_module(self.norm1_name, norm1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def generate_regnet(self,
+                        initial_width,
+                        width_slope,
+                        width_parameter,
+                        depth,
+                        divisor=8):
+        """Generates per block width from RegNet parameters.
+
+        Args:
+            initial_width ([int]): Initial width of the backbone.
+            width_slope ([float]): Slope of the quantized linear function.
+            width_parameter ([int]): Parameter used to quantize the width.
+            depth ([int]): Depth of the backbone.
+            divisor (int, optional): The divisor of channels. Defaults to 8.
+
+        Returns:
+            list, int: A list of widths of each stage and the number of
+                stages.
+        """
+        assert width_slope >= 0
+        assert initial_width > 0
+        assert width_parameter > 1
+        assert initial_width % divisor == 0
+        widths_cont = np.arange(depth) * width_slope + initial_width
+        ks = np.round(
+            np.log(widths_cont / initial_width) / np.log(width_parameter))
+        widths = initial_width * np.power(width_parameter, ks)
+        widths = np.round(np.divide(widths, divisor)) * divisor
+        num_stages = len(np.unique(widths))
+        widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist()
+        return widths, num_stages
+
+    @staticmethod
+    def quantize_float(number, divisor):
+        """Converts a float to the closest non-zero int divisible by divisor.
+
+        Args:
+            number (int): Original number to be quantized.
+            divisor (int): Divisor used to quantize the number.
+
+        Returns:
+            int: The quantized number that is divisible by divisor.
+ """ + return int(round(number / divisor) * divisor) + + def adjust_width_group(self, widths, bottleneck_ratio, groups): + """Adjusts the compatibility of widths and groups. + + Args: + widths (list[int]): Width of each stage. + bottleneck_ratio (float): Bottleneck ratio. + groups (int): number of groups in each stage + + Returns: + tuple(list): The adjusted widths and groups of each stage. + """ + bottleneck_width = [ + int(w * b) for w, b in zip(widths, bottleneck_ratio) + ] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] + bottleneck_width = [ + self.quantize_float(w_bot, g) + for w_bot, g in zip(bottleneck_width, groups) + ] + widths = [ + int(w_bot / b) + for w_bot, b in zip(bottleneck_width, bottleneck_ratio) + ] + return widths, groups + + def get_stages_from_blocks(self, widths): + """Gets widths/stage_blocks of network at each stage. + + Args: + widths (list[int]): Width in each stage. + + Returns: + tuple(list): width and depth of each stage + """ + width_diff = [ + width != width_prev + for width, width_prev in zip(widths + [0], [0] + widths) + ] + stage_widths = [ + width for width, diff in zip(widths, width_diff[:-1]) if diff + ] + stage_blocks = np.diff([ + depth for depth, diff in zip(range(len(width_diff)), width_diff) + if diff + ]).tolist() + return stage_widths, stage_blocks + + def forward(self, x): + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) diff --git a/mmpose/models/backbones/resnet.py b/mmpose/models/backbones/resnet.py index 23f227fe44..9986031d29 100644 --- a/mmpose/models/backbones/resnet.py +++ b/mmpose/models/backbones/resnet.py @@ -42,7 +42,7 @@ def __init__(self, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN')): - super(BasicBlock, self).__init__() + super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.expansion = expansion @@ -154,7 +154,7 @@ def __init__(self, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN')): - super(Bottleneck, self).__init__() + super().__init__() assert style in ['pytorch', 'caffe'] self.in_channels = in_channels @@ -401,7 +401,7 @@ def __init__(self, norm_cfg=norm_cfg, **kwargs)) - super(ResLayer, self).__init__(*layers) + super().__init__(*layers) @BACKBONES.register_module() @@ -486,7 +486,7 @@ def __init__(self, norm_eval=False, with_cp=False, zero_init_residual=True): - super(ResNet, self).__init__() + super().__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') self.depth = depth @@ -614,7 +614,7 @@ def _freeze_stages(self): param.requires_grad = False def init_weights(self, pretrained=None): - super(ResNet, self).init_weights(pretrained) + super().init_weights(pretrained) if pretrained is None: for m in self.modules(): if isinstance(m, nn.Conv2d): @@ -649,7 +649,7 @@ def forward(self, x): return tuple(outs) def train(self, mode=True): - super(ResNet, self).train(mode) + super().train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): @@ -669,5 +669,4 @@ class ResNetV1d(ResNet): """ def __init__(self, **kwargs): - super(ResNetV1d, self).__init__( - deep_stem=True, avg_down=True, **kwargs) + super().__init__(deep_stem=True, avg_down=True, **kwargs) diff --git a/mmpose/models/backbones/resnext.py b/mmpose/models/backbones/resnext.py new 
file mode 100644 index 0000000000..147fe8f409 --- /dev/null +++ b/mmpose/models/backbones/resnext.py @@ -0,0 +1,147 @@ +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..registry import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResLayer, ResNet + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeXt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + """ + + def __init__(self, + in_channels, + out_channels, + base_channels=64, + groups=32, + width_per_group=4, + **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.groups = groups + self.width_per_group = width_per_group + + # For ResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for ResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. + if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Please refer to the `paper `_ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. 
+ out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/mmpose/models/backbones/scnet.py b/mmpose/models/backbones/scnet.py index 555f00fbd2..bb7febc675 100644 --- a/mmpose/models/backbones/scnet.py +++ b/mmpose/models/backbones/scnet.py @@ -92,7 +92,7 @@ class SCBottleneck(Bottleneck): pooling_r = 4 def __init__(self, in_channels, out_channels, **kwargs): - super(SCBottleneck, self).__init__(in_channels, out_channels, **kwargs) + super().__init__(in_channels, out_channels, **kwargs) self.mid_channels = out_channels // self.expansion // 2 self.norm1_name, norm1 = build_norm_layer( @@ -238,4 +238,4 @@ class SCNet(ResNet): def __init__(self, depth, **kwargs): if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for SCNet') - super(SCNet, self).__init__(depth, **kwargs) + super().__init__(depth, **kwargs) diff --git a/mmpose/models/backbones/seresnet.py b/mmpose/models/backbones/seresnet.py new file mode 100644 index 0000000000..b996fe9441 --- /dev/null +++ b/mmpose/models/backbones/seresnet.py @@ -0,0 +1,124 @@ +import torch.utils.checkpoint as cp + +from ..registry import BACKBONES +from .resnet import Bottleneck, ResLayer, ResNet +from .utils.se_layer import SELayer + + +class SEBottleneck(Bottleneck): + """SEBottleneck block for SEResNet. + + Args: + in_channels (int): The input channels of the SEBottleneck block. + out_channels (int): The output channel of the SEBottleneck block. + se_ratio (int): Squeeze ratio in SELayer. 
Default: 16
+    """
+
+    def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs):
+        super().__init__(in_channels, out_channels, **kwargs)
+        self.se_layer = SELayer(out_channels, ratio=se_ratio)
+
+    def forward(self, x):
+
+        def _inner_forward(x):
+            identity = x
+
+            out = self.conv1(x)
+            out = self.norm1(out)
+            out = self.relu(out)
+
+            out = self.conv2(out)
+            out = self.norm2(out)
+            out = self.relu(out)
+
+            out = self.conv3(out)
+            out = self.norm3(out)
+
+            out = self.se_layer(out)
+
+            if self.downsample is not None:
+                identity = self.downsample(x)
+
+            out += identity
+
+            return out
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(_inner_forward, x)
+        else:
+            out = _inner_forward(x)
+
+        out = self.relu(out)
+
+        return out
+
+
+@BACKBONES.register_module()
+class SEResNet(ResNet):
+    """SEResNet backbone.
+
+    Please refer to the `paper <https://arxiv.org/abs/1709.01507>`_ for
+    details.
+
+    Args:
+        depth (int): Network depth, from {50, 101, 152}.
+        se_ratio (int): Squeeze ratio in SELayer. Default: 16.
+        in_channels (int): Number of input image channels. Default: 3.
+        stem_channels (int): Output channels of the stem layer. Default: 64.
+        num_stages (int): Stages of the network. Default: 4.
+        strides (Sequence[int]): Strides of the first block of each stage.
+            Default: ``(1, 2, 2, 2)``.
+        dilations (Sequence[int]): Dilation of each stage.
+            Default: ``(1, 1, 1, 1)``.
+        out_indices (Sequence[int]): Output from which stages. If only one
+            stage is specified, a single tensor (feature map) is returned;
+            if multiple stages are specified, a tuple of tensors will
+            be returned. Default: ``(3, )``.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
+            Default: False.
+        avg_down (bool): Use AvgPool instead of stride conv when
+            downsampling in the bottleneck. Default: False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters. Default: -1.
+        conv_cfg (dict | None): The config dict for conv layers. Default: None.
+        norm_cfg (dict): The config dict for norm layers.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        zero_init_residual (bool): Whether to use zero init for last norm layer
+            in resblocks to let them behave as identity. Default: True.
+
+    Example:
+        >>> from mmpose.models import SEResNet
+        >>> import torch
+        >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3))
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 56, 56)
+        (1, 512, 28, 28)
+        (1, 1024, 14, 14)
+        (1, 2048, 7, 7)
+    """
+
+    arch_settings = {
+        50: (SEBottleneck, (3, 4, 6, 3)),
+        101: (SEBottleneck, (3, 4, 23, 3)),
+        152: (SEBottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self, depth, se_ratio=16, **kwargs):
+        if depth not in self.arch_settings:
+            raise KeyError(f'invalid depth {depth} for SEResNet')
+        self.se_ratio = se_ratio
+        super().__init__(depth, **kwargs)
+
+    def make_res_layer(self, **kwargs):
+        return ResLayer(se_ratio=self.se_ratio, **kwargs)
diff --git a/mmpose/models/backbones/seresnext.py b/mmpose/models/backbones/seresnext.py
new file mode 100644
index 0000000000..550bfbdb2e
--- /dev/null
+++ b/mmpose/models/backbones/seresnext.py
@@ -0,0 +1,153 @@
+from mmcv.cnn import build_conv_layer, build_norm_layer
+
+from ..registry import BACKBONES
+from .resnet import ResLayer
+from .seresnet import SEBottleneck as _SEBottleneck
+from .seresnet import SEResNet
+
+
+class SEBottleneck(_SEBottleneck):
+    """SEBottleneck block for SEResNeXt.
+
+    Args:
+        in_channels (int): Input channels of this block.
+        out_channels (int): Output channels of this block.
+        base_channels (int): Middle channels of the first stage. Default: 64.
+        groups (int): Groups of conv2.
+        width_per_group (int): Width per group of conv2. 64x4d indicates
+            ``groups=64, width_per_group=4`` and 32x8d indicates
+            ``groups=32, width_per_group=8``.
+        stride (int): stride of the block. Default: 1
+        dilation (int): dilation of convolution. Default: 1
+        downsample (nn.Module): downsample operation on identity branch.
+            Default: None
+        se_ratio (int): Squeeze ratio in SELayer. Default: 16
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        conv_cfg (dict): dictionary to construct and config conv layer.
+            Default: None
+        norm_cfg (dict): dictionary to construct and config norm layer.
+            Default: dict(type='BN')
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 base_channels=64,
+                 groups=32,
+                 width_per_group=4,
+                 se_ratio=16,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, se_ratio, **kwargs)
+        self.groups = groups
+        self.width_per_group = width_per_group
+
+        # We follow the same rationale as ResNeXt to compute mid_channels.
+        # For SEResNet bottleneck, middle channels are determined by expansion
+        # and out_channels, but for SEResNeXt bottleneck, it is determined by
+        # groups and width_per_group and the stage it is located in.
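+        # e.g. with the default 32x4d setting in the first stage of
+        # SEResNeXt-50: out_channels=256 and expansion=4 give
+        # mid_channels=64, which is rescaled to 32 * 4 * 64 // 64 = 128.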
+ if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class SEResNeXt(SEResNet): + """SEResNeXt backbone. + + Please refer to the `paper `_ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + se_ratio (int): Squeeze ratio in SELayer. Default: 16. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. 
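+
+    Example (a usage sketch; the output channels follow the same
+    Bottleneck expansion of 4 as in the SEResNet example):
+        >>> from mmpose.models import SEResNeXt
+        >>> import torch
+        >>> self = SEResNeXt(depth=50, out_indices=(0, 1, 2, 3))
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 56, 56)
+        (1, 512, 28, 28)
+        (1, 1024, 14, 14)
+        (1, 2048, 7, 7)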
+ """ + + arch_settings = { + 50: (SEBottleneck, (3, 4, 6, 3)), + 101: (SEBottleneck, (3, 4, 23, 3)), + 152: (SEBottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/mmpose/models/backbones/shufflenet_v1.py b/mmpose/models/backbones/shufflenet_v1.py new file mode 100644 index 0000000000..b79805925a --- /dev/null +++ b/mmpose/models/backbones/shufflenet_v1.py @@ -0,0 +1,322 @@ +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, build_activation_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from ..registry import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint, make_divisible + + +class ShuffleUnit(nn.Module): + """ShuffleUnit block. + + ShuffleNet unit with pointwise group convolution (GConv) and channel + shuffle. + + Args: + in_channels (int): The input channels of the ShuffleUnit. + out_channels (int): The output channels of the ShuffleUnit. + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3 + first_block (bool, optional): Whether it is the first ShuffleUnit of a + sequential ShuffleUnits. Default: False, which means not using the + grouped 1x1 convolution. + combine (str, optional): The ways to combine the input and output + branches. Default: 'add'. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + groups=3, + first_block=True, + combine='add', + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.first_block = first_block + self.combine = combine + self.groups = groups + self.bottleneck_channels = self.out_channels // 4 + self.with_cp = with_cp + + if self.combine == 'add': + self.depthwise_stride = 1 + self._combine_func = self._add + assert in_channels == out_channels, ( + 'in_channels must be equal to out_channels when combine ' + 'is add') + elif self.combine == 'concat': + self.depthwise_stride = 2 + self._combine_func = self._concat + self.out_channels -= self.in_channels + self.avgpool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) + else: + raise ValueError(f'Cannot combine tensors with {self.combine}. 
' + 'Only "add" and "concat" are supported') + + self.first_1x1_groups = 1 if first_block else self.groups + self.g_conv_1x1_compress = ConvModule( + in_channels=self.in_channels, + out_channels=self.bottleneck_channels, + kernel_size=1, + groups=self.first_1x1_groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.depthwise_conv3x3_bn = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.bottleneck_channels, + kernel_size=3, + stride=self.depthwise_stride, + padding=1, + groups=self.bottleneck_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.g_conv_1x1_expand = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.out_channels, + kernel_size=1, + groups=self.groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.act = build_activation_layer(act_cfg) + + @staticmethod + def _add(x, out): + # residual connection + return x + out + + @staticmethod + def _concat(x, out): + # concatenate along channel axis + return torch.cat((x, out), 1) + + def forward(self, x): + + def _inner_forward(x): + residual = x + + out = self.g_conv_1x1_compress(x) + out = self.depthwise_conv3x3_bn(out) + + if self.groups > 1: + out = channel_shuffle(out, self.groups) + + out = self.g_conv_1x1_expand(out) + + if self.combine == 'concat': + residual = self.avgpool(residual) + out = self.act(out) + out = self._combine_func(residual, out) + else: + out = self._combine_func(residual, out) + out = self.act(out) + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV1(BaseBackbone): + """ShuffleNetV1 backbone. + + Args: + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3. + widen_factor (float, optional): Width multiplier - adjusts the number + of channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, ) + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + groups=3, + widen_factor=1.0, + out_indices=(2, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + super().__init__() + self.stage_blocks = [4, 8, 4] + self.groups = groups + + for index in out_indices: + if index not in range(0, 3): + raise ValueError('the item in out_indices must in ' + f'range(0, 3). But received {index}') + + if frozen_stages not in range(-1, 3): + raise ValueError('frozen_stages must be in range(-1, 3). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if groups == 1: + channels = (144, 288, 576) + elif groups == 2: + channels = (200, 400, 800) + elif groups == 3: + channels = (240, 480, 960) + elif groups == 4: + channels = (272, 544, 1088) + elif groups == 8: + channels = (384, 768, 1536) + else: + raise ValueError(f'{groups} groups is not supported for 1x1 ' + 'Grouped Convolutions') + + channels = [make_divisible(ch * widen_factor, 8) for ch in channels] + + self.in_channels = int(24 * widen_factor) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + first_block = True if i == 0 else False + layer = self.make_layer(channels[i], num_blocks, first_block) + self.layers.append(layer) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(self.frozen_stages): + layer = self.layers[i] + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m.weight, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. But received ' + f'{type(pretrained)}') + + def make_layer(self, out_channels, num_blocks, first_block=False): + """Stack ShuffleUnit blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): Number of blocks. + first_block (bool, optional): Whether is the first ShuffleUnit of a + sequential ShuffleUnits. Default: False, which means not using + the grouped 1x1 convolution. 
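+
+        Note (behaviour sketch): as implemented below, only the first unit
+        of a layer may use the ungrouped 1x1 conv (``first_block``), and the
+        first unit always uses ``combine='concat'`` with stride 2, so each
+        layer halves the spatial size; the remaining units use ``'add'``.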
+ """ + layers = [] + for i in range(num_blocks): + first_block = first_block if i == 0 else False + combine_mode = 'concat' if i == 0 else 'add' + layers.append( + ShuffleUnit( + self.in_channels, + out_channels, + groups=self.groups, + first_block=first_block, + combine=combine_mode, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/mmpose/models/backbones/shufflenet_v2.py b/mmpose/models/backbones/shufflenet_v2.py new file mode 100644 index 0000000000..c681db5f75 --- /dev/null +++ b/mmpose/models/backbones/shufflenet_v2.py @@ -0,0 +1,295 @@ +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..registry import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint + + +class InvertedResidual(nn.Module): + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + self.stride = stride + self.with_cp = with_cp + + branch_features = out_channels // 2 + if self.stride == 1: + assert in_channels == branch_features * 2, ( + f'in_channels ({in_channels}) should equal to ' + f'branch_features * 2 ({branch_features * 2}) ' + 'when stride is 1') + + if in_channels != branch_features * 2: + assert self.stride != 1, ( + f'stride ({self.stride}) should not equal 1 when ' + f'in_channels != branch_features * 2') + + if self.stride > 1: + self.branch1 = nn.Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=self.stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.branch2 = nn.Sequential( + ConvModule( + in_channels if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + branch_features, + branch_features, + kernel_size=3, + stride=self.stride, + padding=1, + groups=branch_features, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + + def _inner_forward(x): + if self.stride > 1: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + else: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV2(BaseBackbone): + """ShuffleNetV2 backbone. + + Args: + widen_factor (float): Width multiplier - adjusts the number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (0, 1, 2, 3). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + widen_factor=1.0, + out_indices=(3, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + super().__init__() + self.stage_blocks = [4, 8, 4] + for index in out_indices: + if index not in range(0, 4): + raise ValueError('the item in out_indices must in ' + f'range(0, 4). But received {index}') + + if frozen_stages not in range(-1, 4): + raise ValueError('frozen_stages must be in range(-1, 4). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if widen_factor == 0.5: + channels = [48, 96, 192, 1024] + elif widen_factor == 1.0: + channels = [116, 232, 464, 1024] + elif widen_factor == 1.5: + channels = [176, 352, 704, 1024] + elif widen_factor == 2.0: + channels = [244, 488, 976, 2048] + else: + raise ValueError('widen_factor must be in [0.5, 1.0, 1.5, 2.0]. ' + f'But received {widen_factor}') + + self.in_channels = 24 + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + layer = self._make_layer(channels[i], num_blocks) + self.layers.append(layer) + + output_channels = channels[-1] + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=output_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def _make_layer(self, out_channels, num_blocks): + """Stack blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): number of blocks. + """ + layers = [] + for i in range(num_blocks): + stride = 2 if i == 0 else 1 + layers.append( + InvertedResidual( + in_channels=self.in_channels, + out_channels=out_channels, + stride=stride, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m.weight, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. 
But received '
+                            f'{type(pretrained)}')
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.maxpool(x)
+
+        outs = []
+        for i, layer in enumerate(self.layers):
+            x = layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+
+        if len(outs) == 1:
+            return outs[0]
+        else:
+            return tuple(outs)
+
+    def train(self, mode=True):
+        super().train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eval()
diff --git a/mmpose/models/backbones/utils/__init__.py b/mmpose/models/backbones/utils/__init__.py
new file mode 100644
index 0000000000..aeea0a48a1
--- /dev/null
+++ b/mmpose/models/backbones/utils/__init__.py
@@ -0,0 +1,10 @@
+from .channel_shuffle import channel_shuffle
+from .inverted_residual import InvertedResidual
+from .make_divisible import make_divisible
+from .se_layer import SELayer
+from .utils import load_checkpoint
+
+__all__ = [
+    'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer',
+    'load_checkpoint'
+]
diff --git a/mmpose/models/backbones/utils/channel_shuffle.py b/mmpose/models/backbones/utils/channel_shuffle.py
new file mode 100644
index 0000000000..51d6d98c9b
--- /dev/null
+++ b/mmpose/models/backbones/utils/channel_shuffle.py
@@ -0,0 +1,28 @@
+import torch
+
+
+def channel_shuffle(x, groups):
+    """Channel Shuffle operation.
+
+    This function enables cross-group information flow for multiple group
+    convolution layers.
+
+    Args:
+        x (Tensor): The input tensor.
+        groups (int): The number of groups to divide the input tensor
+            in the channel dimension.
+
+    Returns:
+        Tensor: The output tensor after channel shuffle operation.
+    """
+
+    batch_size, num_channels, height, width = x.size()
+    assert (num_channels % groups == 0), ('num_channels should be '
+                                          'divisible by groups')
+    channels_per_group = num_channels // groups
+
+    x = x.view(batch_size, groups, channels_per_group, height, width)
+    x = torch.transpose(x, 1, 2).contiguous()
+    x = x.view(batch_size, -1, height, width)
+
+    return x
diff --git a/mmpose/models/backbones/utils/inverted_residual.py b/mmpose/models/backbones/utils/inverted_residual.py
new file mode 100644
index 0000000000..b6903fa0e2
--- /dev/null
+++ b/mmpose/models/backbones/utils/inverted_residual.py
@@ -0,0 +1,117 @@
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import ConvModule
+
+from .se_layer import SELayer
+
+
+class InvertedResidual(nn.Module):
+    """Inverted Residual Block.
+
+    Args:
+        in_channels (int): The input channels of this Module.
+        out_channels (int): The output channels of this Module.
+        mid_channels (int): The input channels of the depthwise convolution.
+        kernel_size (int): The kernel size of the depthwise convolution.
+            Default: 3.
+        stride (int): The stride of the depthwise convolution. Default: 1.
+        se_cfg (dict): Config dict for se layer. Default: None, which means no
+            se layer.
+        with_expand_conv (bool): Use expand conv or not. If set False,
+            mid_channels must be the same with in_channels.
+            Default: True.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU').
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+
+    Returns:
+        Tensor: The output tensor.
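+
+    Example (a sketch; ``mid_channels`` here is the expanded width):
+        >>> import torch
+        >>> block = InvertedResidual(16, 16, mid_channels=64)
+        >>> x = torch.randn(1, 16, 56, 56)
+        >>> block(x).shape  # stride=1 and 16 -> 16, so the shortcut is kept
+        torch.Size([1, 16, 56, 56])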
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2] + self.with_cp = with_cp + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=mid_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if self.with_se: + self.se = SELayer(**se_cfg) + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + out + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/mmpose/models/backbones/utils/make_divisible.py b/mmpose/models/backbones/utils/make_divisible.py new file mode 100644 index 0000000000..02ee047c50 --- /dev/null +++ b/mmpose/models/backbones/utils/make_divisible.py @@ -0,0 +1,24 @@ +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number down to the nearest value that can + be divisible by the divisor. + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int, optional): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float, optional): The minimum ratio of the rounded channel + number to the original channel number. Default: 0.9. + Returns: + int: The modified output channel number + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value diff --git a/mmpose/models/backbones/utils/se_layer.py b/mmpose/models/backbones/utils/se_layer.py new file mode 100644 index 0000000000..80b2ee277f --- /dev/null +++ b/mmpose/models/backbones/utils/se_layer.py @@ -0,0 +1,53 @@ +import mmcv +import torch.nn as nn +from mmcv.cnn import ConvModule + + +class SELayer(nn.Module): + """Squeeze-and-Excitation Module. + + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Default: 16. + conv_cfg (None or dict): Config dict for convolution layer. 
+ Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Default: (dict(type='ReLU'), dict(type='Sigmoid')) + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out diff --git a/mmpose/models/backbones/utils/utils.py b/mmpose/models/backbones/utils/utils.py new file mode 100644 index 0000000000..de500268eb --- /dev/null +++ b/mmpose/models/backbones/utils/utils.py @@ -0,0 +1,50 @@ +from collections import OrderedDict + +from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict + + +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + + Returns: + dict or OrderedDict: The loaded checkpoint. 
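+
+    Example (prefix handling): a key such as ``module.backbone.conv1.weight``
+        in the checkpoint is remapped to ``conv1.weight`` before loading, so
+        weights saved from a full model can be reused by a bare backbone.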
+ """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict_tmp = checkpoint['state_dict'] + else: + state_dict_tmp = checkpoint + + state_dict = OrderedDict() + # strip prefix of state_dict + for k, v in state_dict_tmp.items(): + if k.startswith('module.backbone.'): + state_dict[k[16:]] = v + elif k.startswith('module.'): + state_dict[k[7:]] = v + elif k.startswith('backbone.'): + state_dict[k[9:]] = v + else: + state_dict[k] = v + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint diff --git a/mmpose/models/losses/mse_loss.py b/mmpose/models/losses/mse_loss.py index f6a66c22d2..60eff616d0 100644 --- a/mmpose/models/losses/mse_loss.py +++ b/mmpose/models/losses/mse_loss.py @@ -14,7 +14,7 @@ class JointsMSELoss(nn.Module): """ def __init__(self, use_target_weight=False): - super(JointsMSELoss, self).__init__() + super().__init__() self.criterion = nn.MSELoss() self.use_target_weight = use_target_weight @@ -52,7 +52,7 @@ class JointsOHKMMSELoss(nn.Module): """ def __init__(self, use_target_weight=False, topk=8): - super(JointsOHKMMSELoss, self).__init__() + super().__init__() assert topk > 0 self.criterion = nn.MSELoss(reduction='none') self.use_target_weight = use_target_weight diff --git a/tests/test_backbones/test_alexnet.py b/tests/test_backbones/test_alexnet.py new file mode 100644 index 0000000000..99dad8ffa3 --- /dev/null +++ b/tests/test_backbones/test_alexnet.py @@ -0,0 +1,20 @@ +import torch + +from mmpose.models.backbones import AlexNet + + +def test_alexnet_backbone(): + """Test alexnet backbone.""" + model = AlexNet(-1) + model.train() + + imgs = torch.randn(1, 3, 256, 192) + feat = model(imgs) + assert feat.shape == (1, 256, 7, 5) + + model = AlexNet(1) + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat.shape == (1, 1) diff --git a/tests/test_backbones/test_backbones_utils.py b/tests/test_backbones/test_backbones_utils.py new file mode 100644 index 0000000000..a5f778536b --- /dev/null +++ b/tests/test_backbones/test_backbones_utils.py @@ -0,0 +1,116 @@ +import pytest +import torch +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.models.backbones.utils import (InvertedResidual, SELayer, + channel_shuffle, make_divisible) + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def test_make_divisible(): + # test min_value is None + result = make_divisible(34, 8, None) + assert result == 32 + + # test when new_value > min_ratio * value + result = make_divisible(10, 8, min_ratio=0.9) + assert result == 16 + + # test min_value = 0.8 + result = make_divisible(33, 8, min_ratio=0.8) + assert result == 32 + + +def test_channel_shuffle(): + x = torch.randn(1, 24, 56, 56) + with pytest.raises(AssertionError): + # num_channels should be divisible by groups + channel_shuffle(x, 7) + + groups = 3 + batch_size, num_channels, height, width = x.size() + channels_per_group = num_channels // groups + out = channel_shuffle(x, groups) + # test the output value when groups = 3 + for b in range(batch_size): + for c in range(num_channels): + c_out = c % channels_per_group * groups + c // channels_per_group + for i in range(height): + for j 
in range(width): + assert x[b, c, i, j] == out[b, c_out, i, j] + + +def test_inverted_residual(): + + with pytest.raises(AssertionError): + # stride must be in [1, 2] + InvertedResidual(16, 16, 32, stride=3) + + with pytest.raises(AssertionError): + # se_cfg must be None or dict + InvertedResidual(16, 16, 32, se_cfg=list()) + + with pytest.raises(AssertionError): + # in_channeld and out_channels must be the same if + # with_expand_conv is False + InvertedResidual(16, 16, 32, with_expand_conv=False) + + # Test InvertedResidual forward, stride=1 + block = InvertedResidual(16, 16, 32, stride=1) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert getattr(block, 'se', None) is None + assert block.with_res_shortcut + assert x_out.shape == torch.Size((1, 16, 56, 56)) + + # Test InvertedResidual forward, stride=2 + block = InvertedResidual(16, 16, 32, stride=2) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert not block.with_res_shortcut + assert x_out.shape == torch.Size((1, 16, 28, 28)) + + # Test InvertedResidual forward with se layer + se_cfg = dict(channels=32) + block = InvertedResidual(16, 16, 32, stride=1, se_cfg=se_cfg) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert isinstance(block.se, SELayer) + assert x_out.shape == torch.Size((1, 16, 56, 56)) + + # Test InvertedResidual forward, with_expand_conv=False + block = InvertedResidual(32, 16, 32, with_expand_conv=False) + x = torch.randn(1, 32, 56, 56) + x_out = block(x) + assert getattr(block, 'expand_conv', None) is None + assert x_out.shape == torch.Size((1, 16, 56, 56)) + + # Test InvertedResidual forward with GroupNorm + block = InvertedResidual( + 16, 16, 32, norm_cfg=dict(type='GN', num_groups=2)) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + for m in block.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + assert x_out.shape == torch.Size((1, 16, 56, 56)) + + # Test InvertedResidual forward with HSigmoid + block = InvertedResidual(16, 16, 32, act_cfg=dict(type='HSigmoid')) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert x_out.shape == torch.Size((1, 16, 56, 56)) + + # Test InvertedResidual forward with checkpoint + block = InvertedResidual(16, 16, 32, with_cp=True) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert block.with_cp + assert x_out.shape == torch.Size((1, 16, 56, 56)) diff --git a/tests/test_backbones/test_mobilenet_v2.py b/tests/test_backbones/test_mobilenet_v2.py new file mode 100644 index 0000000000..421de11261 --- /dev/null +++ b/tests/test_backbones/test_mobilenet_v2.py @@ -0,0 +1,256 @@ +import pytest +import torch +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.models.backbones import MobileNetV2 +from mmpose.models.backbones.mobilenet_v2 import InvertedResidual + + +def is_block(modules): + """Check if is ResNet building block.""" + if isinstance(modules, (InvertedResidual, )): + return True + return False + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +def test_mobilenetv2_invertedresidual(): + + with pytest.raises(AssertionError): + # stride must be in [1, 2] + InvertedResidual(16, 24, stride=3, expand_ratio=6) + + # Test InvertedResidual with checkpoint 
forward, stride=1 + block = InvertedResidual(16, 24, stride=1, expand_ratio=6) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert x_out.shape == torch.Size((1, 24, 56, 56)) + + # Test InvertedResidual with expand_ratio=1 + block = InvertedResidual(16, 16, stride=1, expand_ratio=1) + assert len(block.conv) == 2 + + # Test InvertedResidual with use_res_connect + block = InvertedResidual(16, 16, stride=1, expand_ratio=6) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert block.use_res_connect is True + assert x_out.shape == torch.Size((1, 16, 56, 56)) + + # Test InvertedResidual with checkpoint forward, stride=2 + block = InvertedResidual(16, 24, stride=2, expand_ratio=6) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert x_out.shape == torch.Size((1, 24, 28, 28)) + + # Test InvertedResidual with checkpoint forward + block = InvertedResidual(16, 24, stride=1, expand_ratio=6, with_cp=True) + assert block.with_cp + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert x_out.shape == torch.Size((1, 24, 56, 56)) + + # Test InvertedResidual with act_cfg=dict(type='ReLU') + block = InvertedResidual( + 16, 24, stride=1, expand_ratio=6, act_cfg=dict(type='ReLU')) + x = torch.randn(1, 16, 56, 56) + x_out = block(x) + assert x_out.shape == torch.Size((1, 24, 56, 56)) + + +def test_mobilenetv2_backbone(): + with pytest.raises(TypeError): + # pretrained must be a string path + model = MobileNetV2() + model.init_weights(pretrained=0) + + with pytest.raises(ValueError): + # frozen_stages must in range(1, 8) + MobileNetV2(frozen_stages=8) + + with pytest.raises(ValueError): + # tout_indices in range(-1, 8) + MobileNetV2(out_indices=[8]) + + # Test MobileNetV2 with first stage frozen + frozen_stages = 1 + model = MobileNetV2(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.conv1.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'layer{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test MobileNetV2 with norm_eval=True + model = MobileNetV2(norm_eval=True) + model.init_weights() + model.train() + + assert check_norm_state(model.modules(), False) + + # Test MobileNetV2 forward with widen_factor=1.0 + model = MobileNetV2(widen_factor=1.0, out_indices=range(0, 8)) + model.init_weights() + model.train() + + assert check_norm_state(model.modules(), True) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 8 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + assert feat[7].shape == torch.Size((1, 1280, 7, 7)) + + # Test MobileNetV2 forward with widen_factor=0.5 + model = MobileNetV2(widen_factor=0.5, out_indices=range(0, 7)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 8, 112, 112)) + assert feat[1].shape == torch.Size((1, 16, 56, 56)) + assert feat[2].shape == torch.Size((1, 16, 28, 28)) + assert feat[3].shape == torch.Size((1, 32, 14, 14)) + 
assert feat[4].shape == torch.Size((1, 48, 14, 14)) + assert feat[5].shape == torch.Size((1, 80, 7, 7)) + assert feat[6].shape == torch.Size((1, 160, 7, 7)) + + # Test MobileNetV2 forward with widen_factor=2.0 + model = MobileNetV2(widen_factor=2.0) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat.shape == torch.Size((1, 2560, 7, 7)) + + # Test MobileNetV2 forward with out_indices=None + model = MobileNetV2(widen_factor=1.0) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat.shape == torch.Size((1, 1280, 7, 7)) + + # Test MobileNetV2 forward with dict(type='ReLU') + model = MobileNetV2( + widen_factor=1.0, act_cfg=dict(type='ReLU'), out_indices=range(0, 7)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + + # Test MobileNetV2 with GroupNorm forward + model = MobileNetV2(widen_factor=1.0, out_indices=range(0, 7)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + + # Test MobileNetV2 with BatchNorm forward + model = MobileNetV2( + widen_factor=1.0, + norm_cfg=dict(type='GN', num_groups=2, requires_grad=True), + out_indices=range(0, 7)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 24, 56, 56)) + assert feat[2].shape == torch.Size((1, 32, 28, 28)) + assert feat[3].shape == torch.Size((1, 64, 14, 14)) + assert feat[4].shape == torch.Size((1, 96, 14, 14)) + assert feat[5].shape == torch.Size((1, 160, 7, 7)) + assert feat[6].shape == torch.Size((1, 320, 7, 7)) + + # Test MobileNetV2 with layers 1, 3, 5 out forward + model = MobileNetV2(widen_factor=1.0, out_indices=(0, 2, 4)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == torch.Size((1, 32, 28, 28)) + assert feat[2].shape == torch.Size((1, 96, 14, 14)) + + # Test MobileNetV2 with checkpoint forward + model = MobileNetV2( + widen_factor=1.0, with_cp=True, out_indices=range(0, 7)) + for m in model.modules(): + if is_block(m): + assert m.with_cp + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size((1, 16, 112, 112)) + assert feat[1].shape == 
+    assert feat[2].shape == torch.Size((1, 32, 28, 28))
+    assert feat[3].shape == torch.Size((1, 64, 14, 14))
+    assert feat[4].shape == torch.Size((1, 96, 14, 14))
+    assert feat[5].shape == torch.Size((1, 160, 7, 7))
+    assert feat[6].shape == torch.Size((1, 320, 7, 7))
diff --git a/tests/test_backbones/test_mobilenet_v3.py b/tests/test_backbones/test_mobilenet_v3.py
new file mode 100644
index 0000000000..41ce602690
--- /dev/null
+++ b/tests/test_backbones/test_mobilenet_v3.py
@@ -0,0 +1,168 @@
+import pytest
+import torch
+from torch.nn.modules import GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmpose.models.backbones import MobileNetV3
+from mmpose.models.backbones.utils import InvertedResidual
+
+
+def is_norm(modules):
+    """Check if is one of the norms."""
+    if isinstance(modules, (GroupNorm, _BatchNorm)):
+        return True
+    return False
+
+
+def check_norm_state(modules, train_state):
+    """Check if norm layer is in correct train state."""
+    for mod in modules:
+        if isinstance(mod, _BatchNorm):
+            if mod.training != train_state:
+                return False
+    return True
+
+
+def test_mobilenetv3_backbone():
+    with pytest.raises(TypeError):
+        # pretrained must be a string path
+        model = MobileNetV3()
+        model.init_weights(pretrained=0)
+
+    with pytest.raises(AssertionError):
+        # arch must be in ['small', 'big']
+        MobileNetV3(arch='others')
+
+    with pytest.raises(ValueError):
+        # frozen_stages must be less than 12 when arch is small
+        MobileNetV3(arch='small', frozen_stages=12)
+
+    with pytest.raises(ValueError):
+        # frozen_stages must be less than 16 when arch is big
+        MobileNetV3(arch='big', frozen_stages=16)
+
+    with pytest.raises(ValueError):
+        # the max item of out_indices must be less than 11 when arch is small
+        MobileNetV3(arch='small', out_indices=(11, ))
+
+    with pytest.raises(ValueError):
+        # the max item of out_indices must be less than 15 when arch is big
+        MobileNetV3(arch='big', out_indices=(15, ))
+
+    # Test MobileNetv3
+    model = MobileNetV3()
+    model.init_weights()
+    model.train()
+
+    # Test MobileNetv3 with first stage frozen
+    frozen_stages = 1
+    model = MobileNetV3(frozen_stages=frozen_stages)
+    model.init_weights()
+    model.train()
+    for param in model.conv1.parameters():
+        assert param.requires_grad is False
+    for i in range(1, frozen_stages + 1):
+        layer = getattr(model, f'layer{i}')
+        for mod in layer.modules():
+            if isinstance(mod, _BatchNorm):
+                assert mod.training is False
+        for param in layer.parameters():
+            assert param.requires_grad is False
+
+    # Test MobileNetv3 with norm eval
+    model = MobileNetV3(norm_eval=True, out_indices=range(0, 11))
+    model.init_weights()
+    model.train()
+    assert check_norm_state(model.modules(), False)
+
+    # Test MobileNetv3 forward with small arch
+    model = MobileNetV3(out_indices=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 11
+    assert feat[0].shape == torch.Size([1, 16, 56, 56])
+    assert feat[1].shape == torch.Size([1, 24, 28, 28])
+    assert feat[2].shape == torch.Size([1, 24, 28, 28])
+    assert feat[3].shape == torch.Size([1, 40, 14, 14])
+    assert feat[4].shape == torch.Size([1, 40, 14, 14])
+    assert feat[5].shape == torch.Size([1, 40, 14, 14])
+    assert feat[6].shape == torch.Size([1, 48, 14, 14])
+    assert feat[7].shape == torch.Size([1, 48, 14, 14])
+    assert feat[8].shape == torch.Size([1, 96, 7, 7])
+    assert feat[9].shape == torch.Size([1, 96, 7, 7])
+    assert feat[10].shape == torch.Size([1, 96, 7, 7])
+
+    # Test MobileNetv3 forward with small arch and GroupNorm
+    model = MobileNetV3(
+        out_indices=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
+        norm_cfg=dict(type='GN', num_groups=2, requires_grad=True))
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, GroupNorm)
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 11
+    assert feat[0].shape == torch.Size([1, 16, 56, 56])
+    assert feat[1].shape == torch.Size([1, 24, 28, 28])
+    assert feat[2].shape == torch.Size([1, 24, 28, 28])
+    assert feat[3].shape == torch.Size([1, 40, 14, 14])
+    assert feat[4].shape == torch.Size([1, 40, 14, 14])
+    assert feat[5].shape == torch.Size([1, 40, 14, 14])
+    assert feat[6].shape == torch.Size([1, 48, 14, 14])
+    assert feat[7].shape == torch.Size([1, 48, 14, 14])
+    assert feat[8].shape == torch.Size([1, 96, 7, 7])
+    assert feat[9].shape == torch.Size([1, 96, 7, 7])
+    assert feat[10].shape == torch.Size([1, 96, 7, 7])
+
+    # Test MobileNetv3 forward with big arch
+    model = MobileNetV3(
+        arch='big',
+        out_indices=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14))
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 15
+    assert feat[0].shape == torch.Size([1, 16, 112, 112])
+    assert feat[1].shape == torch.Size([1, 24, 56, 56])
+    assert feat[2].shape == torch.Size([1, 24, 56, 56])
+    assert feat[3].shape == torch.Size([1, 40, 28, 28])
+    assert feat[4].shape == torch.Size([1, 40, 28, 28])
+    assert feat[5].shape == torch.Size([1, 40, 28, 28])
+    assert feat[6].shape == torch.Size([1, 80, 14, 14])
+    assert feat[7].shape == torch.Size([1, 80, 14, 14])
+    assert feat[8].shape == torch.Size([1, 80, 14, 14])
+    assert feat[9].shape == torch.Size([1, 80, 14, 14])
+    assert feat[10].shape == torch.Size([1, 112, 14, 14])
+    assert feat[11].shape == torch.Size([1, 112, 14, 14])
+    assert feat[12].shape == torch.Size([1, 160, 14, 14])
+    assert feat[13].shape == torch.Size([1, 160, 7, 7])
+    assert feat[14].shape == torch.Size([1, 160, 7, 7])
+
+    # Test MobileNetv3 forward with big arch and out_indices=(0, )
+    model = MobileNetV3(arch='big', out_indices=(0, ))
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert feat.shape == torch.Size([1, 16, 112, 112])
+
+    # Test MobileNetv3 with checkpoint forward
+    model = MobileNetV3(with_cp=True)
+    for m in model.modules():
+        if isinstance(m, InvertedResidual):
+            assert m.with_cp
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert feat.shape == torch.Size([1, 96, 7, 7])
diff --git a/tests/test_backbones/test_regnet.py b/tests/test_backbones/test_regnet.py
new file mode 100644
index 0000000000..f3e1b04f8a
--- /dev/null
+++ b/tests/test_backbones/test_regnet.py
@@ -0,0 +1,91 @@
+import pytest
+import torch
+
+from mmpose.models.backbones import RegNet
+
+regnet_test_data = [
+    ('regnetx_400mf',
+     dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22,
+          bot_mul=1.0), [32, 64, 160, 384]),
+    ('regnetx_800mf',
+     dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16,
+          bot_mul=1.0), [64, 128, 288, 672]),
+    ('regnetx_1.6gf',
+     dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18,
+          bot_mul=1.0), [72, 168, 408, 912]),
+    ('regnetx_3.2gf',
+     dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25,
+          bot_mul=1.0), [96, 192, 432, 1008]),
+    ('regnetx_4.0gf',
+     dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23,
+          bot_mul=1.0), [80, 240, 560, 1360]),
+    ('regnetx_6.4gf',
+     dict(w0=184, wa=60.83, wm=2.07,
+          group_w=56, depth=17,
+          bot_mul=1.0), [168, 392, 784, 1624]),
+    ('regnetx_8.0gf',
+     dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23,
+          bot_mul=1.0), [80, 240, 720, 1920]),
+    ('regnetx_12gf',
+     dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19,
+          bot_mul=1.0), [224, 448, 896, 2240]),
+]
+
+
+@pytest.mark.parametrize('arch_name,arch,out_channels', regnet_test_data)
+def test_regnet_backbone(arch_name, arch, out_channels):
+    with pytest.raises(AssertionError):
+        # arch_name must be one of the predefined RegNet architectures
+        RegNet(arch_name + '233')
+
+    # output the last feature map
+    model = RegNet(arch_name)
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert isinstance(feat, torch.Tensor)
+    assert feat.shape == (1, out_channels[-1], 7, 7)
+
+    # output feature map of all stages
+    model = RegNet(arch_name, out_indices=(0, 1, 2, 3))
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == (1, out_channels[0], 56, 56)
+    assert feat[1].shape == (1, out_channels[1], 28, 28)
+    assert feat[2].shape == (1, out_channels[2], 14, 14)
+    assert feat[3].shape == (1, out_channels[3], 7, 7)
+
+
+@pytest.mark.parametrize('arch_name,arch,out_channels', regnet_test_data)
+def test_custom_arch(arch_name, arch, out_channels):
+    # output the last feature map
+    model = RegNet(arch)
+    model.init_weights()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert isinstance(feat, torch.Tensor)
+    assert feat.shape == (1, out_channels[-1], 7, 7)
+
+    # output feature map of all stages
+    model = RegNet(arch, out_indices=(0, 1, 2, 3))
+    model.init_weights()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == (1, out_channels[0], 56, 56)
+    assert feat[1].shape == (1, out_channels[1], 28, 28)
+    assert feat[2].shape == (1, out_channels[2], 14, 14)
+    assert feat[3].shape == (1, out_channels[3], 7, 7)
+
+
+def test_exception():
+    # arch must be a str or dict
+    with pytest.raises(TypeError):
+        _ = RegNet(50)
diff --git a/tests/test_backbones/test_resnext.py b/tests/test_backbones/test_resnext.py
new file mode 100644
index 0000000000..3b11c6c92b
--- /dev/null
+++ b/tests/test_backbones/test_resnext.py
@@ -0,0 +1,59 @@
+import pytest
+import torch
+
+from mmpose.models.backbones import ResNeXt
+from mmpose.models.backbones.resnext import Bottleneck as BottleneckX
+
+
+def test_bottleneck():
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        BottleneckX(64, 64, groups=32, width_per_group=4, style='tensorflow')
+
+    # Test ResNeXt Bottleneck structure
+    block = BottleneckX(
+        64, 256, groups=32, width_per_group=4, stride=2, style='pytorch')
+    assert block.conv2.stride == (2, 2)
+    assert block.conv2.groups == 32
+    assert block.conv2.out_channels == 128
+
+    # Test ResNeXt Bottleneck forward
+    block = BottleneckX(
+        64, 64, base_channels=16, groups=32, width_per_group=4)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_resnext():
+    with pytest.raises(KeyError):
+        # ResNeXt depth should be in [50, 101, 152]
+        ResNeXt(depth=18)
+
+    # Test ResNeXt with group 32, width_per_group 4
+    model = ResNeXt(
+        depth=50, groups=32, width_per_group=4, out_indices=(0, 1, 2, 3))
+    for m in model.modules():
+        if isinstance(m, BottleneckX):
+            assert m.conv2.groups == 32
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
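+    # Illustrative note (an added comment, not in the original patch): with
+    # groups=32 and width_per_group=4, the grouped 3x3 conv of the first
+    # stage is 32 * 4 = 128 channels wide (checked in test_bottleneck above),
+    # while the stage outputs keep the ResNet-50 widths asserted below.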
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 256, 56, 56])
+    assert feat[1].shape == torch.Size([1, 512, 28, 28])
+    assert feat[2].shape == torch.Size([1, 1024, 14, 14])
+    assert feat[3].shape == torch.Size([1, 2048, 7, 7])
+
+    # Test ResNeXt with group 32, width_per_group 4 and layers 3 out forward
+    model = ResNeXt(depth=50, groups=32, width_per_group=4, out_indices=(3, ))
+    for m in model.modules():
+        if isinstance(m, BottleneckX):
+            assert m.conv2.groups == 32
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert feat.shape == torch.Size([1, 2048, 7, 7])
diff --git a/tests/test_backbones/test_seresnet.py b/tests/test_backbones/test_seresnet.py
new file mode 100644
index 0000000000..2f867674f6
--- /dev/null
+++ b/tests/test_backbones/test_seresnet.py
@@ -0,0 +1,242 @@
+import pytest
+import torch
+from torch.nn.modules import AvgPool2d
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmpose.models.backbones import SEResNet
+from mmpose.models.backbones.resnet import ResLayer
+from mmpose.models.backbones.seresnet import SEBottleneck, SELayer
+
+
+def all_zeros(modules):
+    """Check if the weight (and bias) is all zero."""
+    weight_zero = torch.equal(modules.weight.data,
+                              torch.zeros_like(modules.weight.data))
+    if hasattr(modules, 'bias'):
+        bias_zero = torch.equal(modules.bias.data,
+                                torch.zeros_like(modules.bias.data))
+    else:
+        bias_zero = True
+
+    return weight_zero and bias_zero
+
+
+def check_norm_state(modules, train_state):
+    """Check if norm layer is in correct train state."""
+    for mod in modules:
+        if isinstance(mod, _BatchNorm):
+            if mod.training != train_state:
+                return False
+    return True
+
+
+def test_selayer():
+    # Test selayer forward
+    layer = SELayer(64)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test selayer forward with different ratio
+    layer = SELayer(64, ratio=8)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_bottleneck():
+
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        SEBottleneck(64, 64, style='tensorflow')
+
+    # Test SEBottleneck with checkpoint forward
+    block = SEBottleneck(64, 64, with_cp=True)
+    assert block.with_cp
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test Bottleneck style
+    block = SEBottleneck(64, 256, stride=2, style='pytorch')
+    assert block.conv1.stride == (1, 1)
+    assert block.conv2.stride == (2, 2)
+    block = SEBottleneck(64, 256, stride=2, style='caffe')
+    assert block.conv1.stride == (2, 2)
+    assert block.conv2.stride == (1, 1)
+
+    # Test Bottleneck forward
+    block = SEBottleneck(64, 64)
+    x = torch.randn(1, 64, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+
+def test_res_layer():
+    # Test ResLayer of 3 SEBottleneck w/o downsample
+    layer = ResLayer(SEBottleneck, 3, 64, 64, se_ratio=16)
+    assert len(layer) == 3
+    assert layer[0].conv1.in_channels == 64
+    assert layer[0].conv1.out_channels == 16
+    for i in range(1, len(layer)):
+        assert layer[i].conv1.in_channels == 64
+        assert layer[i].conv1.out_channels == 16
+    for i in range(len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 64, 56, 56])
+
+    # Test ResLayer of 3 SEBottleneck with downsample
+    layer = ResLayer(SEBottleneck, 3, 64, 256, se_ratio=16)
+    assert layer[0].downsample[0].out_channels == 256
+    for i in range(1, len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 256, 56, 56])
+
+    # Test ResLayer of 3 SEBottleneck with stride=2
+    layer = ResLayer(SEBottleneck, 3, 64, 256, stride=2, se_ratio=8)
+    assert layer[0].downsample[0].out_channels == 256
+    assert layer[0].downsample[0].stride == (2, 2)
+    for i in range(1, len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 256, 28, 28])
+
+    # Test ResLayer of 3 SEBottleneck with stride=2 and average downsample
+    layer = ResLayer(
+        SEBottleneck, 3, 64, 256, stride=2, avg_down=True, se_ratio=8)
+    assert isinstance(layer[0].downsample[0], AvgPool2d)
+    assert layer[0].downsample[1].out_channels == 256
+    assert layer[0].downsample[1].stride == (1, 1)
+    for i in range(1, len(layer)):
+        assert layer[i].downsample is None
+    x = torch.randn(1, 64, 56, 56)
+    x_out = layer(x)
+    assert x_out.shape == torch.Size([1, 256, 28, 28])
+
+
+def test_seresnet():
+    """Test SEResNet backbone."""
+    with pytest.raises(KeyError):
+        # SEResNet depth should be in [50, 101, 152]
+        SEResNet(20)
+
+    with pytest.raises(AssertionError):
+        # In SEResNet: 1 <= num_stages <= 4
+        SEResNet(50, num_stages=0)
+
+    with pytest.raises(AssertionError):
+        # In SEResNet: 1 <= num_stages <= 4
+        SEResNet(50, num_stages=5)
+
+    with pytest.raises(AssertionError):
+        # len(strides) == len(dilations) == num_stages
+        SEResNet(50, strides=(1, ), dilations=(1, 1), num_stages=3)
+
+    with pytest.raises(TypeError):
+        # pretrained must be a string path
+        model = SEResNet(50)
+        model.init_weights(pretrained=0)
+
+    with pytest.raises(AssertionError):
+        # Style must be in ['pytorch', 'caffe']
+        SEResNet(50, style='tensorflow')
+
+    # Test SEResNet50 norm_eval=True
+    model = SEResNet(50, norm_eval=True)
+    model.init_weights()
+    model.train()
+    assert check_norm_state(model.modules(), False)
+
+    # Test SEResNet50 with torchvision pretrained weight
+    model = SEResNet(depth=50, norm_eval=True)
+    model.init_weights('torchvision://resnet50')
+    model.train()
+    assert check_norm_state(model.modules(), False)
+
+    # Test SEResNet50 with first stage frozen
+    frozen_stages = 1
+    model = SEResNet(50, frozen_stages=frozen_stages)
+    model.init_weights()
+    model.train()
+    assert model.norm1.training is False
+    for layer in [model.conv1, model.norm1]:
+        for param in layer.parameters():
+            assert param.requires_grad is False
+    for i in range(1, frozen_stages + 1):
+        layer = getattr(model, f'layer{i}')
+        for mod in layer.modules():
+            if isinstance(mod, _BatchNorm):
+                assert mod.training is False
+        for param in layer.parameters():
+            assert param.requires_grad is False
+
+    # Test SEResNet50 with BatchNorm forward
+    model = SEResNet(50, out_indices=(0, 1, 2, 3))
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size([1, 256, 56, 56])
+    assert feat[1].shape == torch.Size([1, 512, 28, 28])
+    assert feat[2].shape == torch.Size([1, 1024, 14, 14])
+    assert feat[3].shape == torch.Size([1, 2048, 7, 7])
+
+    # Test SEResNet50 with layers 1, 2, 3 out forward
+    model = SEResNet(50, out_indices=(0, 1, 2))
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 3
+    assert feat[0].shape == torch.Size([1, 256, 56, 56])
+    assert
feat[1].shape == torch.Size([1, 512, 28, 28]) + assert feat[2].shape == torch.Size([1, 1024, 14, 14]) + + # Test SEResNet50 with layers 3 (top feature maps) out forward + model = SEResNet(50, out_indices=(3, )) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat.shape == torch.Size([1, 2048, 7, 7]) + + # Test SEResNet50 with checkpoint forward + model = SEResNet(50, out_indices=(0, 1, 2, 3), with_cp=True) + for m in model.modules(): + if isinstance(m, SEBottleneck): + assert m.with_cp + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 256, 56, 56]) + assert feat[1].shape == torch.Size([1, 512, 28, 28]) + assert feat[2].shape == torch.Size([1, 1024, 14, 14]) + assert feat[3].shape == torch.Size([1, 2048, 7, 7]) + + # Test SEResNet50 zero initialization of residual + model = SEResNet(50, out_indices=(0, 1, 2, 3), zero_init_residual=True) + model.init_weights() + for m in model.modules(): + if isinstance(m, SEBottleneck): + assert all_zeros(m.norm3) + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 256, 56, 56]) + assert feat[1].shape == torch.Size([1, 512, 28, 28]) + assert feat[2].shape == torch.Size([1, 1024, 14, 14]) + assert feat[3].shape == torch.Size([1, 2048, 7, 7]) diff --git a/tests/test_backbones/test_seresnext.py b/tests/test_backbones/test_seresnext.py new file mode 100644 index 0000000000..a0c690546e --- /dev/null +++ b/tests/test_backbones/test_seresnext.py @@ -0,0 +1,72 @@ +import pytest +import torch + +from mmpose.models.backbones import SEResNeXt +from mmpose.models.backbones.seresnext import SEBottleneck as SEBottleneckX + + +def test_bottleneck(): + with pytest.raises(AssertionError): + # Style must be in ['pytorch', 'caffe'] + SEBottleneckX(64, 64, groups=32, width_per_group=4, style='tensorflow') + + # Test SEResNeXt Bottleneck structure + block = SEBottleneckX( + 64, 256, groups=32, width_per_group=4, stride=2, style='pytorch') + assert block.width_per_group == 4 + assert block.conv2.stride == (2, 2) + assert block.conv2.groups == 32 + assert block.conv2.out_channels == 128 + assert block.conv2.out_channels == block.mid_channels + + # Test SEResNeXt Bottleneck structure (groups=1) + block = SEBottleneckX( + 64, 256, groups=1, width_per_group=4, stride=2, style='pytorch') + assert block.conv2.stride == (2, 2) + assert block.conv2.groups == 1 + assert block.conv2.out_channels == 64 + assert block.mid_channels == 64 + assert block.conv2.out_channels == block.mid_channels + + # Test SEResNeXt Bottleneck forward + block = SEBottleneckX( + 64, 64, base_channels=16, groups=32, width_per_group=4) + x = torch.randn(1, 64, 56, 56) + x_out = block(x) + assert x_out.shape == torch.Size([1, 64, 56, 56]) + + +def test_seresnext(): + with pytest.raises(KeyError): + # SEResNeXt depth should be in [50, 101, 152] + SEResNeXt(depth=18) + + # Test SEResNeXt with group 32, width_per_group 4 + model = SEResNeXt( + depth=50, groups=32, width_per_group=4, out_indices=(0, 1, 2, 3)) + for m in model.modules(): + if isinstance(m, SEBottleneckX): + assert m.conv2.groups == 32 + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 256, 56, 56]) + assert feat[1].shape == torch.Size([1, 512, 28, 28]) + assert feat[2].shape == 
torch.Size([1, 1024, 14, 14])
+    assert feat[3].shape == torch.Size([1, 2048, 7, 7])
+
+    # Test SEResNeXt with group 32, width_per_group 4 and layers 3 out forward
+    model = SEResNeXt(
+        depth=50, groups=32, width_per_group=4, out_indices=(3, ))
+    for m in model.modules():
+        if isinstance(m, SEBottleneckX):
+            assert m.conv2.groups == 32
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert feat.shape == torch.Size([1, 2048, 7, 7])
diff --git a/tests/test_backbones/test_shufflenet_v1.py b/tests/test_backbones/test_shufflenet_v1.py
new file mode 100644
index 0000000000..70887f8431
--- /dev/null
+++ b/tests/test_backbones/test_shufflenet_v1.py
@@ -0,0 +1,244 @@
+import pytest
+import torch
+from torch.nn.modules import GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmpose.models.backbones import ShuffleNetV1
+from mmpose.models.backbones.shufflenet_v1 import ShuffleUnit
+
+
+def is_block(modules):
+    """Check if is ShuffleNet building block."""
+    if isinstance(modules, (ShuffleUnit, )):
+        return True
+    return False
+
+
+def is_norm(modules):
+    """Check if is one of the norms."""
+    if isinstance(modules, (GroupNorm, _BatchNorm)):
+        return True
+    return False
+
+
+def check_norm_state(modules, train_state):
+    """Check if norm layer is in correct train state."""
+    for mod in modules:
+        if isinstance(mod, _BatchNorm):
+            if mod.training != train_state:
+                return False
+    return True
+
+
+def test_shufflenetv1_shuffleunit():
+
+    with pytest.raises(ValueError):
+        # combine must be in ['add', 'concat']
+        ShuffleUnit(24, 16, groups=3, first_block=True, combine='test')
+
+    with pytest.raises(AssertionError):
+        # inplanes must be equal to outplanes when combine='add'
+        ShuffleUnit(64, 24, groups=4, first_block=True, combine='add')
+
+    # Test ShuffleUnit with combine='add'
+    block = ShuffleUnit(24, 24, groups=3, first_block=True, combine='add')
+    x = torch.randn(1, 24, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size((1, 24, 56, 56))
+
+    # Test ShuffleUnit with combine='concat'
+    block = ShuffleUnit(24, 240, groups=3, first_block=True, combine='concat')
+    x = torch.randn(1, 24, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size((1, 240, 28, 28))
+
+    # Test ShuffleUnit with checkpoint forward
+    block = ShuffleUnit(
+        24, 24, groups=3, first_block=True, combine='add', with_cp=True)
+    assert block.with_cp
+    x = torch.randn(1, 24, 56, 56)
+    x.requires_grad = True
+    x_out = block(x)
+    assert x_out.shape == torch.Size((1, 24, 56, 56))
+
+
+def test_shufflenetv1_backbone():
+
+    with pytest.raises(ValueError):
+        # frozen_stages must be in range(-1, 4)
+        ShuffleNetV1(frozen_stages=10)
+
+    with pytest.raises(ValueError):
+        # the item in out_indices must be in range(0, 3)
+        ShuffleNetV1(out_indices=[5])
+
+    with pytest.raises(ValueError):
+        # groups must be in [1, 2, 3, 4, 8]
+        ShuffleNetV1(groups=10)
+
+    with pytest.raises(TypeError):
+        # pretrained must be str or None
+        model = ShuffleNetV1()
+        model.init_weights(pretrained=1)
+
+    # Test ShuffleNetV1 norm state
+    model = ShuffleNetV1()
+    model.init_weights()
+    model.train()
+    assert check_norm_state(model.modules(), True)
+
+    # Test ShuffleNetV1 with first stage frozen
+    frozen_stages = 1
+    model = ShuffleNetV1(frozen_stages=frozen_stages, out_indices=(0, 1, 2))
+    model.init_weights()
+    model.train()
+    for param in model.conv1.parameters():
+        assert param.requires_grad is False
+    for i in range(frozen_stages):
+        layer = model.layers[i]
+        for mod in layer.modules():
+            if
isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test ShuffleNetV1 forward with groups=1 + model = ShuffleNetV1(groups=1, out_indices=(0, 1, 2)) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 144, 28, 28)) + assert feat[1].shape == torch.Size((1, 288, 14, 14)) + assert feat[2].shape == torch.Size((1, 576, 7, 7)) + + # Test ShuffleNetV1 forward with groups=2 + model = ShuffleNetV1(groups=2, out_indices=(0, 1, 2)) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 200, 28, 28)) + assert feat[1].shape == torch.Size((1, 400, 14, 14)) + assert feat[2].shape == torch.Size((1, 800, 7, 7)) + + # Test ShuffleNetV1 forward with groups=3 + model = ShuffleNetV1(groups=3, out_indices=(0, 1, 2)) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 240, 28, 28)) + assert feat[1].shape == torch.Size((1, 480, 14, 14)) + assert feat[2].shape == torch.Size((1, 960, 7, 7)) + + # Test ShuffleNetV1 forward with groups=4 + model = ShuffleNetV1(groups=4, out_indices=(0, 1, 2)) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 272, 28, 28)) + assert feat[1].shape == torch.Size((1, 544, 14, 14)) + assert feat[2].shape == torch.Size((1, 1088, 7, 7)) + + # Test ShuffleNetV1 forward with groups=8 + model = ShuffleNetV1(groups=8, out_indices=(0, 1, 2)) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 384, 28, 28)) + assert feat[1].shape == torch.Size((1, 768, 14, 14)) + assert feat[2].shape == torch.Size((1, 1536, 7, 7)) + + # Test ShuffleNetV1 forward with GroupNorm forward + model = ShuffleNetV1( + groups=3, + norm_cfg=dict(type='GN', num_groups=2, requires_grad=True), + out_indices=(0, 1, 2)) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 240, 28, 28)) + assert feat[1].shape == torch.Size((1, 480, 14, 14)) + assert feat[2].shape == torch.Size((1, 960, 7, 7)) + + # Test ShuffleNetV1 forward with layers 1, 2 forward + model = ShuffleNetV1(groups=3, out_indices=(1, 2)) + model.init_weights() + model.train() + + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 2 + assert feat[0].shape == torch.Size((1, 480, 14, 14)) + assert feat[1].shape == torch.Size((1, 960, 7, 7)) + + # Test ShuffleNetV1 forward with layers 2 forward + model = ShuffleNetV1(groups=3, 
out_indices=(2, ))
+    model.init_weights()
+    model.train()
+
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert isinstance(feat, torch.Tensor)
+    assert feat.shape == torch.Size((1, 960, 7, 7))
+
+    # Test ShuffleNetV1 with checkpoint forward
+    model = ShuffleNetV1(groups=3, with_cp=True)
+    for m in model.modules():
+        if is_block(m):
+            assert m.with_cp
+
+    # Test ShuffleNetV1 with norm_eval
+    model = ShuffleNetV1(norm_eval=True)
+    model.init_weights()
+    model.train()
+
+    assert check_norm_state(model.modules(), False)
diff --git a/tests/test_backbones/test_shufflenet_v2.py b/tests/test_backbones/test_shufflenet_v2.py
new file mode 100644
index 0000000000..e95a846af8
--- /dev/null
+++ b/tests/test_backbones/test_shufflenet_v2.py
@@ -0,0 +1,203 @@
+import pytest
+import torch
+from torch.nn.modules import GroupNorm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmpose.models.backbones import ShuffleNetV2
+from mmpose.models.backbones.shufflenet_v2 import InvertedResidual
+
+
+def is_block(modules):
+    """Check if is ShuffleNetV2 building block."""
+    if isinstance(modules, (InvertedResidual, )):
+        return True
+    return False
+
+
+def is_norm(modules):
+    """Check if is one of the norms."""
+    if isinstance(modules, (GroupNorm, _BatchNorm)):
+        return True
+    return False
+
+
+def check_norm_state(modules, train_state):
+    """Check if norm layer is in correct train state."""
+    for mod in modules:
+        if isinstance(mod, _BatchNorm):
+            if mod.training != train_state:
+                return False
+    return True
+
+
+def test_shufflenetv2_invertedresidual():
+
+    with pytest.raises(AssertionError):
+        # when stride==1, in_channels should be equal to out_channels // 2 * 2
+        InvertedResidual(24, 32, stride=1)
+
+    with pytest.raises(AssertionError):
+        # when in_channels != out_channels // 2 * 2, stride should not be
+        # equal to 1.
+        InvertedResidual(24, 32, stride=1)
+
+    # Test InvertedResidual forward
+    block = InvertedResidual(24, 48, stride=2)
+    x = torch.randn(1, 24, 56, 56)
+    x_out = block(x)
+    assert x_out.shape == torch.Size((1, 48, 28, 28))
+
+    # Test InvertedResidual with checkpoint forward
+    block = InvertedResidual(48, 48, stride=1, with_cp=True)
+    assert block.with_cp
+    x = torch.randn(1, 48, 56, 56)
+    x.requires_grad = True
+    x_out = block(x)
+    assert x_out.shape == torch.Size((1, 48, 56, 56))
+
+
+def test_shufflenetv2_backbone():
+
+    with pytest.raises(ValueError):
+        # widen_factor must be in [0.5, 1.0, 1.5, 2.0]
+        ShuffleNetV2(widen_factor=3.0)
+
+    with pytest.raises(ValueError):
+        # frozen_stages must be in [0, 1, 2, 3]
+        ShuffleNetV2(widen_factor=1.0, frozen_stages=4)
+
+    with pytest.raises(ValueError):
+        # out_indices must be in [0, 1, 2, 3]
+        ShuffleNetV2(widen_factor=1.0, out_indices=(4, ))
+
+    with pytest.raises(TypeError):
+        # pretrained must be str or None
+        model = ShuffleNetV2()
+        model.init_weights(pretrained=1)
+
+    # Test ShuffleNetV2 norm state
+    model = ShuffleNetV2()
+    model.init_weights()
+    model.train()
+    assert check_norm_state(model.modules(), True)
+
+    # Test ShuffleNetV2 with first stage frozen
+    frozen_stages = 1
+    model = ShuffleNetV2(frozen_stages=frozen_stages)
+    model.init_weights()
+    model.train()
+    for param in model.conv1.parameters():
+        assert param.requires_grad is False
+    for i in range(0, frozen_stages):
+        layer = model.layers[i]
+        for mod in layer.modules():
+            if isinstance(mod, _BatchNorm):
+                assert mod.training is False
+        for param in layer.parameters():
+            assert param.requires_grad is False
+
+    # Test ShuffleNetV2 with norm_eval
+    model = ShuffleNetV2(norm_eval=True)
+    model.init_weights()
+    model.train()
+
+    assert check_norm_state(model.modules(), False)
+
+    # Test ShuffleNetV2 forward with widen_factor=0.5
+    model = ShuffleNetV2(widen_factor=0.5, out_indices=(0, 1, 2, 3))
+    model.init_weights()
+    model.train()
+
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size((1, 48, 28, 28))
+    assert feat[1].shape == torch.Size((1, 96, 14, 14))
+    assert feat[2].shape == torch.Size((1, 192, 7, 7))
+
+    # Test ShuffleNetV2 forward with widen_factor=1.0
+    model = ShuffleNetV2(widen_factor=1.0, out_indices=(0, 1, 2, 3))
+    model.init_weights()
+    model.train()
+
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size((1, 116, 28, 28))
+    assert feat[1].shape == torch.Size((1, 232, 14, 14))
+    assert feat[2].shape == torch.Size((1, 464, 7, 7))
+
+    # Test ShuffleNetV2 forward with widen_factor=1.5
+    model = ShuffleNetV2(widen_factor=1.5, out_indices=(0, 1, 2, 3))
+    model.init_weights()
+    model.train()
+
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size((1, 176, 28, 28))
+    assert feat[1].shape == torch.Size((1, 352, 14, 14))
+    assert feat[2].shape == torch.Size((1, 704, 7, 7))
+
+    # Test ShuffleNetV2 forward with widen_factor=2.0
+    model = ShuffleNetV2(widen_factor=2.0, out_indices=(0, 1, 2, 3))
+    model.init_weights()
+    model.train()
+
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+
+    imgs = torch.randn(1, 3, 224, 224)
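+    # Added note (an assumption, not asserted in the original patch): in
+    # these four-stage checks feat[3] is the final 1x1 conv output, whose
+    # width in the ShuffleNetV2 design is 1024 for widen_factor 0.5/1.0/1.5
+    # and 2048 for 2.0.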
+    feat = model(imgs)
+    assert len(feat) == 4
+    assert feat[0].shape == torch.Size((1, 244, 28, 28))
+    assert feat[1].shape == torch.Size((1, 488, 14, 14))
+    assert feat[2].shape == torch.Size((1, 976, 7, 7))
+
+    # Test ShuffleNetV2 forward with layers 3 forward
+    model = ShuffleNetV2(widen_factor=1.0, out_indices=(2, ))
+    model.init_weights()
+    model.train()
+
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert isinstance(feat, torch.Tensor)
+    assert feat.shape == torch.Size((1, 464, 7, 7))
+
+    # Test ShuffleNetV2 forward with layers 1, 2 forward
+    model = ShuffleNetV2(widen_factor=1.0, out_indices=(1, 2))
+    model.init_weights()
+    model.train()
+
+    for m in model.modules():
+        if is_norm(m):
+            assert isinstance(m, _BatchNorm)
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert len(feat) == 2
+    assert feat[0].shape == torch.Size((1, 232, 14, 14))
+    assert feat[1].shape == torch.Size((1, 464, 7, 7))
+
+    # Test ShuffleNetV2 forward with checkpoint forward
+    model = ShuffleNetV2(widen_factor=1.0, with_cp=True)
+    for m in model.modules():
+        if is_block(m):
+            assert m.with_cp
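+
+    # Sketch of a forward check for the checkpointed model (an addition to
+    # the original patch, assuming the default out_indices select only the
+    # final 1024-channel conv output for widen_factor=1.0):
+    model.init_weights()
+    model.train()
+
+    imgs = torch.randn(1, 3, 224, 224)
+    feat = model(imgs)
+    assert feat.shape == torch.Size((1, 1024, 7, 7))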