From b7b40a1557701688ae7829a7d1d6b7f28e9271a0 Mon Sep 17 00:00:00 2001 From: su Date: Fri, 26 Feb 2021 13:52:16 +0800 Subject: [PATCH] Update big models (#605) * Add configs and some stats. * Update. * Update r101 8x8. * Minor. * Update url. * Update changelog. * Fix typos and workdir. --- configs/recognition/slowfast/README.md | 3 + ...lowfast_r101_8x8x1_256e_kinetics400_rgb.py | 137 ++++++++++++++++++ ...st_r101_r50_4x16x1_256e_kinetics400_rgb.py | 136 +++++++++++++++++ ...st_r152_r50_4x16x1_256e_kinetics400_rgb.py | 136 +++++++++++++++++ docs/changelog.md | 1 + 5 files changed, 413 insertions(+) create mode 100644 configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py create mode 100644 configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py create mode 100644 configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md index 1ed8080863..ba00f13269 100644 --- a/configs/recognition/slowfast/README.md +++ b/configs/recognition/slowfast/README.md @@ -25,6 +25,9 @@ |[slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) |short-side 320|8x3| ResNet50|None |75.64|92.3|1.6 ((32+4)x10x3 frames)|6203|[ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth)| [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log)| [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log.json)| |[slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) |short-side 256|8x4| ResNet50 |None |75.61|92.34|x|9062|[ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb_20200810-863812c2.pth)|[log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb/20200731_151537.log)|[json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb/20200731_151537.log.json)| |[slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) |short-side 320|8x3| ResNet50 |None|76.94|92.8|1.3 ((32+8)x10x3 frames)|9062| [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log)| [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json)| +|[slowfast_r101_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py) |short-side 256|8x1| ResNet101 + ResNet50 |None|76.69|93.07||16628| [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/slowfast_r101_4x16x1_256e_kinetics400_rgb_20210218-d8b58813.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log)| [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_4x16x1_256e_kinetics400_rgb/20210118_133528.log.json)| +|[slowfast_r101_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py) |short-side 256|8x4| ResNet101 |None|77.90|93.51||25994| [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/slowfast_r101_8x8x1_256e_kinetics400_rgb_20210218-0dd54025.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log)| [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb/20210218_121513.log.json)| +|[slowfast_r152_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py) |short-side 256|8x1| ResNet152 + ResNet50 |None|77.13|93.20||10077| [ckpt](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/slowfast_r152_4x16x1_256e_kinetics400_rgb_20210122-bdeb6b87.pth) | [log](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log)| [json](https://download.openmmlab.com/mmaction/recognition/slowfast/slowfast_r152_4x16x1_256e_kinetics400_rgb/20210122_131321.log.json)| Notes: diff --git a/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py b/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py new file mode 100644 index 0000000000..79259e419c --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r101_8x8x1_256e_kinetics400_rgb.py @@ -0,0 +1,137 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowFast', + pretrained=None, + resample_rate=4, # tau + speed_ratio=4, # alpha + channel_ratio=8, # beta_inv + slow_pathway=dict( + type='resnet3d', + depth=101, + pretrained=None, + lateral=True, + fusion_kernel=7, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + fast_pathway=dict( + type='resnet3d', + depth=101, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_eval=False)), + cls_head=dict( + type='SlowFastHead', + in_channels=2304, # 2048+256 + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5)) +train_cfg = None +test_cfg = dict(average_clips='prob') +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', lr=0.1, momentum=0.9, + weight_decay=0.0001) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_by_epoch=True, + warmup_iters=34) +total_epochs = 256 +checkpoint_config = dict(interval=4) +workflow = [('train', 1)] +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) +log_config = dict( + interval=20, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook'), + ]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/slowfast_r101_8x8x1_256e_kinetics400_rgb' +load_from = None +resume_from = None +find_unused_parameters = False diff --git a/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py b/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py new file mode 100644 index 0000000000..a74dc94f26 --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb.py @@ -0,0 +1,136 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowFast', + pretrained=None, + resample_rate=8, # tau + speed_ratio=8, # alpha + channel_ratio=8, # beta_inv + slow_pathway=dict( + type='resnet3d', + depth=101, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_eval=False)), + cls_head=dict( + type='SlowFastHead', + in_channels=2304, # 2048+256 + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5)) +train_cfg = None +test_cfg = dict(average_clips='prob') +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', lr=0.1, momentum=0.9, + weight_decay=0.0001) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_by_epoch=True, + warmup_iters=34) +total_epochs = 256 +checkpoint_config = dict(interval=4) +workflow = [('train', 1)] +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) +log_config = dict( + interval=20, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook'), + ]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/slowfast_r101_r50_4x16x1_256e_kinetics400_rgb' +load_from = None +resume_from = None +find_unused_parameters = False diff --git a/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py b/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py new file mode 100644 index 0000000000..a581bae143 --- /dev/null +++ b/configs/recognition/slowfast/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb.py @@ -0,0 +1,136 @@ +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowFast', + pretrained=None, + resample_rate=8, # tau + speed_ratio=8, # alpha + channel_ratio=8, # beta_inv + slow_pathway=dict( + type='resnet3d', + depth=152, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + norm_eval=False), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + norm_eval=False)), + cls_head=dict( + type='SlowFastHead', + in_channels=2304, # 2048+256 + num_classes=400, + spatial_type='avg', + dropout_ratio=0.5)) +train_cfg = None +test_cfg = dict(average_clips='prob', max_testing_views=8) +dataset_type = 'RawframeDataset' +data_root = 'data/kinetics400/rawframes_train' +data_root_val = 'data/kinetics400/rawframes_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=1, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', lr=0.1, momentum=0.9, + weight_decay=0.0001) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + min_lr=0, + warmup='linear', + warmup_by_epoch=True, + warmup_iters=34) +total_epochs = 256 +checkpoint_config = dict(interval=4) +workflow = [('train', 1)] +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy']) +log_config = dict( + interval=20, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook'), + ]) +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = './work_dirs/slowfast_r152_r50_4x16x1_256e_kinetics400_rgb' +load_from = None +resume_from = None +find_unused_parameters = False diff --git a/docs/changelog.md b/docs/changelog.md index 899a5e6455..a594718e52 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -27,6 +27,7 @@ **ModelZoo** - Add TSM-MobileNetV2 for Kinetics400 ([#415](https://github.com/open-mmlab/mmaction2/pull/415)) +- Add deeper SlowFast models ([#605](https://github.com/open-mmlab/mmaction2/pull/605)) ### 0.11.0 (31/01/2021)