Skip to content

Commit 3a3e10a

Browse files
[Feature] Support Imgaug for augmentations in the data pipeline. (open-mmlab#492)
* imgaug first commit.
* update changelog
* add unittest & fix a few bugs
* add imgaug in optional.txt
* add docs & add iaa.Augmenter as input & add unittest
* improve codecov
* fix
* fix __repr__
* fix changelog
* fix docs/typo/class name, etc.
* add modality assert for imgaug
* remove iaa.Rotate sample
* 1. fix multi-gpu bug
  2. add tsn/i3d demo config
  3. add assert for in&out dtype
  4. update docs
1 parent 910d2fb commit 3a3e10a

File tree

7 files changed

+572
-3
lines changed

7 files changed

+572
-3
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# model settings
2+
model = dict(
3+
type='Recognizer3D',
4+
backbone=dict(
5+
type='ResNet3d',
6+
pretrained2d=True,
7+
pretrained='torchvision://resnet50',
8+
depth=50,
9+
conv_cfg=dict(type='Conv3d'),
10+
norm_eval=False,
11+
inflate=((1, 1, 1), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 1, 0)),
12+
zero_init_residual=False),
13+
cls_head=dict(
14+
type='I3DHead',
15+
num_classes=400,
16+
in_channels=2048,
17+
spatial_type='avg',
18+
dropout_ratio=0.5,
19+
init_std=0.01))
20+
# model training and testing settings
21+
train_cfg = None
22+
test_cfg = dict(average_clips='prob')
23+
# dataset settings
24+
dataset_type = 'VideoDataset'
25+
data_root = 'data/kinetics400/videos_train'
26+
data_root_val = 'data/kinetics400/videos_val'
27+
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
28+
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
29+
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
30+
img_norm_cfg = dict(
31+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
32+
train_pipeline = [
33+
dict(type='DecordInit'),
34+
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
35+
dict(type='DecordDecode'),
36+
dict(type='Resize', scale=(-1, 256)),
37+
dict(
38+
type='MultiScaleCrop',
39+
input_size=224,
40+
scales=(1, 0.8),
41+
random_crop=False,
42+
max_wh_scale_gap=0),
43+
dict(type='Resize', scale=(224, 224), keep_ratio=False),
44+
dict(
45+
type='Imgaug',
46+
transforms=[
47+
dict(type='Fliplr', p=0.5),
48+
dict(type='Rotate', rotate=(-20, 20)),
49+
dict(type='Dropout', p=(0, 0.05))
50+
]),
51+
# dict(type='Imgaug', transforms='default'),
52+
dict(type='Normalize', **img_norm_cfg),
53+
dict(type='FormatShape', input_format='NCTHW'),
54+
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
55+
dict(type='ToTensor', keys=['imgs', 'label'])
56+
]
57+
val_pipeline = [
58+
dict(type='DecordInit'),
59+
dict(
60+
type='SampleFrames',
61+
clip_len=32,
62+
frame_interval=2,
63+
num_clips=1,
64+
test_mode=True),
65+
dict(type='DecordDecode'),
66+
dict(type='Resize', scale=(-1, 256)),
67+
dict(type='CenterCrop', crop_size=224),
68+
dict(type='Flip', flip_ratio=0),
69+
dict(type='Normalize', **img_norm_cfg),
70+
dict(type='FormatShape', input_format='NCTHW'),
71+
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
72+
dict(type='ToTensor', keys=['imgs'])
73+
]
74+
test_pipeline = [
75+
dict(type='DecordInit'),
76+
dict(
77+
type='SampleFrames',
78+
clip_len=32,
79+
frame_interval=2,
80+
num_clips=10,
81+
test_mode=True),
82+
dict(type='DecordDecode'),
83+
dict(type='Resize', scale=(-1, 256)),
84+
dict(type='ThreeCrop', crop_size=256),
85+
dict(type='Flip', flip_ratio=0),
86+
dict(type='Normalize', **img_norm_cfg),
87+
dict(type='FormatShape', input_format='NCTHW'),
88+
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
89+
dict(type='ToTensor', keys=['imgs'])
90+
]
91+
data = dict(
92+
videos_per_gpu=8,
93+
workers_per_gpu=4,
94+
train=dict(
95+
type=dataset_type,
96+
ann_file=ann_file_train,
97+
data_prefix=data_root,
98+
pipeline=train_pipeline),
99+
val=dict(
100+
type=dataset_type,
101+
ann_file=ann_file_val,
102+
data_prefix=data_root_val,
103+
pipeline=val_pipeline),
104+
test=dict(
105+
type=dataset_type,
106+
ann_file=ann_file_val,
107+
data_prefix=data_root_val,
108+
pipeline=test_pipeline))
109+
# optimizer
110+
optimizer = dict(
111+
type='SGD', lr=0.01, momentum=0.9,
112+
weight_decay=0.0001) # this lr is used for 8 gpus
113+
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
114+
# learning policy
115+
lr_config = dict(policy='step', step=[40, 80])
116+
total_epochs = 100
117+
checkpoint_config = dict(interval=5)
118+
evaluation = dict(
119+
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
120+
log_config = dict(
121+
interval=20,
122+
hooks=[
123+
dict(type='TextLoggerHook'),
124+
# dict(type='TensorboardLoggerHook'),
125+
])
126+
# runtime settings
127+
dist_params = dict(backend='nccl')
128+
log_level = 'INFO'
129+
work_dir = './work_dirs/i3d_r50_video_3d_32x2x1_100e_kinetics400_rgb/'
130+
load_from = None
131+
resume_from = None
132+
workflow = [('train', 1)]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# model settings
2+
model = dict(
3+
type='Recognizer2D',
4+
backbone=dict(
5+
type='ResNet',
6+
pretrained='torchvision://resnet50',
7+
depth=50,
8+
norm_eval=False),
9+
cls_head=dict(
10+
type='TSNHead',
11+
num_classes=400,
12+
in_channels=2048,
13+
spatial_type='avg',
14+
consensus=dict(type='AvgConsensus', dim=1),
15+
dropout_ratio=0.4,
16+
init_std=0.01))
17+
# model training and testing settings
18+
train_cfg = None
19+
test_cfg = dict(average_clips=None)
20+
# dataset settings
21+
dataset_type = 'VideoDataset'
22+
data_root = 'data/kinetics400/videos_train'
23+
data_root_val = 'data/kinetics400/videos_val'
24+
ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
25+
ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
26+
ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
27+
img_norm_cfg = dict(
28+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
29+
train_pipeline = [
30+
dict(type='DecordInit'),
31+
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
32+
dict(type='DecordDecode'),
33+
dict(
34+
type='MultiScaleCrop',
35+
input_size=224,
36+
scales=(1, 0.875, 0.75, 0.66),
37+
random_crop=False,
38+
max_wh_scale_gap=1),
39+
dict(type='Resize', scale=(224, 224), keep_ratio=False),
40+
dict(type='Flip', flip_ratio=0.5),
41+
dict(type='Imgaug', transforms='default'),
42+
# dict(
43+
# type='Imgaug',
44+
# transforms=[
45+
# dict(type='Rotate', rotate=(-20, 20)),
46+
# dict(type='Dropout', p=(0, 0.05))
47+
# ]),
48+
dict(type='Normalize', **img_norm_cfg),
49+
dict(type='FormatShape', input_format='NCHW'),
50+
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
51+
dict(type='ToTensor', keys=['imgs', 'label'])
52+
]
53+
val_pipeline = [
54+
dict(type='DecordInit'),
55+
dict(
56+
type='SampleFrames',
57+
clip_len=1,
58+
frame_interval=1,
59+
num_clips=8,
60+
test_mode=True),
61+
dict(type='DecordDecode'),
62+
dict(type='Resize', scale=(-1, 256)),
63+
dict(type='CenterCrop', crop_size=224),
64+
dict(type='Flip', flip_ratio=0),
65+
dict(type='Normalize', **img_norm_cfg),
66+
dict(type='FormatShape', input_format='NCHW'),
67+
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
68+
dict(type='ToTensor', keys=['imgs'])
69+
]
70+
test_pipeline = [
71+
dict(type='DecordInit'),
72+
dict(
73+
type='SampleFrames',
74+
clip_len=1,
75+
frame_interval=1,
76+
num_clips=25,
77+
test_mode=True),
78+
dict(type='DecordDecode'),
79+
dict(type='Resize', scale=(-1, 256)),
80+
dict(type='ThreeCrop', crop_size=256),
81+
dict(type='Flip', flip_ratio=0),
82+
dict(type='Normalize', **img_norm_cfg),
83+
dict(type='FormatShape', input_format='NCHW'),
84+
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
85+
dict(type='ToTensor', keys=['imgs'])
86+
]
87+
data = dict(
88+
videos_per_gpu=32,
89+
workers_per_gpu=4,
90+
train=dict(
91+
type=dataset_type,
92+
ann_file=ann_file_train,
93+
data_prefix=data_root,
94+
pipeline=train_pipeline),
95+
val=dict(
96+
type=dataset_type,
97+
ann_file=ann_file_val,
98+
data_prefix=data_root_val,
99+
pipeline=val_pipeline),
100+
test=dict(
101+
type=dataset_type,
102+
ann_file=ann_file_test,
103+
data_prefix=data_root_val,
104+
pipeline=test_pipeline))
105+
# optimizer
106+
optimizer = dict(
107+
type='SGD', lr=0.01, momentum=0.9,
108+
weight_decay=0.0001) # this lr is used for 8 gpus
109+
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
110+
# learning policy
111+
lr_config = dict(policy='step', step=[40, 80])
112+
total_epochs = 100
113+
checkpoint_config = dict(interval=1)
114+
evaluation = dict(
115+
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'])
116+
log_config = dict(
117+
interval=20,
118+
hooks=[
119+
dict(type='TextLoggerHook'),
120+
# dict(type='TensorboardLoggerHook'),
121+
])
122+
# runtime settings
123+
dist_params = dict(backend='nccl')
124+
log_level = 'INFO'
125+
work_dir = './work_dirs/tsn_r50_video_1x1x8_100e_kinetics400_rgb/'
126+
load_from = None
127+
resume_from = None
128+
workflow = [('train', 1)]

docs/changelog.md

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
**New Features**
88

9+
- Support [imgaug](https://imgaug.readthedocs.io/en/latest/index.html) for augmentations in the data pipeline ([#492](https://github.com/open-mmlab/mmaction2/pull/492))
10+
911
**Improvements**
1012

1113
- Support setting `max_testing_views` for extremely large models to save GPU memory used ([#511](https://github.com/open-mmlab/mmaction2/pull/511))

mmaction/datasets/pipelines/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .augmentations import (AudioAmplify, CenterCrop, ColorJitter,
22
EntityBoxCrop, EntityBoxFlip, EntityBoxRescale,
3-
Flip, Fuse, MelSpectrogram, MultiGroupCrop,
3+
Flip, Fuse, Imgaug, MelSpectrogram, MultiGroupCrop,
44
MultiScaleCrop, Normalize, RandomCrop,
55
RandomRescale, RandomResizedCrop, RandomScale,
66
Resize, TenCrop, ThreeCrop)
@@ -31,5 +31,5 @@
3131
'FormatAudioShape', 'LoadAudioFeature', 'AudioFeatureSelector',
3232
'AudioDecodeInit', 'EntityBoxFlip', 'EntityBoxCrop', 'EntityBoxRescale',
3333
'RandomScale', 'ImageDecode', 'BuildPseudoClip', 'RandomRescale',
34-
'PyAVDecodeMotionVector', 'Rename'
34+
'PyAVDecodeMotionVector', 'Rename', 'Imgaug'
3535
]

0 commit comments

Comments
 (0)