Skip to content

Commit 47b1ca9

Browse files
[Improvement] Update docs about test crops. (open-mmlab#359)
* [Docs] Update docs about test crops. 1. Add more docs. 2. Update default configs in TSM model when using DenseSampleFrames. * [Docs] Update docs about test crops 1. Add more docs. 2. Update default configs in TSM model when using DenseSampleFrames. * calculate num_crops automatically * remove `twice_sample/test_crops` in test_cfg * update all tsm model `test_cfg['average_clips']` default value to 'prob' * add changelog * fix a bug when using tsn and `test_cfg['average_clips']='prob'` * fix docs and add docs for open-mmlab#363 * use `num_segments` instead of `num_segs` in average_clip * use `num_segs` in TSMHead and average_clip.
1 parent b483fe0 commit 47b1ca9

24 files changed

+63
-81
lines changed

configs/recognition/tsm/README.md

+1-2
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ not including the IO time and pre-processing time. For each setting, we use 1 gp
6161
We use the efficient setting as the default in the provided config files; it can be changed to the accurate setting by
6262
```python
6363
...
64-
# `test_cfg = dict(average_clips=None)` for efficient setting
65-
test_cfg = dict(average_clips='prob', test_crops=3, twice_sample=True) # for accurate setting
64+
test_cfg = dict(average_clips='prob')
6665
...
6766
test_pipeline = [
6867
dict(

configs/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
is_shift=True))
2525
# model training and testing settings
2626
train_cfg = None
27-
test_cfg = dict(average_clips=None)
27+
test_cfg = dict(average_clips='prob')
2828
# dataset settings
2929
dataset_type = 'RawframeDataset'
3030
data_root = 'data/kinetics400/rawframes_train'

configs/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
is_shift=True))
2525
# model training and testing settings
2626
train_cfg = None
27-
test_cfg = dict(average_clips=None)
27+
test_cfg = dict(average_clips='prob')
2828
# dataset settings
2929
dataset_type = 'RawframeDataset'
3030
data_root = 'data/kinetics400/rawframes_train'

configs/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
is_shift=True))
2525
# model training and testing settings
2626
train_cfg = None
27-
test_cfg = dict(average_clips=None)
27+
test_cfg = dict(average_clips='prob')
2828
# dataset settings
2929
dataset_type = 'RawframeDataset'
3030
data_root = 'data/kinetics400/rawframes_train'

configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_shift=True))
1919
# model training and testing settings
2020
train_cfg = None
21-
test_cfg = dict(average_clips=None)
21+
test_cfg = dict(average_clips='prob')
2222
# dataset settings
2323
dataset_type = 'RawframeDataset'
2424
data_root = 'data/sthv1/rawframes'

configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_shift=True))
1919
# model training and testing settings
2020
train_cfg = None
21-
test_cfg = dict(average_clips=None)
21+
test_cfg = dict(average_clips='prob')
2222
# dataset settings
2323
dataset_type = 'RawframeDataset'
2424
data_root = 'data/sthv2/rawframes'

configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
is_shift=True))
2121
# model training and testing settings
2222
train_cfg = None
23-
test_cfg = dict(average_clips=None)
23+
test_cfg = dict(average_clips='prob')
2424
# dataset settings
2525
dataset_type = 'RawframeDataset'
2626
data_root = 'data/kinetics400/rawframes_train'

configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
is_shift=True))
2121
# model training and testing settings
2222
train_cfg = None
23-
test_cfg = dict(average_clips=None)
23+
test_cfg = dict(average_clips='prob')
2424
# dataset settings
2525
dataset_type = 'RawframeDataset'
2626
data_root = 'data/sthv1/rawframes'

configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
is_shift=True))
2121
# model training and testing settings
2222
train_cfg = None
23-
test_cfg = dict(average_clips=None)
23+
test_cfg = dict(average_clips='prob')
2424
# dataset settings
2525
dataset_type = 'RawframeDataset'
2626
data_root = 'data/sthv2/rawframes'

configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_shift=True))
1919
# model training and testing settings
2020
train_cfg = None
21-
test_cfg = dict(average_clips=None)
21+
test_cfg = dict(average_clips='prob')
2222
# dataset settings
2323
dataset_type = 'RawframeDataset'
2424
data_root = 'data/kinetics400/rawframes_train'

configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_shift=True))
1919
# model training and testing settings
2020
train_cfg = None
21-
test_cfg = dict(average_clips=None)
21+
test_cfg = dict(average_clips='prob')
2222
# dataset settings
2323
dataset_type = 'RawframeDataset'
2424
data_root = 'data/sthv1/rawframes'

configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_shift=True))
1919
# model training and testing settings
2020
train_cfg = None
21-
test_cfg = dict(average_clips=None)
21+
test_cfg = dict(average_clips='prob')
2222
# dataset settings
2323
dataset_type = 'RawframeDataset'
2424
data_root = 'data/sthv2/rawframes'

configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_shift=True))
1919
# model training and testing settings
2020
train_cfg = None
21-
test_cfg = dict(average_clips=None)
21+
test_cfg = dict(average_clips='prob')
2222
# dataset settings
2323
dataset_type = 'RawframeDataset'
2424
data_root = 'data/kinetics400/rawframes_train'

configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_shift=True))
1919
# model training and testing settings
2020
train_cfg = None
21-
test_cfg = dict(average_clips=None)
21+
test_cfg = dict(average_clips='prob')
2222
# dataset settings
2323
dataset_type = 'VideoDataset'
2424
data_root = 'data/kinetics400/videos_train'

configs/recognition/tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
is_shift=True))
2121
# model training and testing settings
2222
train_cfg = None
23-
test_cfg = dict(average_clips=None)
23+
test_cfg = dict(average_clips='prob')
2424
# dataset settings
2525
dataset_type = 'RawframeDataset'
2626
data_root = 'data/kinetics400/rawframes_train'

demo/README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
### Video demo
1111

12-
We provide a demo script to predict the recognition result using a single video.
12+
We provide a demo script to predict the recognition result using a single video. To get prediction scores in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
1313

1414
```shell
1515
python demo/demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} [--use-frames] \
@@ -157,7 +157,7 @@ or use checkpoint url from `configs/` to directly load corresponding checkpoint,
157157

158158
### Webcam demo
159159

160-
We provide a demo script to implement real-time action recognition from web camera.
160+
We provide a demo script to implement real-time action recognition from a web camera. To get prediction scores in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
161161

162162
```shell
163163
python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${LABEL_FILE} \
@@ -212,7 +212,7 @@ Users can change:
212212
213213
### Long video demo
214214
215-
We provide a demo script to predict different labels using a single long video.
215+
We provide a demo script to predict different labels using a single long video. To get prediction scores in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
216216
217217
```shell
218218
python demo/long_video_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \

docs/changelog.md

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
- Add random seed for building filelists ([#323](https://github.com/open-mmlab/mmaction2/pull/323))
1717
- Move docs about demo to `demo/README.md` ([#329](https://github.com/open-mmlab/mmaction2/pull/329))
1818
- Remove redundant code in `tools/test.py` ([#310](https://github.com/open-mmlab/mmaction2/pull/310))
19+
- Automatically calculate number of test clips for Recognizer2D ([#359](https://github.com/open-mmlab/mmaction2/pull/359))
1920

2021
**Bug Fixes**
2122
- Fix a bug in BaseDataset when `data_prefix` is None ([#314](https://github.com/open-mmlab/mmaction2/pull/314))

mmaction/datasets/pipelines/loading.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,8 @@ class DenseSampleFrames(SampleFrames):
323323
sample_range (int): Total sample range for dense sample.
324324
Default: 64.
325325
num_sample_positions (int): Number of sample start positions, which is
326-
only used in test mode. Default: 10.
326+
only used in test mode. Default: 10. That is to say, by default,
327+
there are at least 10 clips for one input sample in test mode.
327328
temporal_jitter (bool): Whether to apply temporal jittering.
328329
Default: False.
329330
test_mode (bool): Store True when building test or validation dataset.

mmaction/models/heads/tsm_head.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,16 @@ def init_weights(self):
7373
"""Initiate the parameters from scratch."""
7474
normal_init(self.fc_cls, std=self.init_std)
7575

76-
def forward(self, x, num_segments):
76+
def forward(self, x, num_segs):
7777
"""Defines the computation performed at every call.
7878
7979
Args:
8080
x (torch.Tensor): The input data.
81-
num_segments (int): Number of frame segments. Default: 8.
82-
81+
num_segs (int): Unused in TSMHead. By default, `num_segs`
82+
is equal to `clip_len * num_clips * num_crops`, which is
83+
automatically computed in the Recognizer forward pass and
84+
is not needed by TSM models. The `self.num_segments` we need is
85+
a hyperparameter used to build TSM models.
8386
Returns:
8487
torch.Tensor: The classification scores for input samples.
8588
"""

mmaction/models/recognizers/base.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,11 @@ def average_clip(self, cls_score, num_segs=1):
7777
7878
Using different averaging types ('score' or 'prob' or None,
7979
which is defined in test_cfg) to compute the final averaged
80-
class score.
80+
class score. Only called in test mode.
8181
8282
Args:
8383
cls_score (torch.Tensor): Class score to be averaged.
84+
num_segs (int): Number of clips for each input sample.
8485
8586
Returns:
8687
torch.Tensor: Averaged class score.

mmaction/models/recognizers/recognizer2d.py

+22-40
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,9 @@ def forward_train(self, imgs, labels, **kwargs):
3232

3333
return losses
3434

35-
def forward_test(self, imgs):
36-
"""Defines the computation performed at every call when evaluation and
37-
testing."""
38-
test_crops = self.test_cfg.get('test_crops', None)
39-
twice_sample = self.test_cfg.get('twice_sample', False)
40-
35+
def _do_test(self, imgs):
36+
"""Defines the computation performed at every call when evaluation,
37+
testing and gradcam."""
4138
batches = imgs.shape[0]
4239

4340
imgs = imgs.reshape((-1, ) + imgs.shape[2:])
@@ -57,13 +54,26 @@ def forward_test(self, imgs):
5754
losses.update(loss_aux)
5855
num_segs = 1
5956

57+
# When using `TSNHead` or `TPNHead`, `cls_score` shape is [batch_size, num_classes]
58+
# When using `TSMHead`, `cls_score` shape is [batch_size * num_crops, num_classes]
59+
# `num_crops` is calculated by:
60+
# 1) `twice_sample` in `SampleFrames`
61+
# 2) `num_sample_positions` in `DenseSampleFrames`
62+
# 3) `ThreeCrop/TenCrop/MultiGroupCrop` in `test_pipeline`
63+
# 4) `num_clips` in `SampleFrames` or its subclass if `clip_len != 1`
6064
cls_score = self.cls_head(x, num_segs)
61-
if test_crops is not None:
62-
if twice_sample:
63-
test_crops = test_crops * 2
64-
cls_score = self.average_clip(cls_score, test_crops)
6565

66-
return cls_score.cpu().numpy()
66+
assert cls_score.size()[0] % batches == 0
67+
# calculate num_crops automatically
68+
cls_score = self.average_clip(cls_score,
69+
cls_score.size()[0] // batches)
70+
71+
return cls_score
72+
73+
def forward_test(self, imgs):
74+
"""Defines the computation performed at every call when evaluation and
75+
testing."""
76+
return self._do_test(imgs).cpu().numpy()
6777

6878
def forward_dummy(self, imgs):
6979
"""Used for computing network FLOPs.
@@ -87,32 +97,4 @@ def forward_dummy(self, imgs):
8797
def forward_gradcam(self, imgs):
8898
"""Defines the computation performed at every call when using gradcam
8999
utils."""
90-
test_crops = self.test_cfg.get('test_crops', None)
91-
twice_sample = self.test_cfg.get('twice_sample', False)
92-
93-
batches = imgs.shape[0]
94-
95-
imgs = imgs.reshape((-1, ) + imgs.shape[2:])
96-
num_segs = imgs.shape[0] // batches
97-
98-
losses = dict()
99-
100-
x = self.extract_feat(imgs)
101-
if hasattr(self, 'neck'):
102-
x = [
103-
each.reshape((-1, num_segs) +
104-
each.shape[1:]).transpose(1, 2).contiguous()
105-
for each in x
106-
]
107-
x, loss_aux = self.neck(x)
108-
x = x.squeeze(2)
109-
losses.update(loss_aux)
110-
num_segs = 1
111-
112-
cls_score = self.cls_head(x, num_segs)
113-
if test_crops is not None:
114-
if twice_sample:
115-
test_crops = test_crops * 2
116-
cls_score = self.average_clip(cls_score, test_crops)
117-
118-
return cls_score
100+
return self._do_test(imgs)

mmaction/models/recognizers/recognizer3d.py

+10-15
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ def forward_train(self, imgs, labels, **kwargs):
2323

2424
return losses
2525

26-
def forward_test(self, imgs):
27-
"""Defines the computation performed at every call when evaluation and
28-
testing."""
26+
def _do_test(self, imgs):
27+
"""Defines the computation performed at every call when evaluation,
28+
testing and gradcam."""
2929
num_segs = imgs.shape[1]
3030
imgs = imgs.reshape((-1, ) + imgs.shape[2:])
3131

@@ -36,7 +36,12 @@ def forward_test(self, imgs):
3636
cls_score = self.cls_head(x)
3737
cls_score = self.average_clip(cls_score, num_segs)
3838

39-
return cls_score.cpu().numpy()
39+
return cls_score
40+
41+
def forward_test(self, imgs):
42+
"""Defines the computation performed at every call when evaluation and
43+
testing."""
44+
return self._do_test(imgs).cpu().numpy()
4045

4146
def forward_dummy(self, imgs):
4247
"""Used for computing network FLOPs.
@@ -58,14 +63,4 @@ def forward_dummy(self, imgs):
5863
def forward_gradcam(self, imgs):
5964
"""Defines the computation performed at every call when using gradcam
6065
utils."""
61-
num_segs = imgs.shape[1]
62-
imgs = imgs.reshape((-1, ) + imgs.shape[2:])
63-
64-
x = self.extract_feat(imgs)
65-
if hasattr(self, 'neck'):
66-
x, _ = self.neck(x)
67-
68-
cls_score = self.cls_head(x)
69-
cls_score = self.average_clip(cls_score, num_segs)
70-
71-
return cls_score
66+
return self._do_test(imgs)

tests/test_gradcam.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def test_tsm():
194194
_do_test_2D_models(recognizer, target_layer_name, input_shape)
195195

196196
# test twice sample + 3 crops, 2*3*8=48
197-
test_cfg = dict(average_clips='prob', test_crops=3, twice_sample=True)
197+
test_cfg = dict(average_clips='prob')
198198
recognizer = build_recognizer(config.model, test_cfg=test_cfg)
199199
recognizer.cfg = config
200200
input_shape = (1, 48, 3, 32, 32)

tests/test_models/test_recognizers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,7 @@ def test_tsm():
300300
demo_inputs = generate_demo_inputs(input_shape)
301301
imgs = demo_inputs['imgs']
302302

303-
test_cfg = dict(average_clips='prob', test_crops=3, twice_sample=True)
303+
test_cfg = dict(average_clips='prob')
304304
recognizer = build_recognizer(
305305
model, train_cfg=train_cfg, test_cfg=test_cfg)
306306

0 commit comments

Comments
 (0)