[Improvement] Update docs about test crops. (open-mmlab#359)

irvingzhang0512 · web-flow · commit 47b1ca9df6d1 · 2020-11-27T14:16:58.000+08:00
* [Docs] Update docs about test crops. 1. Add more docs. 2. Update default configs in TSM model when using DenseSampleFrames. * [Docs] Update docs about test crops 1. Add more docs. 2. Update default configs in TSM model when using DenseSampleFrames. * calculate num_crops automatically * remove `twice_sample/test_crops` in test_cfg * update all tsm model `test_cfg['average_clips']` default value to 'prob' * add changelog * fix a bug when using tsn and `test_cfg['average_clips']='prob'` * fix docs and add docs for open-mmlab#363 * use `num_segments` instead of `num_segs` in average_clip * use `num_segs` in TSMHead and average_clip.
diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md
@@ -61,8 +61,7 @@ not including the IO time and pre-processing time. For each setting, we use 1 gp
 We use efficient setting as default provided in config files, and it can be changed to accurate setting by
 ```python
 ...
-# `test_cfg = dict(average_clips=None)` for efficient setting
-test_cfg = dict(average_clips='prob', test_crops=3, twice_sample=True)  # for accurate setting
+test_cfg = dict(average_clips='prob')
 ...
 test_pipeline = [
     dict(
diff --git a/configs/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_nl_dot_product_r50_1x1x8_50e_kinetics400_rgb.py
@@ -24,7 +24,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/kinetics400/rawframes_train'
diff --git a/configs/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_nl_embedded_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
@@ -24,7 +24,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/kinetics400/rawframes_train'
diff --git a/configs/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_nl_gaussian_r50_1x1x8_50e_kinetics400_rgb.py
@@ -24,7 +24,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/kinetics400/rawframes_train'
diff --git a/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py b/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py
@@ -18,7 +18,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/sthv1/rawframes'
diff --git a/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py b/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py
@@ -18,7 +18,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/sthv2/rawframes'
diff --git a/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py
@@ -20,7 +20,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/kinetics400/rawframes_train'
diff --git a/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py
@@ -20,7 +20,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/sthv1/rawframes'
diff --git a/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py
@@ -20,7 +20,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/sthv2/rawframes'
diff --git a/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py
@@ -18,7 +18,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/kinetics400/rawframes_train'
diff --git a/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py
@@ -18,7 +18,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/sthv1/rawframes'
diff --git a/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py b/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py
@@ -18,7 +18,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/sthv2/rawframes'
diff --git a/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py
@@ -18,7 +18,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/kinetics400/rawframes_train'
diff --git a/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py
@@ -18,7 +18,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'VideoDataset'
 data_root = 'data/kinetics400/videos_train'
diff --git a/configs/recognition/tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_temporal_pool_r50_1x1x8_50e_kinetics400_rgb.py
@@ -20,7 +20,7 @@
         is_shift=True))
 # model training and testing settings
 train_cfg = None
-test_cfg = dict(average_clips=None)
+test_cfg = dict(average_clips='prob')
 # dataset settings
 dataset_type = 'RawframeDataset'
 data_root = 'data/kinetics400/rawframes_train'
diff --git a/demo/README.md b/demo/README.md
@@ -9,7 +9,7 @@
 
 ### Video demo
 
-We provide a demo script to predict the recognition result using a single video.
+We provide a demo script to predict the recognition result using a single video. To get prediction results in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
 
 ```shell
 python demo/demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} {LABEL_FILE} [--use-frames] \
@@ -157,7 +157,7 @@ or use checkpoint url from `configs/` to directly load corresponding checkpoint,
 
 ### Webcam demo
 
-We provide a demo script to implement real-time action recognition from web camera.
+We provide a demo script to implement real-time action recognition from a web camera. To get prediction results in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
 
 ```shell
 python demo/webcam_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${LABEL_FILE} \
@@ -212,7 +212,7 @@ Users can change:
 
 ### Long video demo
 
-We provide a demo script to predict different labels using a single long video.
+We provide a demo script to predict different labels using a single long video. To get prediction results in the range `[0, 1]`, make sure to set `test_cfg = dict(average_clips='prob')` in the config file.
 
 ```shell
 python demo/long_video_demo.py ${CONFIG_FILE} ${CHECKPOINT_FILE} ${VIDEO_FILE} ${LABEL_FILE} \
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -16,6 +16,7 @@
 - Add random seed for building filelists ([#323](https://github.com/open-mmlab/mmaction2/pull/323))
 - Move docs about demo to `demo/README.md` ([#329](https://github.com/open-mmlab/mmaction2/pull/329))
 - Remove redundant code in `tools/test.py` ([#310](https://github.com/open-mmlab/mmaction2/pull/310))
+- Automatically calculate number of test clips for Recognizer2D ([#359](https://github.com/open-mmlab/mmaction2/pull/359))
 
 **Bug Fixes**
 - Fix a bug in BaseDataset when `data_prefix` is None ([#314](https://github.com/open-mmlab/mmaction2/pull/314))
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
@@ -323,7 +323,8 @@ class DenseSampleFrames(SampleFrames):
         sample_range (int): Total sample range for dense sample.
             Default: 64.
         num_sample_positions (int): Number of sample start positions, Which is
-            only used in test mode. Default: 10.
+            only used in test mode. Default: 10. That is to say, by default,
+            there are at least 10 clips for one input sample in test mode.
         temporal_jitter (bool): Whether to apply temporal jittering.
             Default: False.
         test_mode (bool): Store True when building test or validation dataset.
diff --git a/mmaction/models/heads/tsm_head.py b/mmaction/models/heads/tsm_head.py
@@ -73,13 +73,16 @@ def init_weights(self):
         """Initiate the parameters from scratch."""
         normal_init(self.fc_cls, std=self.init_std)
 
-    def forward(self, x, num_segments):
+    def forward(self, x, num_segs):
         """Defines the computation performed at every call.
 
         Args:
             x (torch.Tensor): The input data.
-            num_segments (int): Number of frame segments. Default: 8.
-
+            num_segs (int): Unused in TSMHead. By default, `num_segs`
+                equals `clip_len * num_clips * num_crops`; it is computed
+                automatically in the Recognizer forward phase and is not
+                needed by TSM models. TSMHead relies on `self.num_segments`,
+                a hyperparameter fixed when the TSM model is built.
         Returns:
             torch.Tensor: The classification scores for input samples.
         """
diff --git a/mmaction/models/recognizers/base.py b/mmaction/models/recognizers/base.py
@@ -77,10 +77,11 @@ def average_clip(self, cls_score, num_segs=1):
 
         Using different averaging types ('score' or 'prob' or None,
         which defined in test_cfg) to computed the final averaged
-        class score.
+        class score. Only called in test mode.
 
         Args:
             cls_score (torch.Tensor): Class score to be averaged.
+            num_segs (int): Number of clips for each input sample.
 
         Returns:
             torch.Tensor: Averaged class score.
diff --git a/mmaction/models/recognizers/recognizer2d.py b/mmaction/models/recognizers/recognizer2d.py
@@ -32,12 +32,9 @@ def forward_train(self, imgs, labels, **kwargs):
 
         return losses
 
-    def forward_test(self, imgs):
-        """Defines the computation performed at every call when evaluation and
-        testing."""
-        test_crops = self.test_cfg.get('test_crops', None)
-        twice_sample = self.test_cfg.get('twice_sample', False)
-
+    def _do_test(self, imgs):
+        """Defines the computation performed at every call when evaluation,
+        testing and gradcam."""
         batches = imgs.shape[0]
 
         imgs = imgs.reshape((-1, ) + imgs.shape[2:])
@@ -57,13 +54,26 @@ def forward_test(self, imgs):
             losses.update(loss_aux)
             num_segs = 1
 
+        # When using `TSNHead` or `TPNHead`, shape is [batch_size, num_classes]
+        # When using `TSMHead`, shape is [batch_size * num_crops, num_classes]
+        # `num_crops` is calculated by:
+        #   1) `twice_sample` in `SampleFrames`
+        #   2) `num_sample_positions` in `DenseSampleFrames`
+        #   3) `ThreeCrop/TenCrop/MultiGroupCrop` in `test_pipeline`
+        #   4) `num_clips` in `SampleFrames` or its subclass if `clip_len != 1`
         cls_score = self.cls_head(x, num_segs)
-        if test_crops is not None:
-            if twice_sample:
-                test_crops = test_crops * 2
-            cls_score = self.average_clip(cls_score, test_crops)
 
-        return cls_score.cpu().numpy()
+        assert cls_score.size()[0] % batches == 0
+        # calculate num_crops automatically
+        cls_score = self.average_clip(cls_score,
+                                      cls_score.size()[0] // batches)
+
+        return cls_score
+
+    def forward_test(self, imgs):
+        """Defines the computation performed at every call when evaluation and
+        testing."""
+        return self._do_test(imgs).cpu().numpy()
 
     def forward_dummy(self, imgs):
         """Used for computing network FLOPs.
@@ -87,32 +97,4 @@ def forward_dummy(self, imgs):
     def forward_gradcam(self, imgs):
         """Defines the computation performed at every call when using gradcam
         utils."""
-        test_crops = self.test_cfg.get('test_crops', None)
-        twice_sample = self.test_cfg.get('twice_sample', False)
-
-        batches = imgs.shape[0]
-
-        imgs = imgs.reshape((-1, ) + imgs.shape[2:])
-        num_segs = imgs.shape[0] // batches
-
-        losses = dict()
-
-        x = self.extract_feat(imgs)
-        if hasattr(self, 'neck'):
-            x = [
-                each.reshape((-1, num_segs) +
-                             each.shape[1:]).transpose(1, 2).contiguous()
-                for each in x
-            ]
-            x, loss_aux = self.neck(x)
-            x = x.squeeze(2)
-            losses.update(loss_aux)
-            num_segs = 1
-
-        cls_score = self.cls_head(x, num_segs)
-        if test_crops is not None:
-            if twice_sample:
-                test_crops = test_crops * 2
-            cls_score = self.average_clip(cls_score, test_crops)
-
-        return cls_score
+        return self._do_test(imgs)
diff --git a/mmaction/models/recognizers/recognizer3d.py b/mmaction/models/recognizers/recognizer3d.py
@@ -23,9 +23,9 @@ def forward_train(self, imgs, labels, **kwargs):
 
         return losses
 
-    def forward_test(self, imgs):
-        """Defines the computation performed at every call when evaluation and
-        testing."""
+    def _do_test(self, imgs):
+        """Defines the computation performed at every call when evaluation,
+        testing and gradcam."""
         num_segs = imgs.shape[1]
         imgs = imgs.reshape((-1, ) + imgs.shape[2:])
 
@@ -36,7 +36,12 @@ def forward_test(self, imgs):
         cls_score = self.cls_head(x)
         cls_score = self.average_clip(cls_score, num_segs)
 
-        return cls_score.cpu().numpy()
+        return cls_score
+
+    def forward_test(self, imgs):
+        """Defines the computation performed at every call when evaluation and
+        testing."""
+        return self._do_test(imgs).cpu().numpy()
 
     def forward_dummy(self, imgs):
         """Used for computing network FLOPs.
@@ -58,14 +63,4 @@ def forward_dummy(self, imgs):
     def forward_gradcam(self, imgs):
         """Defines the computation performed at every call when using gradcam
         utils."""
-        num_segs = imgs.shape[1]
-        imgs = imgs.reshape((-1, ) + imgs.shape[2:])
-
-        x = self.extract_feat(imgs)
-        if hasattr(self, 'neck'):
-            x, _ = self.neck(x)
-
-        cls_score = self.cls_head(x)
-        cls_score = self.average_clip(cls_score, num_segs)
-
-        return cls_score
+        return self._do_test(imgs)
diff --git a/tests/test_gradcam.py b/tests/test_gradcam.py
@@ -194,7 +194,7 @@ def test_tsm():
     _do_test_2D_models(recognizer, target_layer_name, input_shape)
 
     # test twice sample + 3 crops, 2*3*8=48
-    test_cfg = dict(average_clips='prob', test_crops=3, twice_sample=True)
+    test_cfg = dict(average_clips='prob')
     recognizer = build_recognizer(config.model, test_cfg=test_cfg)
     recognizer.cfg = config
     input_shape = (1, 48, 3, 32, 32)
diff --git a/tests/test_models/test_recognizers.py b/tests/test_models/test_recognizers.py
@@ -300,7 +300,7 @@ def test_tsm():
     demo_inputs = generate_demo_inputs(input_shape)
     imgs = demo_inputs['imgs']
 
-    test_cfg = dict(average_clips='prob', test_crops=3, twice_sample=True)
+    test_cfg = dict(average_clips='prob')
     recognizer = build_recognizer(
         model, train_cfg=train_cfg, test_cfg=test_cfg)