Commit 383c531

[Feature] Support TAM (open-mmlab#595)
* draft of tam block
* draft of TANet
* fix linting
* add unittest for tam
* add TAM to models/__init__.py
* add config file and unittest for tanet
* fix tam bug
* update tam
* fix __init__
* rename config
* fix tsm_optimizer bug
* add sth
* fix tanet config
* modify workers
* update tanet config
* fix typo
* add recognizer2d unittest
* update according to comments
* add readme of tanet
* add url
1 parent: c35390c

17 files changed: +548 -10

configs/_base_/models/tanet_r50.py (+19)

```python
# model settings
model = dict(
    type='Recognizer2D',
    backbone=dict(
        type='TANet',
        pretrained='torchvision://resnet50',
        depth=50,
        num_segments=8,
        tam_cfg=dict()),
    cls_head=dict(
        type='TSMHead',
        num_classes=400,
        in_channels=2048,
        spatial_type='avg',
        consensus=dict(type='AvgConsensus', dim=1),
        dropout_ratio=0.5,
        init_std=0.001))
train_cfg = None
test_cfg = dict(average_clips='prob')
```
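
As a sanity check, this config can be instantiated through mmaction2's builder. A minimal sketch, assuming mmcv and this commit of mmaction2 are installed (`build_model` is the builder exported by `mmaction/models/__init__.py` below):

```python
from mmcv import Config

from mmaction.models import build_model

# Load the base model config and build the Recognizer2D with a TANet backbone.
cfg = Config.fromfile('configs/_base_/models/tanet_r50.py')
model = build_model(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
print(type(model).__name__)  # Recognizer2D
```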

configs/recognition/tanet/README.md (+67)
# TANet

## Introduction

[ALGORITHM]

```latex
@article{liu2020tam,
  title={TAM: Temporal Adaptive Module for Video Recognition},
  author={Liu, Zhaoyang and Wang, Limin and Wu, Wayne and Qian, Chen and Lu, Tong},
  journal={arXiv preprint arXiv:2005.06803},
  year={2020}
}
```

## Model Zoo

### Kinetics-400

|config | resolution | gpus | backbone | pretrain | top1 acc| top5 acc | reference top1 acc | reference top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tanet_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tanet/tanet_r50_dense_1x1x8_100e_kinetics400_rgb.py) |short-side 320|8| TANet | ImageNet |76.28 | 92.60 |[76.22](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh)|[92.53](https://github.com/liu-zhy/temporal-adaptive-module/blob/master/scripts/test_tam_kinetics_rgb_8f.sh) | x | 7124 | [ckpt](https://download.openmmlab.com/mmaction/recognition/tanet/tanet_r50_dense_1x1x8_100e_kinetics400_rgb/tanet_r50_dense_1x1x8_100e_kinetics400_rgb_20210219-032c8e94.pth) | [log](https://download.openmmlab.com/mmaction/recognition/tanet/tanet_r50_dense_1x1x8_100e_kinetics400_rgb/tanet_r50_dense_1x1x8_100e_kinetics400_rgb_20210219.log)| [json](https://download.openmmlab.com/mmaction/recognition/tanet/tanet_r50_dense_1x1x8_100e_kinetics400_rgb/tanet_r50_dense_1x1x8_100e_kinetics400_rgb_20210219.json)|

Notes:

1. The **gpus** column indicates the number of GPUs used to produce the checkpoint. The provided configs assume 8 GPUs by default. According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU, e.g., lr=0.01 for 8 GPUs x 8 videos/gpu and lr=0.04 for 16 GPUs x 16 videos/gpu (see the sketch after these notes).
2. The values in the "reference" columns are results obtained by testing the checkpoints provided by the authors on our dataset, with the same model settings. The reference checkpoints can be downloaded [here](https://drive.google.com/drive/folders/1sFfmP3yrfc7IzRshEELOby7-aEoymIFL?usp=sharing).
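
As a quick illustration of the Linear Scaling Rule in note 1, here is a hypothetical helper (not part of the repo) that scales the base learning rate by the ratio of total batch sizes:

```python
def scaled_lr(gpus, videos_per_gpu, base_lr=0.01, base_batch=8 * 8):
    """Hypothetical helper: scale the base lr by the total-batch-size ratio."""
    return base_lr * (gpus * videos_per_gpu) / base_batch


print(scaled_lr(8, 8))    # 0.01, the default 8 GPUs x 8 videos/gpu
print(scaled_lr(16, 16))  # 0.04
```
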
For more details on data preparation, you can refer to Kinetics400 in [Data Preparation](/docs/data_preparation.md).

## Train

You can use the following command to train a model.

```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

Example: train the TANet model on the Kinetics-400 dataset deterministically, with periodic validation.

```shell
python tools/train.py configs/recognition/tanet/tanet_r50_dense_1x1x8_100e_kinetics400_rgb.py \
    --work-dir work_dirs/tanet_r50_dense_1x1x8_100e_kinetics400_rgb \
    --validate --seed 0 --deterministic
```

For more details, you can refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).

## Test

You can use the following command to test a model.

```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```

Example: test the TANet model on the Kinetics-400 dataset and dump the result to a json file.

```shell
python tools/test.py configs/recognition/tanet/tanet_r50_dense_1x1x8_100e_kinetics400_rgb.py \
    checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
    --out result.json
```

For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
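
For orientation, the `top_k_accuracy` metric requested above can be sketched in a few lines. This is a simplified stand-in, not mmaction2's implementation:

```python
import numpy as np


def top_k_accuracy(scores, labels, k=5):
    """Fraction of samples whose true label is among the k highest scores."""
    topk = np.argsort(scores, axis=1)[:, -k:]
    return float(np.mean([label in row for label, row in zip(labels, topk)]))


scores = np.array([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
print(top_k_accuracy(scores, np.array([1, 2]), k=1))  # 0.5
```
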
configs/recognition/tanet/tanet_r50_dense_1x1x8_100e_kinetics400_rgb.py (+100)

```python
_base_ = [
    '../../_base_/models/tanet_r50.py', '../../_base_/default_runtime.py'
]

# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train'
data_root_val = 'data/kinetics400/rawframes_val'
ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes.txt'
ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes.txt'
ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes.txt'

img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)

train_pipeline = [
    dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='MultiScaleCrop',
        input_size=224,
        scales=(1, 0.875, 0.75, 0.66),
        random_crop=False,
        max_wh_scale_gap=1,
        num_fixed_crops=13),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
    dict(
        type='DenseSampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
    dict(
        type='DenseSampleFrames',
        clip_len=1,
        frame_interval=1,
        num_clips=8,
        test_mode=True),
    dict(type='RawFrameDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='ThreeCrop', crop_size=256),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
data = dict(
    videos_per_gpu=8,
    workers_per_gpu=4,
    test_dataloader=dict(videos_per_gpu=2),
    train=dict(
        type=dataset_type,
        ann_file=ann_file_train,
        data_prefix=data_root,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=data_root_val,
        pipeline=val_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=data_root_val,
        pipeline=test_pipeline))
evaluation = dict(
    interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'])

# optimizer
optimizer = dict(
    type='SGD',
    constructor='TSMOptimizerConstructor',
    paramwise_cfg=dict(fc_lr5=True),
    lr=0.01,  # this lr is used for 8 gpus
    momentum=0.9,
    weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[50, 75, 90])
total_epochs = 100

# runtime settings
work_dir = './work_dirs/tanet_r50_dense_1x1x8_100e_kinetics400_rgb/'
```
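
To see what this pipeline feeds the model, here is a rough shape walk-through. The numbers come from the config above; folding segments into the batch dimension is the usual handling of 2D backbones by `Recognizer2D`:

```python
# DenseSampleFrames(clip_len=1, num_clips=8) samples 8 frames per video, and
# FormatShape(input_format='NCHW') folds the segments into the batch
# dimension, so each GPU sees videos_per_gpu * num_clips frames at a time.
videos_per_gpu = 8
num_clips = 8
crop_size = 224  # 256 with ThreeCrop at test time

batch_frames = videos_per_gpu * num_clips
input_shape = (batch_frames, 3, crop_size, crop_size)
print(input_shape)  # (64, 3, 224, 224) per GPU during training
```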

mmaction/core/optimizer/tsm_optimizer_constructor.py (+2 -1)

```diff
@@ -54,7 +54,8 @@ def add_params(self, params, model):
         elif isinstance(m, torch.nn.Linear):
             m_params = list(m.parameters())
             normal_weight.append(m_params[0])
-            normal_bias.append(m_params[1])
+            if len(m_params) == 2:
+                normal_bias.append(m_params[1])
         elif isinstance(m,
                         (_BatchNorm, SyncBatchNorm, torch.nn.GroupNorm)):
             for param in list(m.parameters()):
```
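
A minimal sketch (outside the repo) of why this guard is needed: an `nn.Linear` created with `bias=False` exposes only its weight parameter, so indexing `m_params[1]` unconditionally would raise an `IndexError`, presumably what the "fix tsm_optimizer bug" commit hit once TAM's bias-free linear layers entered the model:

```python
import torch.nn as nn

# nn.Linear with bias=False has a single parameter (the weight matrix).
with_bias = list(nn.Linear(4, 2).parameters())
without_bias = list(nn.Linear(4, 2, bias=False).parameters())
print(len(with_bias), len(without_bias))  # 2 1
```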

mmaction/models/__init__.py (+4 -4)

```diff
@@ -1,11 +1,11 @@
 from .backbones import (C3D, X3D, MobileNetV2, MobileNetV2TSM, ResNet,
                         ResNet2Plus1d, ResNet3d, ResNet3dCSN, ResNet3dLayer,
                         ResNet3dSlowFast, ResNet3dSlowOnly, ResNetAudio,
-                        ResNetTIN, ResNetTSM)
+                        ResNetTIN, ResNetTSM, TANet)
 from .builder import (DETECTORS, build_backbone, build_detector, build_head,
                       build_localizer, build_loss, build_model, build_neck,
                       build_recognizer)
-from .common import Conv2plus1d, ConvAudio
+from .common import TAM, Conv2plus1d, ConvAudio
 from .heads import (AudioTSNHead, AVARoIHead, BaseHead, BBoxHeadAVA, I3DHead,
                     SlowFastHead, TPNHead, TSMHead, TSNHead, X3DHead)
 from .localizers import BMN, PEM, TEM
@@ -25,10 +25,10 @@
     'BaseRecognizer', 'LOSSES', 'CrossEntropyLoss', 'NLLLoss', 'HVULoss',
     'ResNetTSM', 'ResNet3dSlowFast', 'SlowFastHead', 'Conv2plus1d',
     'ResNet3dSlowOnly', 'BCELossWithLogits', 'LOCALIZERS', 'build_localizer',
-    'PEM', 'TEM', 'BinaryLogisticRegressionLoss', 'BMN', 'BMNLoss',
+    'PEM', 'TAM', 'TEM', 'BinaryLogisticRegressionLoss', 'BMN', 'BMNLoss',
     'build_model', 'OHEMHingeLoss', 'SSNLoss', 'ResNet3dCSN', 'ResNetTIN',
     'TPN', 'TPNHead', 'build_loss', 'build_neck', 'AudioRecognizer',
     'AudioTSNHead', 'X3D', 'X3DHead', 'ResNet3dLayer', 'DETECTORS',
     'SingleRoIExtractor3D', 'BBoxHeadAVA', 'ResNetAudio', 'build_detector',
-    'ConvAudio', 'AVARoIHead', 'MobileNetV2', 'MobileNetV2TSM'
+    'ConvAudio', 'AVARoIHead', 'MobileNetV2', 'MobileNetV2TSM', 'TANet'
 ]
```
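
With `TANet` and `TAM` now exported, the backbone can also be built through the registry, mirroring the config file above. A short sketch, assuming this commit is installed:

```python
from mmaction.models import build_backbone

# Equivalent to the backbone dict in configs/_base_/models/tanet_r50.py,
# minus the torchvision pretraining.
backbone = build_backbone(dict(type='TANet', depth=50, num_segments=8))
backbone.init_weights()
```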

mmaction/models/backbones/__init__.py (+2 -1)

```diff
@@ -10,10 +10,11 @@
 from .resnet_audio import ResNetAudio
 from .resnet_tin import ResNetTIN
 from .resnet_tsm import ResNetTSM
+from .tanet import TANet
 from .x3d import X3D

 __all__ = [
     'C3D', 'ResNet', 'ResNet3d', 'ResNetTSM', 'ResNet2Plus1d',
     'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNet3dCSN', 'ResNetTIN', 'X3D',
-    'ResNetAudio', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2'
+    'ResNetAudio', 'ResNet3dLayer', 'MobileNetV2TSM', 'MobileNetV2', 'TANet'
 ]
```

mmaction/models/backbones/tanet.py (+114)

```python
from copy import deepcopy

import torch.nn as nn
from torch.utils import checkpoint as cp

from ..common import TAM
from ..registry import BACKBONES
from .resnet import Bottleneck, ResNet


class TABlock(nn.Module):
    """Temporal Adaptive Block (TA-Block) for TANet.

    This block is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO
    RECOGNITION <https://arxiv.org/pdf/2005.06803>`_

    The temporal adaptive module (TAM) is embedded into the ResNet block
    after the first Conv2D, which turns a vanilla ResNet block into a
    TA-Block.

    Args:
        block (nn.Module): Residual block to be substituted.
        num_segments (int): Number of frame segments.
        tam_cfg (dict): Config for temporal adaptive module (TAM).
            Default: dict().
    """

    def __init__(self, block, num_segments, tam_cfg=dict()):
        super().__init__()
        self.tam_cfg = deepcopy(tam_cfg)
        self.block = block
        self.num_segments = num_segments
        self.tam = TAM(
            in_channels=block.conv1.out_channels,
            num_segments=num_segments,
            **self.tam_cfg)

        if not isinstance(self.block, Bottleneck):
            raise NotImplementedError('TA-Blocks have not been fully '
                                      'implemented except the pattern based '
                                      'on Bottleneck block.')

    def forward(self, x):
        if isinstance(self.block, Bottleneck):

            def _inner_forward(x):
                """Forward wrapper for utilizing checkpoint."""
                identity = x

                out = self.block.conv1(x)
                out = self.tam(out)
                out = self.block.conv2(out)
                out = self.block.conv3(out)

                if self.block.downsample is not None:
                    identity = self.block.downsample(x)

                out = out + identity

                return out

            if self.block.with_cp and x.requires_grad:
                out = cp.checkpoint(_inner_forward, x)
            else:
                out = _inner_forward(x)

            out = self.block.relu(out)

        return out


@BACKBONES.register_module()
class TANet(ResNet):
    """Temporal Adaptive Network (TANet) backbone.

    This backbone is proposed in `TAM: TEMPORAL ADAPTIVE MODULE FOR VIDEO
    RECOGNITION <https://arxiv.org/pdf/2005.06803>`_

    Embedding the temporal adaptive module (TAM) into ResNet
    instantiates TANet.

    Args:
        depth (int): Depth of ResNet, from {18, 34, 50, 101, 152}.
        num_segments (int): Number of frame segments.
        tam_cfg (dict | None): Config for temporal adaptive module (TAM).
            Default: dict().
        **kwargs (keyword arguments, optional): Arguments for ResNet except
            ``depth``.
    """

    def __init__(self, depth, num_segments, tam_cfg=dict(), **kwargs):
        super().__init__(depth, **kwargs)
        assert num_segments >= 3
        self.num_segments = num_segments
        self.tam_cfg = deepcopy(tam_cfg)

    def init_weights(self):
        super().init_weights()
        self.make_tam_modeling()

    def make_tam_modeling(self):
        """Replace each ResNet block with a TA-Block."""

        def make_tam_block(stage, num_segments, tam_cfg=dict()):
            blocks = list(stage.children())
            for i, block in enumerate(blocks):
                blocks[i] = TABlock(block, num_segments, deepcopy(tam_cfg))
            return nn.Sequential(*blocks)

        for i in range(self.num_stages):
            layer_name = f'layer{i + 1}'
            res_layer = getattr(self, layer_name)
            setattr(self, layer_name,
                    make_tam_block(res_layer, self.num_segments, self.tam_cfg))
```
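
A minimal usage sketch for the new backbone (shapes assumed, not taken from the repo's tests). Note that `init_weights()` is what swaps the ResNet blocks for TA-Blocks via `make_tam_modeling()`:

```python
import torch

from mmaction.models import TANet

model = TANet(depth=50, num_segments=8)
model.init_weights()  # initializes weights, then replaces blocks with TA-Blocks

# Recognizer2D presents a video batch to the backbone as
# N * num_segments stacked frames.
imgs = torch.randn(2 * 8, 3, 224, 224)  # 2 videos x 8 segments
feat = model(imgs)
print(feat.shape)  # expected: torch.Size([16, 2048, 7, 7])
```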

mmaction/models/common/__init__.py (+2 -1)

```diff
@@ -1,4 +1,5 @@
 from .conv2plus1d import Conv2plus1d
 from .conv_audio import ConvAudio
+from .tam import TAM

-__all__ = ['Conv2plus1d', 'ConvAudio']
+__all__ = ['Conv2plus1d', 'ConvAudio', 'TAM']
```
