[Enhancement] Delete convert function and add instruction to ViT/Swin…

… README.md (#791) * delete convert function and add instruction to README.md * unified model convert and README * remove url * fix import error * fix unittest * rename pretrain * rename vit and deit pretrain * Update upernet_deit-b16_512x512_160k_ade20k.py * Update upernet_deit-b16_512x512_80k_ade20k.py * Update upernet_deit-b16_ln_mln_512x512_160k_ade20k.py * Update upernet_deit-b16_mln_512x512_160k_ade20k.py * Update upernet_deit-s16_512x512_160k_ade20k.py * Update upernet_deit-s16_512x512_80k_ade20k.py * Update upernet_deit-s16_ln_mln_512x512_160k_ade20k.py * Update upernet_deit-s16_mln_512x512_160k_ade20k.py Co-authored-by: Jiarui XU <xvjiarui0826@gmail.com> Co-authored-by: Junjun2016 <hejunjun@sjtu.edu.cn>
open-mmlab · Aug 25, 2021 · c11da07 · c11da07
1 parent e235c1a
commit c11da07
Show file tree

Hide file tree

Showing 35 changed files with 131 additions and 217 deletions.
diff --git a/configs/_base_/models/setr_mla.py b/configs/_base_/models/setr_mla.py
@@ -3,8 +3,7 @@
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
     type='EncoderDecoder',
-    pretrained=\
-    'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth',  # noqa
+    pretrained='pretrain/jx_vit_large_p16_384-b3be5167.pth',
     backbone=dict(
         type='VisionTransformer',
         img_size=(768, 768),

diff --git a/configs/_base_/models/setr_naive.py b/configs/_base_/models/setr_naive.py
@@ -3,8 +3,7 @@
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
     type='EncoderDecoder',
-    pretrained=\
-    'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth',  # noqa
+    pretrained='pretrain/jx_vit_large_p16_384-b3be5167.pth',
     backbone=dict(
         type='VisionTransformer',
         img_size=(768, 768),

diff --git a/configs/_base_/models/setr_pup.py b/configs/_base_/models/setr_pup.py
@@ -3,8 +3,7 @@
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
     type='EncoderDecoder',
-    pretrained=\
-    'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth',  # noqa
+    pretrained='pretrain/jx_vit_large_p16_384-b3be5167.pth',
     backbone=dict(
         type='VisionTransformer',
         img_size=(768, 768),

diff --git a/configs/_base_/models/upernet_vit-b16_ln_mln.py b/configs/_base_/models/upernet_vit-b16_ln_mln.py
@@ -2,7 +2,7 @@
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
     type='EncoderDecoder',
-    pretrained='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',  # noqa
+    pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth',
     backbone=dict(
         type='VisionTransformer',
         img_size=(512, 512),

diff --git a/configs/segformer/README.md b/configs/segformer/README.md
@@ -13,6 +13,18 @@
 }
 ```
 
+## Usage
+
+To use other repositories' pre-trained models, it is necessary to convert keys.
+
+We provide a script [`mit2mmseg.py`](../../tools/model_converters/mit2mmseg.py) in the tools directory to convert the keys of models from [the official repo](https://github.com/NVlabs/SegFormer) to MMSegmentation style.
+
+```shell
+python tools/model_converters/mit2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH}
+```
+
+This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
+
 ## Results and models
 
 ### ADE20k
@@ -61,13 +73,3 @@ test_pipeline = [
         ])
 ]
 ```
-
-## How to use segformer official pretrain weights
-
-We convert the backbone weights from the official repo (https://github.com/NVlabs/SegFormer) with `tools/model_converters/mit_convert.py`.
-
-You may follow below steps to start segformer training preparation:
-
-1. Download segformer pretrain weights (Suggest put in `pretrain/`);
-2. Run convert script to convert official pretrain weights: `python tools/model_converters/mit_convert.py pretrain/mit_b0.pth pretrain/mit_b0.pth`;
-3. Modify `pretrained` of segformer model config, for example, `pretrained` of `segformer_mit-b0_512x512_160k_ade20k.py` is set to `pretrain/mit_b0.pth`;
diff --git a/configs/setr/setr_mla_512x512_160k_b8_ade20k.py b/configs/setr/setr_mla_512x512_160k_b8_ade20k.py
@@ -4,6 +4,7 @@
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
+    pretrained='pretrain/vit_large_patch16_384.pth',
     backbone=dict(img_size=(512, 512), drop_rate=0.),
     decode_head=dict(num_classes=150),
     auxiliary_head=[

diff --git a/configs/setr/setr_naive_512x512_160k_b16_ade20k.py b/configs/setr/setr_naive_512x512_160k_b16_ade20k.py
@@ -4,6 +4,7 @@
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
+    pretrained='pretrain/vit_large_patch16_384.pth',
     backbone=dict(img_size=(512, 512), drop_rate=0.),
     decode_head=dict(num_classes=150),
     auxiliary_head=[

diff --git a/configs/setr/setr_pup_512x512_160k_b16_ade20k.py b/configs/setr/setr_pup_512x512_160k_b16_ade20k.py
@@ -4,6 +4,7 @@
 ]
 norm_cfg = dict(type='SyncBN', requires_grad=True)
 model = dict(
+    pretrained='pretrain/vit_large_patch16_384.pth',
     backbone=dict(img_size=(512, 512), drop_rate=0.),
     decode_head=dict(num_classes=150),
     auxiliary_head=[

diff --git a/configs/swin/README.md b/configs/swin/README.md
@@ -13,6 +13,24 @@
 }
 ```
 
+## Usage
+
+To use other repositories' pre-trained models, it is necessary to convert keys.
+
+We provide a script [`swin2mmseg.py`](../../tools/model_converters/swin2mmseg.py) in the tools directory to convert the keys of models from [the official repo](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation) to MMSegmentation style.
+
+```shell
+python tools/model_converters/swin2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH}
+```
+
+E.g.
+
+```shell
+python tools/model_converters/swin2mmseg.py https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth pretrain/swin_base_patch4_window7_224.pth
+```
+
+This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
+
 ## Results and models
 
 ### ADE20K

diff --git a/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K.py b/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K.py
@@ -3,8 +3,7 @@
     'pretrain_224x224_1K.py'
 ]
 model = dict(
-    pretrained=\
-    'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pth', # noqa
+    pretrained='pretrain/swin_base_patch4_window12_384.pth',
     backbone=dict(
         pretrain_img_size=384,
         embed_dims=128,

diff --git a/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K.py b/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K.py
@@ -2,7 +2,4 @@
     './upernet_swin_base_patch4_window12_512x512_160k_ade20k_'
     'pretrain_384x384_1K.py'
 ]
-model = dict(
-    pretrained=\
-    'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth', # noqa
-)
+model = dict(pretrained='pretrain/swin_base_patch4_window12_384_22k.pth')
diff --git a/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py b/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py
@@ -3,11 +3,8 @@
     'pretrain_224x224_1K.py'
 ]
 model = dict(
-    pretrained=\
-    'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pth', # noqa
+    pretrained='pretrain/swin_base_patch4_window7_224.pth',
     backbone=dict(
-        embed_dims=128,
-        depths=[2, 2, 18, 2],
-        num_heads=[4, 8, 16, 32]),
+        embed_dims=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32]),
     decode_head=dict(in_channels=[128, 256, 512, 1024], num_classes=150),
     auxiliary_head=dict(in_channels=512, num_classes=150))
diff --git a/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K.py b/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K.py
@@ -2,7 +2,4 @@
     './upernet_swin_base_patch4_window7_512x512_160k_ade20k_'
     'pretrain_224x224_1K.py'
 ]
-model = dict(
-    pretrained=\
-    'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22k.pth', # noqa
-)
+model = dict(pretrained='pretrain/swin_base_patch4_window7_224_22k.pth')
diff --git a/configs/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py b/configs/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py
@@ -3,15 +3,7 @@
     'pretrain_224x224_1K.py'
 ]
 model = dict(
-    pretrained=\
-    'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth', # noqa
-    backbone=dict(
-        depths=[2, 2, 18, 2]),
-    decode_head=dict(
-        in_channels=[96, 192, 384, 768],
-        num_classes=150
-    ),
-    auxiliary_head=dict(
-        in_channels=384,
-        num_classes=150
-    ))
+    pretrained='pretrain/swin_small_patch4_window7_224.pth',
+    backbone=dict(depths=[2, 2, 18, 2]),
+    decode_head=dict(in_channels=[96, 192, 384, 768], num_classes=150),
+    auxiliary_head=dict(in_channels=384, num_classes=150))
diff --git a/configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py b/configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py
@@ -3,8 +3,7 @@
     '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
 ]
 model = dict(
-    pretrained=\
-    'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth', # noqa
+    pretrained='pretrain/swin_tiny_patch4_window7_224.pth',
     backbone=dict(
         embed_dims=96,
         depths=[2, 2, 6, 2],

diff --git a/configs/vit/README.md b/configs/vit/README.md
@@ -13,6 +13,24 @@
 }
 ```
 
+## Usage
+
+To use other repositories' pre-trained models, it is necessary to convert keys.
+
+We provide a script [`vit2mmseg.py`](../../tools/model_converters/vit2mmseg.py) in the tools directory to convert the keys of models from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to MMSegmentation style.
+
+```shell
+python tools/model_converters/vit2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH}
+```
+
+E.g.
+
+```shell
+python tools/model_converters/vit2mmseg.py https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth pretrain/jx_vit_base_p16_224-80ecf9dd.pth
+```
+
+This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
+
 ## Results and models
 
 ### ADE20K

diff --git a/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py
@@ -1,6 +1,6 @@
 _base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',  # noqa
+    pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth',
     backbone=dict(drop_path_rate=0.1),
-    neck=None)  # yapf: disable
+    neck=None)
diff --git a/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py b/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py
@@ -1,6 +1,6 @@
 _base_ = './upernet_vit-b16_mln_512x512_80k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',  # noqa
+    pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth',
     backbone=dict(drop_path_rate=0.1),
-    neck=None)  # yapf: disable
+    neck=None)
diff --git a/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py
@@ -1,5 +1,5 @@
 _base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',  # noqa
-    backbone=dict(drop_path_rate=0.1, final_norm=True))  # yapf: disable
+    pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth',
+    backbone=dict(drop_path_rate=0.1, final_norm=True))
diff --git a/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py
@@ -1,5 +1,6 @@
 _base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth',  # noqa
-    backbone=dict(drop_path_rate=0.1),)  # yapf: disable
+    pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth',
+    backbone=dict(drop_path_rate=0.1),
+)
diff --git a/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py
@@ -1,8 +1,8 @@
 _base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth',  # noqa
+    pretrained='pretrain/deit_small_patch16_224-cd65a155.pth',
     backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1),
     decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]),
     neck=None,
-    auxiliary_head=dict(num_classes=150, in_channels=384))  # yapf: disable
+    auxiliary_head=dict(num_classes=150, in_channels=384))
diff --git a/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py b/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py
@@ -1,8 +1,8 @@
 _base_ = './upernet_vit-b16_mln_512x512_80k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth',  # noqa
+    pretrained='pretrain/deit_small_patch16_224-cd65a155.pth',
     backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1),
     decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]),
     neck=None,
-    auxiliary_head=dict(num_classes=150, in_channels=384))  # yapf: disable
+    auxiliary_head=dict(num_classes=150, in_channels=384))
diff --git a/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py
@@ -1,12 +1,9 @@
 _base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth',  # noqa
+    pretrained='pretrain/deit_small_patch16_224-cd65a155.pth',
     backbone=dict(
-        num_heads=6,
-        embed_dims=384,
-        drop_path_rate=0.1,
-        final_norm=True),
+        num_heads=6, embed_dims=384, drop_path_rate=0.1, final_norm=True),
     decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]),
     neck=dict(in_channels=[384, 384, 384, 384], out_channels=384),
-    auxiliary_head=dict(num_classes=150, in_channels=384))  # yapf: disable
+    auxiliary_head=dict(num_classes=150, in_channels=384))
diff --git a/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py
@@ -1,8 +1,8 @@
 _base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py'
 
 model = dict(
-    pretrained='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth',  # noqa
+    pretrained='pretrain/deit_small_patch16_224-cd65a155.pth',
     backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1),
     decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]),
     neck=dict(in_channels=[384, 384, 384, 384], out_channels=384),
-    auxiliary_head=dict(num_classes=150, in_channels=384))  # yapf: disable
+    auxiliary_head=dict(num_classes=150, in_channels=384))
diff --git a/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py
@@ -5,6 +5,7 @@
 ]
 
 model = dict(
+    pretrained='pretrain/vit_base_patch16_224.pth',
     backbone=dict(drop_path_rate=0.1, final_norm=True),
     decode_head=dict(num_classes=150),
     auxiliary_head=dict(num_classes=150))

diff --git a/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py
@@ -5,7 +5,9 @@
 ]
 
 model = dict(
-    decode_head=dict(num_classes=150), auxiliary_head=dict(num_classes=150))
+    pretrained='pretrain/vit_base_patch16_224.pth',
+    decode_head=dict(num_classes=150),
+    auxiliary_head=dict(num_classes=150))
 
 # AdamW optimizer, no weight decay for position embedding & layer norm
 # in backbone

diff --git a/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py b/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py
@@ -5,7 +5,9 @@
 ]
 
 model = dict(
-    decode_head=dict(num_classes=150), auxiliary_head=dict(num_classes=150))
+    pretrained='pretrain/vit_base_patch16_224.pth',
+    decode_head=dict(num_classes=150),
+    auxiliary_head=dict(num_classes=150))
 
 # AdamW optimizer, no weight decay for position embedding & layer norm
 # in backbone

diff --git a/mmseg/models/backbones/swin.py b/mmseg/models/backbones/swin.py
@@ -17,7 +17,7 @@
 from mmseg.ops import resize
 from ...utils import get_root_logger
 from ..builder import ATTENTION, BACKBONES
-from ..utils import PatchEmbed, swin_convert
+from ..utils import PatchEmbed
 
 
 class PatchMerging(BaseModule):
@@ -564,8 +564,6 @@ class SwinTransformer(BaseModule):
             Default: dict(type='LN').
         norm_cfg (dict): Config dict for normalization layer at
             output of backone. Defaults: dict(type='LN').
-        pretrain_style (str): Choose to use official or mmcls pretrain weights.
-            Default: official.
         pretrained (str, optional): model pretrained path. Default: None.
         init_cfg (dict, optional): The Config for initialization.
             Defaults to None.
@@ -591,7 +589,6 @@ def __init__(self,
                  use_abs_pos_embed=False,
                  act_cfg=dict(type='GELU'),
                  norm_cfg=dict(type='LN'),
-                 pretrain_style='official',
                  pretrained=None,
                  init_cfg=None):
         super(SwinTransformer, self).__init__()
@@ -605,9 +602,6 @@ def __init__(self,
                 f'The size of image should have length 1 or 2, ' \
                 f'but got {len(pretrain_img_size)}'
 
-        assert pretrain_style in ['official', 'mmcls'], 'We only support load '
-        'official ckpt and mmcls ckpt.'
-
         if isinstance(pretrained, str) or pretrained is None:
             warnings.warn('DeprecationWarning: pretrained is a deprecated, '
                           'please use "init_cfg" instead')
@@ -617,7 +611,6 @@ def __init__(self,
         num_layers = len(depths)
         self.out_indices = out_indices
         self.use_abs_pos_embed = use_abs_pos_embed
-        self.pretrain_style = pretrain_style
         self.pretrained = pretrained
         self.init_cfg = init_cfg
 
@@ -713,9 +706,6 @@ def init_weights(self):
             else:
                 state_dict = ckpt
 
-            if self.pretrain_style == 'official':
-                state_dict = swin_convert(state_dict)
-
             # strip prefix of state_dict
             if list(state_dict.keys())[0].startswith('module.'):
                 state_dict = {k[7:]: v for k, v in state_dict.items()}