From 0d05d6329f766c42aa9bfb188f6e1b57cca2b2d7 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 21 Mar 2022 14:39:09 +0800 Subject: [PATCH 01/45] [Feature] Add BEiT backbone --- configs/_base_/models/upernet_beit.py | 52 ++ ...t_beit_base_12_640_slide_160k_ade20k_ms.py | 70 +++ ...eit_base_12_640_slide_160k_ade20k_pt2ft.py | 47 ++ ..._beit_large_24_640_slide_160k_ade20k_ms.py | 72 +++ ...it_large_24_640_slide_160k_ade20k_pt2ft.py | 52 ++ mmseg/core/__init__.py | 2 + .../core/layer_decay_optimizer_constructor.py | 100 ++++ mmseg/models/backbones/__init__.py | 3 +- mmseg/models/backbones/beit.py | 556 ++++++++++++++++++ mmseg/models/necks/__init__.py | 5 +- mmseg/models/necks/featurepyramid.py | 71 +++ tests/test_models/test_backbones/test_beit.py | 176 ++++++ .../test_necks/test_featurepyramid.py | 17 + 13 files changed, 1221 insertions(+), 2 deletions(-) create mode 100644 configs/_base_/models/upernet_beit.py create mode 100644 configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_ms.py create mode 100644 configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py create mode 100644 configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py create mode 100644 configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py create mode 100644 mmseg/core/layer_decay_optimizer_constructor.py create mode 100644 mmseg/models/backbones/beit.py create mode 100644 mmseg/models/necks/featurepyramid.py create mode 100644 tests/test_models/test_backbones/test_beit.py create mode 100644 tests/test_models/test_necks/test_featurepyramid.py diff --git a/configs/_base_/models/upernet_beit.py b/configs/_base_/models/upernet_beit.py new file mode 100644 index 0000000000..d83f3792b0 --- /dev/null +++ b/configs/_base_/models/upernet_beit.py @@ -0,0 +1,52 @@ +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained=None, + backbone=dict( + type='BEiT', + img_size=(640, 640), + patch_size=16, + in_channels=3, + embed_dims=768, + num_layers=12, + num_heads=12, + mlp_ratio=4, + out_indices=(3, 5, 7, 11), + qv_bias=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.1, + norm_cfg=dict(type='LN', eps=1e-6), + act_cfg=dict(type='GELU'), + norm_eval=False, + interpolate_mode='bicubic', + init_values=0.1), + neck=dict(type='Feature2Pyramid', embed_dim=768, rescales=[4, 2, 1, 0.5]), + decode_head=dict( + type='UPerHead', + in_channels=[384, 384, 384, 384], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=512, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=384, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_ms.py b/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_ms.py new file mode 100644 index 0000000000..acffb362d5 --- /dev/null +++ b/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_ms.py @@ -0,0 +1,70 @@ +_base_ = [ + '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +crop_size = (640, 640) + 
+model = dict( + pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k_new1.pth', + backbone=dict( + type='BEiT', + img_size=(640, 640), + patch_size=16, + embed_dims=768, + num_layers=12, + num_heads=12, + mlp_ratio=4, + qv_bias=True, + init_values=0.1, + drop_path_rate=0.1, + out_indices=[3, 5, 7, 11]), + neck=dict(embed_dim=768, rescales=[4, 2, 1, 0.5]), + decode_head=dict( + in_channels=[768, 768, 768, 768], num_classes=150, channels=768), + auxiliary_head=dict(in_channels=768, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=3e-5, + betas=(0.9, 0.999), + weight_decay=0.05, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict(num_layers=12, layer_decay_rate=0.9)) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +min_size = 640 +find_unused_parameters = True + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2560, 640), + img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=True, + transforms=[ + dict(type='Resize', keep_ratio=True, min_size=min_size), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline), + samples_per_gpu=2, +) diff --git a/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py b/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py new file mode 100644 index 0000000000..4d2842bf77 --- /dev/null +++ b/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py @@ -0,0 +1,47 @@ +_base_ = [ + '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +crop_size = (640, 640) + +model = dict( + pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k.pth', + backbone=dict( + type='BEiT', + img_size=(640, 640), + patch_size=16, + embed_dims=768, + num_layers=12, + num_heads=12, + mlp_ratio=4, + qv_bias=True, + init_values=0.1, + drop_path_rate=0.1, + out_indices=[3, 5, 7, 11]), + neck=dict(embed_dim=768, rescales=[4, 2, 1, 0.5]), + decode_head=dict( + in_channels=[768, 768, 768, 768], num_classes=150, channels=768), + auxiliary_head=dict(in_channels=768, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=3e-5, + betas=(0.9, 0.999), + weight_decay=0.05, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict(num_layers=12, layer_decay_rate=0.9)) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) diff --git a/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py b/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py new file mode 100644 index 0000000000..be492b56ea --- /dev/null +++ b/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py @@ -0,0 +1,72 @@ +_base_ = [ + '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', + 
'../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' +] +crop_size = (640, 640) + +model = dict( + pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', + backbone=dict( + type='BEiT', + img_size=(640, 640), + patch_size=16, + embed_dims=1024, + num_layers=24, + num_heads=16, + mlp_ratio=4, + qv_bias=True, + init_values=1e-6, + drop_path_rate=0.2, + out_indices=[7, 11, 15, 23], + ), + neck=dict(embed_dim=1024, rescales=[4, 2, 1, 0.5]), + decode_head=dict( + in_channels=[1024, 1024, 1024, 1024], + num_classes=150, + channels=1024, + ), + auxiliary_head=dict(in_channels=1024, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=2e-5, + betas=(0.9, 0.999), + weight_decay=0.05, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.95)) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=3000, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +min_size = 640 + +find_unused_parameters = True + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2560, 640), + img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=True, + transforms=[ + dict(type='Resize', keep_ratio=True, min_size=min_size), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline)) diff --git a/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py b/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py new file mode 100644 index 0000000000..909b33534c --- /dev/null +++ b/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py @@ -0,0 +1,52 @@ +_base_ = [ + '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' +] +crop_size = (640, 640) + +model = dict( + pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', + backbone=dict( + type='BEiT', + img_size=(640, 640), + patch_size=16, + embed_dims=1024, + num_layers=24, + num_heads=16, + mlp_ratio=4, + qv_bias=True, + init_values=1e-6, + drop_path_rate=0.2, + out_indices=[7, 11, 15, 23], + ), + neck=dict(embed_dim=1024, rescales=[4, 2, 1, 0.5]), + decode_head=dict( + in_channels=[1024, 1024, 1024, 1024], + num_classes=150, + channels=1024, + ), + auxiliary_head=dict(in_channels=1024, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) + +optimizer = dict( + _delete_=True, + type='AdamW', + lr=2e-5, + betas=(0.9, 0.999), + weight_decay=0.05, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.95)) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=3000, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +data = dict(samples_per_gpu=1) +optimizer_config = dict( + type='GradientCumulativeOptimizerHook', cumulative_iters=2) diff --git a/mmseg/core/__init__.py b/mmseg/core/__init__.py index 402278618e..c60b48c0c6 100644 --- a/mmseg/core/__init__.py +++ b/mmseg/core/__init__.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .evaluation import * # noqa: F401, F403 +from .layer_decay_optimizer_constructor import \ + LayerDecayOptimizerConstructor # noqa: F401 from .seg import * # noqa: F401, F403 from .utils import * # noqa: F401, F403 diff --git a/mmseg/core/layer_decay_optimizer_constructor.py b/mmseg/core/layer_decay_optimizer_constructor.py new file mode 100644 index 0000000000..b0c50f6f37 --- /dev/null +++ b/mmseg/core/layer_decay_optimizer_constructor.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import (OPTIMIZER_BUILDERS, DefaultOptimizerConstructor, + get_dist_info) + +from mmseg.utils import get_root_logger + + +def get_num_layer_for_vit(var_name, num_max_layer): + """Get the layer id to set the different learning rates. + + Args: + var_name (str): The key of the model. + num_max_layer (int): Maximum number of backbone layers. + Returns: + layer id (int): Returns the layer id of the key. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.layers'): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return num_max_layer - 1 + + +@OPTIMIZER_BUILDERS.register_module() +class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor): + """Different learning rates are set for different layers of backbone.""" + + def add_params(self, params, module, prefix='', is_dcn_module=None): + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. + """ + parameter_groups = {} + logger = get_root_logger() + logger.info(self.paramwise_cfg) + + num_layers = self.paramwise_cfg.get('num_layers') + 2 + layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate') + + logger.info(f'Build LayerDecayOptimizerConstructor ' + f'{layer_decay_rate} - {num_layers}') + + weight_decay = self.base_wd + + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith('.bias') or name in ( + 'pos_embed', 'cls_token'): + group_name = 'no_decay' + this_weight_decay = 0. 
+ else: + group_name = 'decay' + this_weight_decay = weight_decay + + layer_id = get_num_layer_for_vit(name, num_layers) + group_name = 'layer_%d_%s' % (layer_id, group_name) + + if group_name not in parameter_groups: + scale = layer_decay_rate**(num_layers - layer_id - 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + logger.info(f'Param groups ={to_display}') + + params.extend(parameter_groups.values()) diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py index 434378e993..1ede4874da 100644 --- a/mmseg/models/backbones/__init__.py +++ b/mmseg/models/backbones/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .beit import BEiT from .bisenetv1 import BiSeNetV1 from .bisenetv2 import BiSeNetV2 from .cgnet import CGNet @@ -24,5 +25,5 @@ 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV1', 'BiSeNetV2', 'ICNet', 'TIMMBackbone', 'ERFNet', 'PCPVT', - 'SVT', 'STDCNet', 'STDCContextPathNet' + 'SVT', 'STDCNet', 'STDCContextPathNet', 'BEiT' ] diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py new file mode 100644 index 0000000000..e0991a74ae --- /dev/null +++ b/mmseg/models/backbones/beit.py @@ -0,0 +1,556 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import FFN +from mmcv.cnn.utils.weight_init import (constant_init, kaiming_init, + trunc_normal_) +from mmcv.runner import BaseModule, ModuleList, _load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm +from torch.nn.modules.utils import _pair as to_2tuple + +from mmseg.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import PatchEmbed + +try: + from scipy import interpolate +except ImportError: + interpolate = None + + +class BEiTAttention(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qv_bias (bool, optional): If True, add a learnable bias to q, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + if qv_bias: + self.q_bias = nn.Parameter(torch.zeros(embed_dims)) + self.v_bias = nn.Parameter(torch.zeros(embed_dims)) + else: + self.q_bias = None + self.v_bias = None + + self.window_size = window_size + # cls to token & token 2 cls & cls to cls + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + # relative_position_bias_table shape is (2*Wh-1 * 2*Ww-1 + 3, nH) + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) + + # get pair-wise relative position index for + # each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + # coords shape is (2, Wh, Ww) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) + # coords_flatten shape is (2, Wh*Ww) + coords_flatten = torch.flatten(coords, 1) + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :]) + # relative_coords shape is (Wh*Ww, Wh*Ww, 2) + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + # shift to start from 0 + relative_coords[:, :, 0] += window_size[0] - 1 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros( + size=(window_size[0] * window_size[1] + 1, ) * 2, + dtype=relative_coords.dtype) + + # relative_position_index shape is (Wh*Ww, Wh*Ww) + relative_position_index[1:, 1:] = relative_coords.sum(-1) + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=False) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + def init_weights(self): + trunc_normal_(self.relative_position_bias_table, std=0.02) + + def forward(self, x): + """ + Args: + x (tensor): input features with shape of (num_windows*B, N, C). 
+ """ + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) + + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class TransformerEncoderLayer(BaseModule): + """Implements one encoder layer in Vision Transformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): enable bias for qkv if True. Default: True + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + window_size (tuple[int], optional): The height and width of the window. + Default: None. + init_values (float, optional): Initialize the values of BEiTAttention + and FFN with learnable scaling. Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + attn_drop_rate=0., + drop_path_rate=0., + num_fcs=2, + qv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + window_size=None, + init_values=None): + super(TransformerEncoderLayer, self).__init__() + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, embed_dims, postfix=1) + self.add_module(self.norm1_name, norm1) + + self.attn = BEiTAttention( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + qv_bias=qv_bias, + qk_scale=None, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=0., + init_cfg=None) + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=num_fcs, + ffn_drop=0., + dropout_layer=None, + act_cfg=act_cfg, + add_identity=False) + + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, embed_dims, postfix=2) + self.add_module(self.norm2_name, norm2) + + # NOTE: drop path for stochastic depth, we shall see if + # this is better than dropout here + dropout_layer = dict(type='DropPath', drop_prob=drop_path_rate) + self.drop_path = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + self.gamma_1 = nn.Parameter( + init_values * torch.ones((embed_dims)), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((embed_dims)), requires_grad=True) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + @property + def norm2(self): + return getattr(self, self.norm2_name) + + def forward(self, x): + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.ffn(self.norm2(x))) + return x + + +@BACKBONES.register_module() +class BEiT(BaseModule): + """VisionTransformer with support for patch. + + Args: + img_size (int | tuple): Input image size. Default: 224. + patch_size (int): The patch size. Default: 16. + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): embedding dimension. Default: 768. + num_layers (int): depth of transformer. Default: 12. + num_heads (int): number of attention heads. Default: 12. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qkv_bias (bool): enable bias for qkv if True. Default: True. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0 + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): stochastic depth rate. Default 0.0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + patch_norm (bool): Whether to add a norm in PatchEmbed Block. + Default: False. + final_norm (bool): Whether to add a additional layer to normalize + final feature map. Default: False. + interpolate_mode (str): Select the interpolate mode for position + embeding vector resize. Default: bicubic. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. Default: False. + pretrained (str, optional): model pretrained path. Default: None. 
+ init_values (float): Initialize the values of BEiTAttention and FFN + with learnable scaling. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dims=768, + num_layers=12, + num_heads=12, + mlp_ratio=4, + out_indices=-1, + qv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN'), + act_cfg=dict(type='GELU'), + patch_norm=False, + final_norm=False, + interpolate_mode='bicubic', + num_fcs=2, + norm_eval=False, + with_cp=False, + pretrained=None, + init_values=0.1, + init_cfg=None): + super(BEiT, self).__init__(init_cfg=init_cfg) + + if isinstance(img_size, int): + img_size = to_2tuple(img_size) + elif isinstance(img_size, tuple): + if len(img_size) == 1: + img_size = to_2tuple(img_size[0]) + assert len(img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(img_size)}' + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be set at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + self.img_size = img_size + self.patch_size = patch_size + self.interpolate_mode = interpolate_mode + self.norm_eval = norm_eval + self.with_cp = with_cp + self.pretrained = pretrained + + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=patch_size, + padding=0, + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None, + ) + + window_size = (img_size[0] // patch_size, img_size[1] // patch_size) + self.patch_shape = window_size + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) + self.drop_after_pos = nn.Dropout(p=drop_rate) + + if isinstance(out_indices, int): + if out_indices == -1: + out_indices = num_layers - 1 + self.out_indices = [out_indices] + elif isinstance(out_indices, list) or isinstance(out_indices, tuple): + self.out_indices = out_indices + else: + raise TypeError('out_indices must be type of int, list or tuple') + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, num_layers)] + self.layers = ModuleList() + for i in range(num_layers): + self.layers.append( + TransformerEncoderLayer( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=mlp_ratio * embed_dims, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[i], + num_fcs=num_fcs, + qv_bias=qv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + window_size=window_size, + init_values=init_values)) + + self.final_norm = final_norm + if final_norm: + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, embed_dims, postfix=1) + self.add_module(self.norm1_name, norm1) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + def fix_init_weight(self): + + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.layers): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.ffn.layers[1].weight.data, layer_id + 1) + + def resize_rel_pos_embed(self, state_dict, key, dst_num_pos, + dst_patch_shape): + """Resize relative pos_embed weights. + + Args: + state_dict (dict): Key and value of the model. 
+ dst_num_pos (int): The number of relative position encoding + for the current model. + dst_patch_shape (tuple): The number of the patch embedding. + Returns: + state_dict (dict): Interpolate the relative pos_embed weights + in the pre-train model to the current model size. + """ + + rel_pos_bias = state_dict[key] + src_num_pos, num_attn_heads = rel_pos_bias.size() + # dst_num_pos, _ = self.state_dict()[key].size() + # dst_patch_shape = self.patch_shape + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens)**0.5) + dst_size = int((dst_num_pos - num_extra_tokens)**0.5) + if src_size != dst_size: + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q**(i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to( + rel_pos_bias.device)) + + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0) + state_dict[key] = new_rel_pos_bias + + return state_dict + + def init_weights(self): + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + self.fix_init_weight() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg.get('type') == 'Pretrained'): + logger = get_root_logger() + checkpoint = _load_checkpoint( + self.init_cfg['checkpoint'], logger=logger, map_location='cpu') + + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + all_keys = list(state_dict.keys()) + for key in all_keys: + if 'relative_position_index' in key: + state_dict.pop(key) + # In order to keep the center of pos_bias as consistent as + # possible after interpolation, and vice versa in the edge + # area, the geometric sequence interpolation method is adopted. 
+ if 'relative_position_bias_table' in key: + + dst_num_pos, _ = self.state_dict()[key].size() + state_dict = self.resize_rel_pos_embed( + state_dict, key, dst_num_pos, self.patch_shape) + + self.load_state_dict(state_dict, False) + elif self.init_cfg is not None: + super(BEiT, self).init_weights() + else: + # We only implement the 'jax_impl' initialization implemented at + # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + trunc_normal_(self.cls_token, std=.02) + for n, m in self.named_modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + if 'ffn' in n: + nn.init.normal_(m.bias, mean=0., std=1e-6) + else: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + kaiming_init(m, mode='fan_in', bias=0.) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m, val=1.0, bias=0.) + + def forward(self, inputs): + B = inputs.shape[0] + + x, hw_shape = self.patch_embed(inputs) + + # stole cls_tokens impl from Phil Wang, thanks + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i == len(self.layers) - 1: + if self.final_norm: + x = self.norm1(x) + if i in self.out_indices: + # Remove class token and reshape token for decoder head + out = x[:, 1:] + B, _, C = out.shape + out = out.reshape(B, hw_shape[0], hw_shape[1], + C).permute(0, 3, 1, 2).contiguous() + outs.append(out) + + return tuple(outs) + + def train(self, mode=True): + super(BEiT, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.LayerNorm): + m.eval() diff --git a/mmseg/models/necks/__init__.py b/mmseg/models/necks/__init__.py index aba73f165b..ff03186a92 100644 --- a/mmseg/models/necks/__init__.py +++ b/mmseg/models/necks/__init__.py @@ -1,8 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .featurepyramid import Feature2Pyramid from .fpn import FPN from .ic_neck import ICNeck from .jpu import JPU from .mla_neck import MLANeck from .multilevel_neck import MultiLevelNeck -__all__ = ['FPN', 'MultiLevelNeck', 'MLANeck', 'ICNeck', 'JPU'] +__all__ = [ + 'FPN', 'MultiLevelNeck', 'MLANeck', 'ICNeck', 'JPU', 'Feature2Pyramid' +] diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py new file mode 100644 index 0000000000..447062f1c4 --- /dev/null +++ b/mmseg/models/necks/featurepyramid.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from ..builder import NECKS + + +@NECKS.register_module() +class Feature2Pyramid(nn.Module): + """Feature2Pyramid. + + A neck structure connect ViT backbone and decoder_heads. + + Args: + embed_dims (int): embedding dimension. + rescales (list[float]): different sampling multiples were + used to obtain pyramid features. Default: (4, 2, 1, 0.5). + norm (str) : bn or syncbn. 
+ """ + + def __init__(self, embed_dim, rescales, norm='syncbn'): + super(Feature2Pyramid, self).__init__() + self.rescales = rescales + for k in self.rescales: + if k == 4: + if norm == 'bn': + self.upsample_4x = nn.Sequential( + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.BatchNorm2d(embed_dim), + nn.GELU(), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + elif norm == 'syncbn': + self.upsample_4x = nn.Sequential( + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + nn.SyncBatchNorm(embed_dim), + nn.GELU(), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + elif k == 2: + self.upsample_2x = nn.Sequential( + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2)) + elif k == 1: + self.identity = nn.Identity() + elif k == 0.5: + self.downsample_2x = nn.MaxPool2d(kernel_size=2, stride=2) + elif k == 0.25: + self.downsample_4x = nn.MaxPool2d(kernel_size=4, stride=4) + else: + raise KeyError(f'invalid {k} for feature2pyramid') + + def forward(self, inputs): + assert len(inputs) == len(self.rescales) + outputs = [] + if self.upsample_4x is not None: + ops = [ + self.upsample_4x, self.upsample_2x, self.identity, + self.downsample_2x + ] + else: + ops = [ + self.upsample_2x, self.identity, self.downsample_2x, + self.downsample_4x + ] + for i in range(len(inputs)): + outputs.append(ops[i](inputs[i])) + return tuple(outputs) diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py new file mode 100644 index 0000000000..604af59a26 --- /dev/null +++ b/tests/test_models/test_backbones/test_beit.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmseg.models.backbones.beit import BEiT +from .utils import check_norm_state + + +def test_beit_backbone(): + with pytest.raises(TypeError): + # pretrained must be a string path + model = BEiT() + model.init_weights(pretrained=0) + + with pytest.raises(TypeError): + # img_size must be int or tuple + model = BEiT(img_size=512.0) + + with pytest.raises(TypeError): + # out_indices must be int ,list or tuple + model = BEiT(out_indices=1.) + + with pytest.raises(AssertionError): + # The length of img_size tuple must be lower than 3. + BEiT(img_size=(224, 224, 224)) + + with pytest.raises(TypeError): + # Pretrained must be None or Str. 
+ BEiT(pretrained=123) + + # Test img_size isinstance tuple + imgs = torch.randn(1, 3, 224, 224) + model = BEiT(img_size=(224, )) + model.init_weights() + model(imgs) + + # Test img_size isinstance tuple + imgs = torch.randn(1, 3, 224, 224) + model = BEiT(img_size=(224, 224)) + model(imgs) + + # Test norm_eval = True + model = BEiT(norm_eval=True) + model.train() + + # Test BEiT backbone with input size of 224 and patch size of 16 + model = BEiT() + model.init_weights() + model.train() + + # Test qv_bias + model = BEiT(qv_bias=False) + model.train() + + # Test out_indices = list + model = BEiT(out_indices=[-1, -2, -3]) + model.train() + + assert check_norm_state(model.modules(), True) + + # Test normal size input image + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 14, 14) + + # Test BEiT backbone with input size of 256 and patch size of 16 + model = BEiT(img_size=(256, 256)) + model.init_weights() + model.train() + imgs = torch.randn(1, 3, 256, 256) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 16, 16) + + # Test BEiT backbone with input size of 32 and patch size of 16 + model = BEiT(img_size=(32, 32)) + model.init_weights() + model.train() + imgs = torch.randn(1, 3, 32, 32) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 2, 2) + + # Test unbalanced size input image + model = BEiT(img_size=(112, 224)) + model.init_weights() + model.train() + imgs = torch.randn(1, 3, 112, 224) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 7, 14) + + # Test irregular input image + model = BEiT(img_size=(234, 345)) + model.init_weights() + model.train() + imgs = torch.randn(1, 3, 234, 345) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 14, 21) + + # Test with_cp=True + model = BEiT(with_cp=True) + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 14, 14) + + # Test init_values=0 + model = BEiT(init_values=0) + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 14, 14) + + # Test final norm + model = BEiT(final_norm=True) + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 14, 14) + + # Test patch norm + model = BEiT(patch_norm=True) + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert feat[-1].shape == (1, 768, 14, 14) + + +def test_beit_init(): + path = 'PATH_THAT_DO_NOT_EXIST' + # Test all combinations of pretrained and init_cfg + # pretrained=None, init_cfg=None + model = BEiT(pretrained=None, init_cfg=None) + assert model.init_cfg is None + model.init_weights() + + # pretrained=None + # init_cfg loads pretrain from an non-existent file + model = BEiT( + pretrained=None, init_cfg=dict(type='Pretrained', checkpoint=path)) + assert model.init_cfg == dict(type='Pretrained', checkpoint=path) + # Test loading a checkpoint from an non-existent file + with pytest.raises(OSError): + model.init_weights() + + # pretrained=None + # init_cfg=123, whose type is unsupported + model = BEiT(pretrained=None, init_cfg=123) + with pytest.raises(TypeError): + model.init_weights() + + # pretrained loads pretrain from an non-existent file + # init_cfg=None + model = BEiT(pretrained=path, init_cfg=None) + assert model.init_cfg == dict(type='Pretrained', checkpoint=path) + # Test loading a checkpoint from an non-existent file + with pytest.raises(OSError): + model.init_weights() + + # pretrained loads pretrain from an non-existent file + # init_cfg loads pretrain from an non-existent file + with 
pytest.raises(AssertionError): + model = BEiT( + pretrained=path, init_cfg=dict(type='Pretrained', checkpoint=path)) + with pytest.raises(AssertionError): + model = BEiT(pretrained=path, init_cfg=123) + + # pretrain=123, whose type is unsupported + # init_cfg=None + with pytest.raises(TypeError): + model = BEiT(pretrained=123, init_cfg=None) + + # pretrain=123, whose type is unsupported + # init_cfg loads pretrain from an non-existent file + with pytest.raises(AssertionError): + model = BEiT( + pretrained=123, init_cfg=dict(type='Pretrained', checkpoint=path)) + + # pretrain=123, whose type is unsupported + # init_cfg=123, whose type is unsupported + with pytest.raises(AssertionError): + model = BEiT(pretrained=123, init_cfg=123) diff --git a/tests/test_models/test_necks/test_featurepyramid.py b/tests/test_models/test_necks/test_featurepyramid.py new file mode 100644 index 0000000000..a404a54a8c --- /dev/null +++ b/tests/test_models/test_necks/test_featurepyramid.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmseg.models import Feature2Pyramid + + +def test_fpn(): + rescales = [4, 2, 1, 0.5] + embed_dim = 64 + inputs = [torch.randn(1, embed_dim, 32, 32) for i in range(len(rescales))] + + fpn = Feature2Pyramid(embed_dim, rescales, norm='bn') + outputs = fpn(inputs) + assert outputs[0].shape == torch.Size([1, 64, 128, 128]) + assert outputs[1].shape == torch.Size([1, 64, 64, 64]) + assert outputs[2].shape == torch.Size([1, 64, 32, 32]) + assert outputs[3].shape == torch.Size([1, 64, 16, 16]) From d226d417faf6b2fbcb064d37ce8f9f6d2a61f9eb Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 21 Mar 2022 15:47:51 +0800 Subject: [PATCH 02/45] fix --- mmseg/models/backbones/beit.py | 150 +++++++++--------- tests/test_models/test_backbones/test_beit.py | 2 +- 2 files changed, 73 insertions(+), 79 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index e0991a74ae..0b4f6678fb 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -395,73 +395,85 @@ def rescale(param, layer_id): rescale(layer.attn.proj.weight.data, layer_id + 1) rescale(layer.ffn.layers[1].weight.data, layer_id + 1) - def resize_rel_pos_embed(self, state_dict, key, dst_num_pos, - dst_patch_shape): + def resize_rel_pos_embed(self, checkpoint): """Resize relative pos_embed weights. Args: - state_dict (dict): Key and value of the model. - dst_num_pos (int): The number of relative position encoding - for the current model. - dst_patch_shape (tuple): The number of the patch embedding. + checkpoint (dict): Key and value of the pretrain model. Returns: state_dict (dict): Interpolate the relative pos_embed weights in the pre-train model to the current model size. 
""" - rel_pos_bias = state_dict[key] - src_num_pos, num_attn_heads = rel_pos_bias.size() - # dst_num_pos, _ = self.state_dict()[key].size() - # dst_patch_shape = self.patch_shape - if dst_patch_shape[0] != dst_patch_shape[1]: - raise NotImplementedError() - num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * ( - dst_patch_shape[1] * 2 - 1) - src_size = int((src_num_pos - num_extra_tokens)**0.5) - dst_size = int((dst_num_pos - num_extra_tokens)**0.5) - if src_size != dst_size: - extra_tokens = rel_pos_bias[-num_extra_tokens:, :] - rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] - - def geometric_progression(a, r, n): - return a * (1.0 - r**n) / (1.0 - r) - - left, right = 1.01, 1.5 - while right - left > 1e-6: - q = (left + right) / 2.0 - gp = geometric_progression(1, q, src_size // 2) - if gp > dst_size // 2: - right = q - else: - left = q - - dis = [] - cur = 1 - for i in range(src_size // 2): - dis.append(cur) - cur += q**(i + 1) - - r_ids = [-_ for _ in reversed(dis)] - - x = r_ids + [0] + dis - y = r_ids + [0] + dis - - t = dst_size // 2.0 - dx = np.arange(-t, t + 0.1, 1.0) - dy = np.arange(-t, t + 0.1, 1.0) - - all_rel_pos_bias = [] - - for i in range(num_attn_heads): - z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() - f = interpolate.interp2d(x, y, z, kind='cubic') - all_rel_pos_bias.append( - torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to( - rel_pos_bias.device)) - - rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) - new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0) - state_dict[key] = new_rel_pos_bias + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + all_keys = list(state_dict.keys()) + for key in all_keys: + if 'relative_position_index' in key: + state_dict.pop(key) + # In order to keep the center of pos_bias as consistent as + # possible after interpolation, and vice versa in the edge + # area, the geometric sequence interpolation method is adopted. 
+ if 'relative_position_bias_table' in key: + rel_pos_bias = state_dict[key] + src_num_pos, num_attn_heads = rel_pos_bias.size() + dst_num_pos, _ = self.state_dict()[key].size() + dst_patch_shape = self.patch_shape + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - ( + dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens)**0.5) + dst_size = int((dst_num_pos - num_extra_tokens)**0.5) + if src_size != dst_size: + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q**(i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, + src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)).contiguous().view( + -1, 1).to(rel_pos_bias.device)) + + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), + dim=0) + state_dict[key] = new_rel_pos_bias return state_dict @@ -484,25 +496,7 @@ def _init_weights(m): logger = get_root_logger() checkpoint = _load_checkpoint( self.init_cfg['checkpoint'], logger=logger, map_location='cpu') - - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint - - all_keys = list(state_dict.keys()) - for key in all_keys: - if 'relative_position_index' in key: - state_dict.pop(key) - # In order to keep the center of pos_bias as consistent as - # possible after interpolation, and vice versa in the edge - # area, the geometric sequence interpolation method is adopted. 
- if 'relative_position_bias_table' in key: - - dst_num_pos, _ = self.state_dict()[key].size() - state_dict = self.resize_rel_pos_embed( - state_dict, key, dst_num_pos, self.patch_shape) - + state_dict = self.resize_rel_pos_embed(checkpoint) self.load_state_dict(state_dict, False) elif self.init_cfg is not None: super(BEiT, self).init_weights() diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index 604af59a26..a6e1a521d3 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -53,7 +53,7 @@ def test_beit_backbone(): model.train() # Test out_indices = list - model = BEiT(out_indices=[-1, -2, -3]) + model = BEiT(out_indices=[2, 4, 8, 12]) model.train() assert check_norm_state(model.modules(), True) From e6c03bc539733cd09b56f5fc46eb234ae90da3dd Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 21 Mar 2022 18:59:39 +0800 Subject: [PATCH 03/45] fix --- mmseg/models/backbones/beit.py | 1 - mmseg/models/necks/featurepyramid.py | 1 + tests/test_models/test_backbones/test_beit.py | 11 +++++++++++ .../test_necks/test_featurepyramid.py | 19 +++++++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 0b4f6678fb..b4431cb83c 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -404,7 +404,6 @@ def resize_rel_pos_embed(self, checkpoint): state_dict (dict): Interpolate the relative pos_embed weights in the pre-train model to the current model size. """ - if 'state_dict' in checkpoint: state_dict = checkpoint['state_dict'] else: diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py index 447062f1c4..2e961cd78d 100644 --- a/mmseg/models/necks/featurepyramid.py +++ b/mmseg/models/necks/featurepyramid.py @@ -20,6 +20,7 @@ class Feature2Pyramid(nn.Module): def __init__(self, embed_dim, rescales, norm='syncbn'): super(Feature2Pyramid, self).__init__() self.rescales = rescales + self.upsample_4x = None for k in self.rescales: if k == 4: if norm == 'bn': diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index a6e1a521d3..deee9f54e6 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -137,6 +137,17 @@ def test_beit_init(): with pytest.raises(OSError): model.init_weights() + # test resize_rel_pos_embed + value = torch.randn(732, 16) + ckpt = { + 'state_dict': { + 'layers.0.attn.relative_position_index': 0, + 'layers.0.attn.relative_position_bias_table': value + } + } + model = BEiT(img_size=(512, 512)) + model.resize_rel_pos_embed(ckpt) + # pretrained=None # init_cfg=123, whose type is unsupported model = BEiT(pretrained=None, init_cfg=123) diff --git a/tests/test_models/test_necks/test_featurepyramid.py b/tests/test_models/test_necks/test_featurepyramid.py index a404a54a8c..fb37c515a3 100644 --- a/tests/test_models/test_necks/test_featurepyramid.py +++ b/tests/test_models/test_necks/test_featurepyramid.py @@ -5,6 +5,7 @@ def test_fpn(): + # test rescales = [4, 2, 1, 0.5] embed_dim = 64 inputs = [torch.randn(1, embed_dim, 32, 32) for i in range(len(rescales))] @@ -15,3 +16,21 @@ def test_fpn(): assert outputs[1].shape == torch.Size([1, 64, 64, 64]) assert outputs[2].shape == torch.Size([1, 64, 32, 32]) assert outputs[3].shape == torch.Size([1, 64, 16, 16]) + + # test rescales = [2, 1, 0.5, 0.25] + rescales = [2, 1, 0.5, 
0.25] + inputs = [torch.randn(1, embed_dim, 32, 32) for i in range(len(rescales))] + + fpn = Feature2Pyramid(embed_dim, rescales, norm='bn') + outputs = fpn(inputs) + assert outputs[0].shape == torch.Size([1, 64, 64, 64]) + assert outputs[1].shape == torch.Size([1, 64, 32, 32]) + assert outputs[2].shape == torch.Size([1, 64, 16, 16]) + assert outputs[3].shape == torch.Size([1, 64, 8, 8]) + + # test syncbn + fpn = Feature2Pyramid(embed_dim, rescales, norm='syncb') + + # test rescales = [4, 2, 0.25, 0] + rescales = [4, 2, 0.25, 0] + fpn = Feature2Pyramid(embed_dim, rescales) From 9e3f5b6877e14d06be8cb11055193100aab70a3f Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 21 Mar 2022 20:06:26 +0800 Subject: [PATCH 04/45] fix --- .../test_layer_decay_optimizer_constructor.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 tests/test_core/test_layer_decay_optimizer_constructor.py diff --git a/tests/test_core/test_layer_decay_optimizer_constructor.py b/tests/test_core/test_layer_decay_optimizer_constructor.py new file mode 100644 index 0000000000..443e890385 --- /dev/null +++ b/tests/test_core/test_layer_decay_optimizer_constructor.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmseg.core.layer_decay_optimizer_constructor import \ + LayerDecayOptimizerConstructor + + +class BEiTExampleModel(nn.Module): + + def __init__(self, depth): + super().__init__() + self.backbone = nn.ModuleList() + + # add some variables to meet unit test coverate rate + self.backbone.cls_token = nn.Parameter(torch.ones(1)) + self.backbone.patch_embed = nn.Parameter(torch.ones(1)) + self.backbone.layers = nn.ModuleList() + for _ in range(depth): + layer = nn.Conv2d(3, 3, 1) + self.backbone.layers.append(layer) + + +def test_beit_layer_decay_optimizer_constructor(): + + # paramwise_cfg with ConvNeXtExampleModel + model = BEiTExampleModel(depth=12) + optimizer_cfg = dict( + type='AdamW', lr=1, betas=(0.9, 0.999), weight_decay=0.05) + paramwise_cfg = dict(num_layers=12, layer_decay_rate=0.9) + optim_constructor = LayerDecayOptimizerConstructor(optimizer_cfg, + paramwise_cfg) + optim_constructor(model) From ba98cc3ecc627a2583e979e783a430c7f22ff211 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 21 Mar 2022 20:27:34 +0800 Subject: [PATCH 05/45] fix --- tests/test_models/test_backbones/test_beit.py | 3 ++- tests/test_models/test_necks/test_featurepyramid.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index deee9f54e6..9a8f0d327a 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -146,7 +146,8 @@ def test_beit_init(): } } model = BEiT(img_size=(512, 512)) - model.resize_rel_pos_embed(ckpt) + with pytest.raises(AttributeError): + model.resize_rel_pos_embed(ckpt) # pretrained=None # init_cfg=123, whose type is unsupported diff --git a/tests/test_models/test_necks/test_featurepyramid.py b/tests/test_models/test_necks/test_featurepyramid.py index fb37c515a3..558841b4fa 100644 --- a/tests/test_models/test_necks/test_featurepyramid.py +++ b/tests/test_models/test_necks/test_featurepyramid.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import pytest import torch from mmseg.models import Feature2Pyramid @@ -33,4 +34,5 @@ def test_fpn(): # test rescales = [4, 2, 0.25, 0] rescales = [4, 2, 0.25, 0] - fpn = Feature2Pyramid(embed_dim, rescales) + with pytest.raises(KeyError): + fpn = Feature2Pyramid(embed_dim, rescales) From c0462b6fb0b3d1053a2b06b2911942d25744afe8 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 23 Mar 2022 01:16:40 +0800 Subject: [PATCH 06/45] add readme --- README.md | 1 + README_zh-CN.md | 1 + configs/beit/README.md | 58 ++++++++++++++++++++++++++++ tools/model_converters/beit2mmseg.py | 56 +++++++++++++++++++++++++++ 4 files changed, 116 insertions(+) create mode 100644 configs/beit/README.md create mode 100644 tools/model_converters/beit2mmseg.py diff --git a/README.md b/README.md index efe0acf67e..4545efbe48 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ Supported backbones: - [x] [Swin Transformer (ICCV'2021)](configs/swin) - [x] [Twins (NeurIPS'2021)](configs/twins) - [x] [ConvNeXt (CVPR'2022)](configs/convnext) +- [x] [BEiT (ICLR'2022)](configs/beit) Supported methods: diff --git a/README_zh-CN.md b/README_zh-CN.md index 45dabda8eb..cfb6fe0edb 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -84,6 +84,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O - [x] [Swin Transformer (ICCV'2021)](configs/swin) - [x] [Twins (NeurIPS'2021)](configs/twins) - [x] [ConvNeXt (CVPR'2022)](configs/convnext) +- [x] [BEiT (ICLR'2022)](configs/beit) 已支持的算法: diff --git a/configs/beit/README.md b/configs/beit/README.md new file mode 100644 index 0000000000..f5ebd86e60 --- /dev/null +++ b/configs/beit/README.md @@ -0,0 +1,58 @@ +# BEiT + +[BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) + +## Introduction + + + +Official Repo + +Code Snippet + +## Abstract + + + +We introduce a self-supervised vision representation model BEiT, which stands for Bidirectional Encoder representation from Image Transformers. Following BERT developed in the natural language processing area, we propose a masked image modeling task to pretrain vision Transformers. Specifically, each image has two views in our pre-training, i.e, image patches (such as 16x16 pixels), and visual tokens (i.e., discrete tokens). We first "tokenize" the original image into visual tokens. Then we randomly mask some image patches and fed them into the backbone Transformer. The pre-training objective is to recover the original visual tokens based on the corrupted image patches. After pre-training BEiT, we directly fine-tune the model parameters on downstream tasks by appending task layers upon the pretrained encoder. Experimental results on image classification and semantic segmentation show that our model achieves competitive results with previous pre-training methods. For example, base-size BEiT achieves 83.2% top-1 accuracy on ImageNet-1K, significantly outperforming from-scratch DeiT training (81.8%) with the same setup. Moreover, large-size BEiT obtains 86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%). The code and pretrained models are available at [this https URL](https://github.com/microsoft/unilm/tree/master/beit). 
+ + +## Citation + +```bibtex +@article{beit, + title={{BEiT}: {BERT} Pre-Training of Image Transformers}, + author={Hangbo Bao and Li Dong and Furu Wei}, + year={2021}, + eprint={2106.08254}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## Usage + +To use other repositories' pre-trained models, it is necessary to convert keys. + +We provide a script [`beit2mmseg.py`](../../tools/model_converters/beit2mmseg.py) in the tools directory to convert the key of models from [the official repo](https://github.com/microsoft/unilm/tree/master/beit/semantic_segmentation) to MMSegmentation style. + +```shell +python tools/model_converters/beit2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH} +``` + +E.g. + +```shell +python tools/model_converters/swin2mmseg.py https://unilm.blob.core.windows.net/beit/beit_base_patch16_224_pt22k_ft22k.pth pretrain/beit_base_patch16_224_pt22k_ft22k.pth +``` + +This script convert model from `PRETRAIN_PATH` and store the converted model in `STORE_PATH`. + +## Results and models + +### ADE20K + +| Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 16.27 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | xxxx | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.log.json) | diff --git a/tools/model_converters/beit2mmseg.py b/tools/model_converters/beit2mmseg.py new file mode 100644 index 0000000000..94435825e0 --- /dev/null +++ b/tools/model_converters/beit2mmseg.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os.path as osp +from collections import OrderedDict + +import torch +import mmcv +from mmcv.runner import CheckpointLoader + + +def convert_beit(ckpt): + new_ckpt = OrderedDict() + + for k, v in ckpt.items(): + if k.startswith('patch_embed'): + new_key = k.replace('patch_embed.proj', 'patch_embed.projection') + new_ckpt[new_key] = v + if k.startswith('blocks'): + new_key = k.replace('blocks', 'layers') + if 'norm' in new_key: + new_key = new_key.replace('norm', 'ln') + elif 'mlp.fc1' in new_key: + new_key = new_key.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in new_key: + new_key = new_key.replace('mlp.fc2', 'ffn.layers.1') + new_ckpt[new_key] = v + else: + new_key = k + new_ckpt[new_key] = v + + return new_ckpt + + +def main(): + parser = argparse.ArgumentParser( + description='Convert keys in official pretrained beit models to' + 'MMSegmentation style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. + parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + weight = convert_beit(state_dict) + mmcv.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + +if __name__ == '__main__': + main() From c2c03e5a8b602e42d6a6abce3c95689865d2e764 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 23 Mar 2022 10:02:28 +0800 Subject: [PATCH 07/45] fix --- configs/beit/README.md | 4 +-- configs/beit/beit.yml | 45 ++++++++++++++++++++++++++++ model-index.yml | 1 + tools/model_converters/beit2mmseg.py | 26 ++++++++-------- 4 files changed, 61 insertions(+), 15 deletions(-) create mode 100644 configs/beit/beit.yml diff --git a/configs/beit/README.md b/configs/beit/README.md index f5ebd86e60..c1feadc7fc 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -21,7 +21,7 @@ We introduce a self-supervised vision representation model BEiT, which stands fo ```bibtex @article{beit, - title={{BEiT}: {BERT} Pre-Training of Image Transformers}, + title={{BEiT}: {BERT} Pre-Training of Image Transformers}, author={Hangbo Bao and Li Dong and Furu Wei}, year={2021}, eprint={2106.08254}, @@ -55,4 +55,4 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 16.27 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.pth) 
| [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | xxxx | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml new file mode 100644 index 0000000000..f518cefe41 --- /dev/null +++ b/configs/beit/beit.yml @@ -0,0 +1,45 @@ +Models: +- Name: upernet_beit_base_12_640_slide_160k_ade20k_pt2ft + In Collection: UperNet + Metadata: + backbone: BEiT-B + crop size: (640,640) + lr schd: 160000 + inference time (ms/im): + - value: 500.0 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640,640) + Training Memory (GB): 16.27 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 53.08 + mIoU(ms+flip): 53.84 + Config: configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.pth +- Name: upernet_beit_large_24_640_slide_160k_ade20k_ms + In Collection: UperNet + Metadata: + backbone: BEiT-L + crop size: (640,640) + lr schd: 320000 + inference time (ms/im): + - value: 1041.67 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (640,640) + Training Memory (GB): 23.18 + Results: + - Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 56.33 + mIoU(ms+flip): 56.84 + Config: configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth diff --git a/model-index.yml b/model-index.yml index 235ad7f6e7..d8e9516bf4 100644 --- a/model-index.yml +++ b/model-index.yml @@ -1,6 +1,7 @@ Import: - configs/ann/ann.yml - configs/apcnet/apcnet.yml +- configs/beit/beit.yml - configs/bisenetv1/bisenetv1.yml - configs/bisenetv2/bisenetv2.yml - configs/ccnet/ccnet.yml diff --git a/tools/model_converters/beit2mmseg.py b/tools/model_converters/beit2mmseg.py index 94435825e0..d23cfdb0b3 100644 --- a/tools/model_converters/beit2mmseg.py +++ b/tools/model_converters/beit2mmseg.py @@ -3,8 +3,8 @@ import os.path as osp from collections import OrderedDict -import torch import mmcv +import torch from mmcv.runner import CheckpointLoader @@ -13,20 +13,20 @@ def convert_beit(ckpt): for k, v in ckpt.items(): if k.startswith('patch_embed'): - new_key = k.replace('patch_embed.proj', 'patch_embed.projection') - new_ckpt[new_key] = v + new_key = k.replace('patch_embed.proj', 'patch_embed.projection') + new_ckpt[new_key] = v if k.startswith('blocks'): - new_key = k.replace('blocks', 'layers') - if 'norm' in new_key: - new_key = new_key.replace('norm', 'ln') - elif 'mlp.fc1' in new_key: - new_key = new_key.replace('mlp.fc1', 'ffn.layers.0.0') - elif 'mlp.fc2' in new_key: - new_key = new_key.replace('mlp.fc2', 'ffn.layers.1') - new_ckpt[new_key] = v + new_key = 
k.replace('blocks', 'layers') + if 'norm' in new_key: + new_key = new_key.replace('norm', 'ln') + elif 'mlp.fc1' in new_key: + new_key = new_key.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in new_key: + new_key = new_key.replace('mlp.fc2', 'ffn.layers.1') + new_ckpt[new_key] = v else: - new_key = k - new_ckpt[new_key] = v + new_key = k + new_ckpt[new_key] = v return new_ckpt From a4fff29b82a1d8749ea208f09538cf21c0a19a7b Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 23 Mar 2022 10:25:42 +0800 Subject: [PATCH 08/45] fix --- configs/beit/beit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index f518cefe41..9fcc8ad6ee 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -32,7 +32,7 @@ Models: hardware: V100 backend: PyTorch batch size: 1 - mode: FP32 + mode: FP16 resolution: (640,640) Training Memory (GB): 23.18 Results: From bdf5f777968cf5d01aea4ba9af76b9e6bcb07c52 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 23 Mar 2022 11:06:41 +0800 Subject: [PATCH 09/45] fix --- configs/beit/README.md | 2 +- configs/beit/beit.yml | 4 ++-- ...=> upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py} | 0 ...upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py} | 4 +++- 4 files changed, 6 insertions(+), 4 deletions(-) rename configs/beit/{upernet_beit_large_24_640_slide_160k_ade20k_ms.py => upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py} (100%) rename configs/beit/{upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py => upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py} (94%) diff --git a/configs/beit/README.md b/configs/beit/README.md index c1feadc7fc..85c6d09b46 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -55,4 +55,4 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 16.27 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | 56.84 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index 9fcc8ad6ee..9913160900 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -21,7 +21,7 @@ Models: mIoU(ms+flip): 53.84 Config: configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.pth -- Name: upernet_beit_large_24_640_slide_160k_ade20k_ms +- Name: upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft In Collection: UperNet Metadata: backbone: BEiT-L @@ -41,5 +41,5 @@ Models: Metrics: mIoU: 56.33 mIoU(ms+flip): 56.84 - Config: configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py + Config: configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth diff --git a/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py b/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py similarity index 100% rename from configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_ms.py rename to configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py diff --git a/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py b/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py similarity index 94% rename from configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py rename to configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py index 909b33534c..1bdfb6383a 100644 --- a/configs/beit/upernet_beit_large_24_640_slide_160k_ade20k_pt2ft.py +++ b/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py @@ -49,4 +49,6 @@ data = dict(samples_per_gpu=1) optimizer_config = dict( - type='GradientCumulativeOptimizerHook', cumulative_iters=2) + type='GradientCumulativeFp16OptimizerHook', cumulative_iters=2) + +fp16 = dict() From ac7c52a8de1decbf8551e622822ddc3dce23973d Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Thu, 24 Mar 2022 17:53:25 +0800 Subject: [PATCH 10/45] fix --- mmseg/models/necks/featurepyramid.py | 32 ++++++++----------- .../test_necks/test_featurepyramid.py | 12 +++---- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py index 2e961cd78d..34a293fe38 100644 --- a/mmseg/models/necks/featurepyramid.py +++ b/mmseg/models/necks/featurepyramid.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn +from mmcv.cnn import build_norm_layer from ..builder import NECKS @@ -17,30 +18,23 @@ class Feature2Pyramid(nn.Module): norm (str) : bn or syncbn. 
""" - def __init__(self, embed_dim, rescales, norm='syncbn'): + def __init__(self, + embed_dim, + rescales, + norm_cfg=dict(type='SyncBN', requires_grad=True)): super(Feature2Pyramid, self).__init__() self.rescales = rescales self.upsample_4x = None for k in self.rescales: if k == 4: - if norm == 'bn': - self.upsample_4x = nn.Sequential( - nn.ConvTranspose2d( - embed_dim, embed_dim, kernel_size=2, stride=2), - nn.BatchNorm2d(embed_dim), - nn.GELU(), - nn.ConvTranspose2d( - embed_dim, embed_dim, kernel_size=2, stride=2), - ) - elif norm == 'syncbn': - self.upsample_4x = nn.Sequential( - nn.ConvTranspose2d( - embed_dim, embed_dim, kernel_size=2, stride=2), - nn.SyncBatchNorm(embed_dim), - nn.GELU(), - nn.ConvTranspose2d( - embed_dim, embed_dim, kernel_size=2, stride=2), - ) + self.upsample_4x = nn.Sequential( + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + build_norm_layer(norm_cfg, embed_dim)[1], + nn.GELU(), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) elif k == 2: self.upsample_2x = nn.Sequential( nn.ConvTranspose2d( diff --git a/tests/test_models/test_necks/test_featurepyramid.py b/tests/test_models/test_necks/test_featurepyramid.py index 558841b4fa..7f1597c670 100644 --- a/tests/test_models/test_necks/test_featurepyramid.py +++ b/tests/test_models/test_necks/test_featurepyramid.py @@ -11,7 +11,8 @@ def test_fpn(): embed_dim = 64 inputs = [torch.randn(1, embed_dim, 32, 32) for i in range(len(rescales))] - fpn = Feature2Pyramid(embed_dim, rescales, norm='bn') + fpn = Feature2Pyramid( + embed_dim, rescales, norm_cfg=dict(type='BN', requires_grad=True)) outputs = fpn(inputs) assert outputs[0].shape == torch.Size([1, 64, 128, 128]) assert outputs[1].shape == torch.Size([1, 64, 64, 64]) @@ -22,17 +23,16 @@ def test_fpn(): rescales = [2, 1, 0.5, 0.25] inputs = [torch.randn(1, embed_dim, 32, 32) for i in range(len(rescales))] - fpn = Feature2Pyramid(embed_dim, rescales, norm='bn') + fpn = Feature2Pyramid( + embed_dim, rescales, norm_cfg=dict(type='BN', requires_grad=True)) outputs = fpn(inputs) assert outputs[0].shape == torch.Size([1, 64, 64, 64]) assert outputs[1].shape == torch.Size([1, 64, 32, 32]) assert outputs[2].shape == torch.Size([1, 64, 16, 16]) assert outputs[3].shape == torch.Size([1, 64, 8, 8]) - # test syncbn - fpn = Feature2Pyramid(embed_dim, rescales, norm='syncb') - # test rescales = [4, 2, 0.25, 0] rescales = [4, 2, 0.25, 0] with pytest.raises(KeyError): - fpn = Feature2Pyramid(embed_dim, rescales) + fpn = Feature2Pyramid( + embed_dim, rescales, norm_cfg=dict(type='BN', requires_grad=True)) From 720285e18b6db349307c3f4f4e8fc24f61beaf44 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 14:45:30 +0800 Subject: [PATCH 11/45] fix --- .../core/layer_decay_optimizer_constructor.py | 2 +- mmseg/models/necks/featurepyramid.py | 3 +- .../test_layer_decay_optimizer_constructor.py | 43 +++++++++++++++++-- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/mmseg/core/layer_decay_optimizer_constructor.py b/mmseg/core/layer_decay_optimizer_constructor.py index b0c50f6f37..f01acc27ee 100644 --- a/mmseg/core/layer_decay_optimizer_constructor.py +++ b/mmseg/core/layer_decay_optimizer_constructor.py @@ -69,7 +69,7 @@ def add_params(self, params, module, prefix='', is_dcn_module=None): this_weight_decay = weight_decay layer_id = get_num_layer_for_vit(name, num_layers) - group_name = 'layer_%d_%s' % (layer_id, group_name) + group_name = f'layer_{layer_id}_{group_name}' if group_name not in parameter_groups: 
scale = layer_decay_rate**(num_layers - layer_id - 1) diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py index 34a293fe38..564ed199fc 100644 --- a/mmseg/models/necks/featurepyramid.py +++ b/mmseg/models/necks/featurepyramid.py @@ -15,7 +15,8 @@ class Feature2Pyramid(nn.Module): embed_dims (int): embedding dimension. rescales (list[float]): different sampling multiples were used to obtain pyramid features. Default: (4, 2, 1, 0.5). - norm (str) : bn or syncbn. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='SyncBN'). """ def __init__(self, diff --git a/tests/test_core/test_layer_decay_optimizer_constructor.py b/tests/test_core/test_layer_decay_optimizer_constructor.py index 443e890385..f595d31331 100644 --- a/tests/test_core/test_layer_decay_optimizer_constructor.py +++ b/tests/test_core/test_layer_decay_optimizer_constructor.py @@ -5,6 +5,29 @@ from mmseg.core.layer_decay_optimizer_constructor import \ LayerDecayOptimizerConstructor +layer_wise_gt_lst = [{ + 'weight_decay': 0.0, + 'lr_scale': 16 +}, { + 'weight_decay': 0.05, + 'lr_scale': 8 +}, { + 'weight_decay': 0.0, + 'lr_scale': 8 +}, { + 'weight_decay': 0.05, + 'lr_scale': 4 +}, { + 'weight_decay': 0.0, + 'lr_scale': 4 +}, { + 'weight_decay': 0.05, + 'lr_scale': 2 +}, { + 'weight_decay': 0.0, + 'lr_scale': 2 +}] + class BEiTExampleModel(nn.Module): @@ -21,13 +44,27 @@ def __init__(self, depth): self.backbone.layers.append(layer) +def check_beit_adamw_optimizer(optimizer, gt_lst): + assert isinstance(optimizer, torch.optim.AdamW) + assert optimizer.defaults['lr'] == 1 + assert optimizer.defaults['weight_decay'] == 0.05 + param_groups = optimizer.param_groups + # 1 layer (cls_token and patch_embed) + 3 layers * 2 (w, b) = 7 layers + assert len(param_groups) == 7 + for i, param_dict in enumerate(param_groups): + assert param_dict['weight_decay'] == gt_lst[i]['weight_decay'] + assert param_dict['lr_scale'] == gt_lst[i]['lr_scale'] + assert param_dict['lr_scale'] == param_dict['lr'] + + def test_beit_layer_decay_optimizer_constructor(): # paramwise_cfg with ConvNeXtExampleModel - model = BEiTExampleModel(depth=12) + model = BEiTExampleModel(depth=3) optimizer_cfg = dict( type='AdamW', lr=1, betas=(0.9, 0.999), weight_decay=0.05) - paramwise_cfg = dict(num_layers=12, layer_decay_rate=0.9) + paramwise_cfg = dict(num_layers=3, layer_decay_rate=2) optim_constructor = LayerDecayOptimizerConstructor(optimizer_cfg, paramwise_cfg) - optim_constructor(model) + optimizer = optim_constructor(model) + check_beit_adamw_optimizer(optimizer, layer_wise_gt_lst) From 22d864e007314c064291fd66b8bc82939e8acb58 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 15:05:24 +0800 Subject: [PATCH 12/45] add link --- configs/beit/README.md | 4 ++-- configs/beit/beit.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index 85c6d09b46..514ce07b89 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -54,5 +54,5 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 16.27 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 16.27 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2f-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2f-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index 9913160900..5dc60314d0 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -20,7 +20,7 @@ Models: mIoU: 53.08 mIoU(ms+flip): 53.84 Config: configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_base.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2f-eead221d.pth - Name: upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft In Collection: UperNet Metadata: @@ -42,4 +42,4 @@ Models: mIoU: 56.33 mIoU(ms+flip): 56.84 Config: configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/beit_large.pth + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2f-8fc0dd5d.pth From 0864be3a01ff5006431ddb349b8a0a656b4b7ecd Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 15:15:49 +0800 Subject: [PATCH 13/45] fix memory --- configs/beit/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index 514ce07b89..d0d4ff2dc9 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -54,5 +54,5 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 16.27 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2f-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 23.18 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2f-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2f-eead221d.pth) | 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2f-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.log.json) | From 5b1b7b76e664a86e4658767d811d01a77486718b Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 16:38:44 +0800 Subject: [PATCH 14/45] fix --- configs/beit/README.md | 2 +- configs/beit/beit.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index d0d4ff2dc9..ba75a6ea6c 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -43,7 +43,7 @@ python tools/model_converters/beit2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH} E.g. ```shell -python tools/model_converters/swin2mmseg.py https://unilm.blob.core.windows.net/beit/beit_base_patch16_224_pt22k_ft22k.pth pretrain/beit_base_patch16_224_pt22k_ft22k.pth +python tools/model_converters/beit2mmseg.py https://unilm.blob.core.windows.net/beit/beit_base_patch16_224_pt22k_ft22k.pth pretrain/beit_base_patch16_224_pt22k_ft22k.pth ``` This script convert model from `PRETRAIN_PATH` and store the converted model in `STORE_PATH`. diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index 5dc60314d0..f7a9e31877 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -12,7 +12,7 @@ Models: batch size: 1 mode: FP32 resolution: (640,640) - Training Memory (GB): 16.27 + Training Memory (GB): 15.88 Results: - Task: Semantic Segmentation Dataset: ADE20K @@ -34,7 +34,7 @@ Models: batch size: 1 mode: FP16 resolution: (640,640) - Training Memory (GB): 23.18 + Training Memory (GB): 22.64 Results: - Task: Semantic Segmentation Dataset: ADE20K From e55c3b19a120403e85e3c96e737189efbea9d4d4 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 16:47:05 +0800 Subject: [PATCH 15/45] fix --- configs/beit/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/configs/beit/README.md b/configs/beit/README.md index ba75a6ea6c..54d7fa808b 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -48,6 +48,13 @@ python tools/model_converters/beit2mmseg.py https://unilm.blob.core.windows.net/ This script convert model from `PRETRAIN_PATH` and store the converted model in `STORE_PATH`. 
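For reference, the renaming applied by `convert_beit` amounts to a handful of string substitutions. The snippet below replays those rules on a few made-up keys; it only mirrors the logic in `beit2mmseg.py` and is not part of the conversion tool itself.

```python
# Illustration of the key renaming performed by convert_beit() in
# tools/model_converters/beit2mmseg.py (the example keys are made up).
for k in ('patch_embed.proj.weight', 'blocks.0.norm1.weight',
          'blocks.0.mlp.fc1.weight', 'blocks.0.mlp.fc2.weight'):
    new_k = k.replace('patch_embed.proj', 'patch_embed.projection')
    if k.startswith('blocks'):
        new_k = k.replace('blocks', 'layers')
        if 'norm' in new_k:
            new_k = new_k.replace('norm', 'ln')
        elif 'mlp.fc1' in new_k:
            new_k = new_k.replace('mlp.fc1', 'ffn.layers.0.0')
        elif 'mlp.fc2' in new_k:
            new_k = new_k.replace('mlp.fc2', 'ffn.layers.1')
    print(k, '->', new_k)
# patch_embed.proj.weight  -> patch_embed.projection.weight
# blocks.0.norm1.weight    -> layers.0.ln1.weight
# blocks.0.mlp.fc1.weight  -> layers.0.ffn.layers.0.0.weight
# blocks.0.mlp.fc2.weight  -> layers.0.ffn.layers.1.weight
```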
+In our default setting, pretrained models could be defined below: + + | pretrained models | original models | + | ------ | -------- | + |BEiT_base.pth | ['BEiT_base'](https://unilm.blob.core.windows.net/beit/beit_base_patch16_224_pt22k_ft22k.pth) | + |BEiT_large.pth | ['BEiT_large'](https://unilm.blob.core.windows.net/beit/beit_large_patch16_224_pt22k_ft22k.pth) | + ## Results and models ### ADE20K From ca488f4187632ffa6c56924fd4ad55e7e07a8052 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 21:29:11 +0800 Subject: [PATCH 16/45] fix --- configs/beit/README.md | 6 ++--- configs/beit/beit.yml | 12 +++++----- ...ernet_beit_base_640x640_160k_ade20k_ms.py} | 10 ++++----- ...rnet_beit_base_8x2_640x640_160k_ade20k.py} | 4 ++-- ...beit_large_fp16_640x640_160k_ade20k_ms.py} | 7 +++--- ...eit_large_fp16_8x1_640x640_160k_ade20k.py} | 6 ++--- .../core/layer_decay_optimizer_constructor.py | 2 +- mmseg/models/backbones/beit.py | 22 ++++++------------- mmseg/models/necks/featurepyramid.py | 2 +- ...turepyramid.py => test_feature2pyramid.py} | 2 +- 10 files changed, 32 insertions(+), 41 deletions(-) rename configs/beit/{upernet_beit_base_12_640_slide_160k_ade20k_ms.py => upernet_beit_base_640x640_160k_ade20k_ms.py} (92%) rename configs/beit/{upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py => upernet_beit_base_8x2_640x640_160k_ade20k.py} (96%) rename configs/beit/{upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py => upernet_beit_large_fp16_640x640_160k_ade20k_ms.py} (93%) rename configs/beit/{upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py => upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py} (93%) rename tests/test_models/test_necks/{test_featurepyramid.py => test_feature2pyramid.py} (97%) diff --git a/configs/beit/README.md b/configs/beit/README.md index 54d7fa808b..d15857761a 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -8,7 +8,7 @@ Official Repo -Code Snippet +Code Snippet ## Abstract @@ -61,5 +61,5 @@ In our default setting, pretrained models could be defined below: | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2f-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 
0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2f-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k/upernet_beit_base_8x2_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index f7a9e31877..a6ac4c0494 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -1,5 +1,5 @@ Models: -- Name: upernet_beit_base_12_640_slide_160k_ade20k_pt2ft +- Name: upernet_beit_base_8x2_640x640_160k_ade20k In Collection: UperNet Metadata: backbone: BEiT-B @@ -19,9 +19,9 @@ Models: Metrics: mIoU: 53.08 mIoU(ms+flip): 53.84 - Config: configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft_upernet_beit_base_12_640_slide_160k_ade20k_pt2f-eead221d.pth -- Name: upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft + Config: configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth +- Name: upernet_beit_large_fp16_8x1_640x640_160k_ade20k In Collection: UperNet Metadata: backbone: BEiT-L @@ -41,5 +41,5 @@ Models: Metrics: mIoU: 56.33 mIoU(ms+flip): 56.84 - Config: configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft_upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2f-8fc0dd5d.pth + Config: configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth diff --git 
a/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_ms.py b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py similarity index 92% rename from configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_ms.py rename to configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py index acffb362d5..59c354adc8 100644 --- a/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py @@ -2,13 +2,13 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] -crop_size = (640, 640) +crop_size=(640, 640) model = dict( pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k_new1.pth', backbone=dict( type='BEiT', - img_size=(640, 640), + img_size=crop_size, patch_size=16, embed_dims=768, num_layers=12, @@ -45,7 +45,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -min_size = 640 + find_unused_parameters = True test_pipeline = [ @@ -56,7 +56,7 @@ img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=True, transforms=[ - dict(type='Resize', keep_ratio=True, min_size=min_size), + dict(type='Resize', keep_ratio=True, min_size=640), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), @@ -66,5 +66,5 @@ data = dict( val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline), - samples_per_gpu=2, + samples_per_gpu=2 ) diff --git a/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py b/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py similarity index 96% rename from configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py rename to configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py index 4d2842bf77..ba5b04eac0 100644 --- a/configs/beit/upernet_beit_base_12_640_slide_160k_ade20k_pt2ft.py +++ b/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py @@ -2,13 +2,13 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] -crop_size = (640, 640) +crop_size=(640, 640) model = dict( pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k.pth', backbone=dict( type='BEiT', - img_size=(640, 640), + img_size=crop_size, patch_size=16, embed_dims=768, num_layers=12, diff --git a/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py similarity index 93% rename from configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py rename to configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py index be492b56ea..668aaa1055 100644 --- a/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py @@ -2,13 +2,13 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' ] -crop_size = (640, 640) +crop_size=(640, 640) model = dict( pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', backbone=dict( type='BEiT', - img_size=(640, 640), + img_size=crop_size, patch_size=16, embed_dims=1024, num_layers=24, @@ -49,7 +49,6 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -min_size = 640 find_unused_parameters = True @@ -61,7 +60,7 @@ img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], flip=True, transforms=[ - dict(type='Resize', 
keep_ratio=True, min_size=min_size), + dict(type='Resize', keep_ratio=True, min_size=640), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='ImageToTensor', keys=['img']), diff --git a/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py similarity index 93% rename from configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py rename to configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py index 1bdfb6383a..3c94c11f8f 100644 --- a/configs/beit/upernet_beit_large_fp16_24_640_slide_160k_ade20k_pt2ft.py +++ b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py @@ -2,13 +2,13 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' ] -crop_size = (640, 640) +crop_size=(640, 640) model = dict( pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', backbone=dict( type='BEiT', - img_size=(640, 640), + img_size=crop_size, patch_size=16, embed_dims=1024, num_layers=24, @@ -17,7 +17,7 @@ qv_bias=True, init_values=1e-6, drop_path_rate=0.2, - out_indices=[7, 11, 15, 23], + out_indices=[7, 11, 15, 23] ), neck=dict(embed_dim=1024, rescales=[4, 2, 1, 0.5]), decode_head=dict( diff --git a/mmseg/core/layer_decay_optimizer_constructor.py b/mmseg/core/layer_decay_optimizer_constructor.py index f01acc27ee..4456fb4652 100644 --- a/mmseg/core/layer_decay_optimizer_constructor.py +++ b/mmseg/core/layer_decay_optimizer_constructor.py @@ -93,7 +93,7 @@ def add_params(self, params, module, prefix='', is_dcn_module=None): 'param_names': parameter_groups[key]['param_names'], 'lr_scale': parameter_groups[key]['lr_scale'], 'lr': parameter_groups[key]['lr'], - 'weight_decay': parameter_groups[key]['weight_decay'], + 'weight_decay': parameter_groups[key]['weight_decay'] } logger.info(f'Param groups ={to_display}') diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index b4431cb83c..27da90da26 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -import math import warnings import numpy as np @@ -342,8 +341,7 @@ def __init__(self, stride=patch_size, padding=0, norm_cfg=norm_cfg if patch_norm else None, - init_cfg=None, - ) + init_cfg=None) window_size = (img_size[0] // patch_size, img_size[1] // patch_size) self.patch_shape = window_size @@ -386,15 +384,6 @@ def __init__(self, def norm1(self): return getattr(self, self.norm1_name) - def fix_init_weight(self): - - def rescale(param, layer_id): - param.div_(math.sqrt(2.0 * layer_id)) - - for layer_id, layer in enumerate(self.layers): - rescale(layer.attn.proj.weight.data, layer_id + 1) - rescale(layer.ffn.layers[1].weight.data, layer_id + 1) - def resize_rel_pos_embed(self, checkpoint): """Resize relative pos_embed weights. @@ -423,6 +412,7 @@ def resize_rel_pos_embed(self, checkpoint): dst_patch_shape = self.patch_shape if dst_patch_shape[0] != dst_patch_shape[1]: raise NotImplementedError() + # Count the number of extra tokens. num_extra_tokens = dst_num_pos - ( dst_patch_shape[0] * 2 - 1) * ( dst_patch_shape[1] * 2 - 1) @@ -432,9 +422,11 @@ def resize_rel_pos_embed(self, checkpoint): extra_tokens = rel_pos_bias[-num_extra_tokens:, :] rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + # Geometric sequence interpolation. 
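+            # Resample the relative position bias table from src_size to
+            # dst_size points per axis: a ratio q is binary-searched so that
+            # the geometric series 1 + q + ... + q^(n-1), n = src_size // 2,
+            # reaches the new half-width dst_size // 2. Source samples are
+            # therefore spaced with geometrically growing gaps away from the
+            # centre before being interpolated onto the regular integer grid
+            # of the target resolution.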
def geometric_progression(a, r, n): return a * (1.0 - r**n) / (1.0 - r) + # Here is a binary function. left, right = 1.01, 1.5 while right - left > 1e-6: q = (left + right) / 2.0 @@ -443,7 +435,8 @@ def geometric_progression(a, r, n): right = q else: left = q - + # The position of each interpolated point is determined + # by the ratio obtained by dichotomy. dis = [] cur = 1 for i in range(src_size // 2): @@ -458,7 +451,7 @@ def geometric_progression(a, r, n): t = dst_size // 2.0 dx = np.arange(-t, t + 0.1, 1.0) dy = np.arange(-t, t + 0.1, 1.0) - + # Interpolation functions are being executed and called. all_rel_pos_bias = [] for i in range(num_attn_heads): @@ -488,7 +481,6 @@ def _init_weights(m): nn.init.constant_(m.weight, 1.0) self.apply(_init_weights) - self.fix_init_weight() if (isinstance(self.init_cfg, dict) and self.init_cfg.get('type') == 'Pretrained'): diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py index 564ed199fc..1111c4d2b5 100644 --- a/mmseg/models/necks/featurepyramid.py +++ b/mmseg/models/necks/featurepyramid.py @@ -16,7 +16,7 @@ class Feature2Pyramid(nn.Module): rescales (list[float]): different sampling multiples were used to obtain pyramid features. Default: (4, 2, 1, 0.5). norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='SyncBN'). + Default: dict(type='SyncBN', requires_grad=True). """ def __init__(self, diff --git a/tests/test_models/test_necks/test_featurepyramid.py b/tests/test_models/test_necks/test_feature2pyramid.py similarity index 97% rename from tests/test_models/test_necks/test_featurepyramid.py rename to tests/test_models/test_necks/test_feature2pyramid.py index 7f1597c670..d62708d767 100644 --- a/tests/test_models/test_necks/test_featurepyramid.py +++ b/tests/test_models/test_necks/test_feature2pyramid.py @@ -5,7 +5,7 @@ from mmseg.models import Feature2Pyramid -def test_fpn(): +def test_Feature2Pyramid(): # test rescales = [4, 2, 1, 0.5] embed_dim = 64 From b9cb63980e1092a195cf8f2a789c5f8fca95baff Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 22:11:42 +0800 Subject: [PATCH 17/45] fix --- configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py | 5 ++--- configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py | 2 +- .../beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py | 2 +- .../beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py | 5 ++--- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py index 59c354adc8..8ede064b60 100644 --- a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py @@ -2,7 +2,7 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] -crop_size=(640, 640) +crop_size = (640, 640) model = dict( pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k_new1.pth', @@ -66,5 +66,4 @@ data = dict( val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline), - samples_per_gpu=2 -) + samples_per_gpu=2) diff --git a/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py b/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py index ba5b04eac0..30525e5dde 100644 --- a/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py +++ b/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py @@ -2,7 +2,7 @@ '../_base_/models/upernet_beit.py', 
'../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] -crop_size=(640, 640) +crop_size = (640, 640) model = dict( pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k.pth', diff --git a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py index 668aaa1055..536e0505f3 100644 --- a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py @@ -2,7 +2,7 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' ] -crop_size=(640, 640) +crop_size = (640, 640) model = dict( pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', diff --git a/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py index 3c94c11f8f..40a64a94ff 100644 --- a/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py +++ b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py @@ -2,7 +2,7 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' ] -crop_size=(640, 640) +crop_size = (640, 640) model = dict( pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', @@ -17,8 +17,7 @@ qv_bias=True, init_values=1e-6, drop_path_rate=0.2, - out_indices=[7, 11, 15, 23] - ), + out_indices=[7, 11, 15, 23]), neck=dict(embed_dim=1024, rescales=[4, 2, 1, 0.5]), decode_head=dict( in_channels=[1024, 1024, 1024, 1024], From 468feb6bc7327b17c20536a9ec8e7dc1f846b0e7 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Fri, 25 Mar 2022 23:09:15 +0800 Subject: [PATCH 18/45] fix --- configs/beit/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/configs/beit/README.md b/configs/beit/README.md index d15857761a..6beaeb239b 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -55,6 +55,22 @@ In our default setting, pretrained models could be defined below: |BEiT_base.pth | ['BEiT_base'](https://unilm.blob.core.windows.net/beit/beit_base_patch16_224_pt22k_ft22k.pth) | |BEiT_large.pth | ['BEiT_large'](https://unilm.blob.core.windows.net/beit/beit_large_patch16_224_pt22k_ft22k.pth) | +Verify the single-scale results of the model: + +```shell +sh tools/dist_test.sh \ +configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py \ +upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU +``` + +For multi-scale inference: + +```shell +sh tools/dist_test.sh \ +configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py \ +upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU +``` + ## Results and models ### ADE20K From 2dac77adbe52b28c6a67171a6bfddc013323e723 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Sat, 26 Mar 2022 00:10:05 +0800 Subject: [PATCH 19/45] fix --- configs/beit/README.md | 4 ++++ .../beit/upernet_beit_base_640x640_160k_ade20k_ms.py | 2 -- .../upernet_beit_large_fp16_640x640_160k_ade20k_ms.py | 10 ++-------- .../upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py | 5 +---- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index 6beaeb239b..3d56c5d3ea 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -16,6 +16,10 @@ We introduce a self-supervised vision representation model 
BEiT, which stands for Bidirectional Encoder representation from Image Transformers. Following BERT developed in the natural language processing area, we propose a masked image modeling task to pretrain vision Transformers. Specifically, each image has two views in our pre-training, i.e, image patches (such as 16x16 pixels), and visual tokens (i.e., discrete tokens). We first "tokenize" the original image into visual tokens. Then we randomly mask some image patches and fed them into the backbone Transformer. The pre-training objective is to recover the original visual tokens based on the corrupted image patches. After pre-training BEiT, we directly fine-tune the model parameters on downstream tasks by appending task layers upon the pretrained encoder. Experimental results on image classification and semantic segmentation show that our model achieves competitive results with previous pre-training methods. For example, base-size BEiT achieves 83.2% top-1 accuracy on ImageNet-1K, significantly outperforming from-scratch DeiT training (81.8%) with the same setup. Moreover, large-size BEiT obtains 86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%). The code and pretrained models are available at [this https URL](https://github.com/microsoft/unilm/tree/master/beit). + +
+ +
## Citation diff --git a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py index 8ede064b60..b9f9de5d97 100644 --- a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py @@ -46,8 +46,6 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -find_unused_parameters = True - test_pipeline = [ dict(type='LoadImageFromFile'), dict( diff --git a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py index 536e0505f3..95505177b2 100644 --- a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py @@ -17,14 +17,10 @@ qv_bias=True, init_values=1e-6, drop_path_rate=0.2, - out_indices=[7, 11, 15, 23], - ), + out_indices=[7, 11, 15, 23]), neck=dict(embed_dim=1024, rescales=[4, 2, 1, 0.5]), decode_head=dict( - in_channels=[1024, 1024, 1024, 1024], - num_classes=150, - channels=1024, - ), + in_channels=[1024, 1024, 1024, 1024], num_classes=150, channels=1024), auxiliary_head=dict(in_channels=1024, num_classes=150), test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) @@ -50,8 +46,6 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -find_unused_parameters = True - test_pipeline = [ dict(type='LoadImageFromFile'), dict( diff --git a/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py index 40a64a94ff..a89c5fa390 100644 --- a/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py +++ b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py @@ -20,10 +20,7 @@ out_indices=[7, 11, 15, 23]), neck=dict(embed_dim=1024, rescales=[4, 2, 1, 0.5]), decode_head=dict( - in_channels=[1024, 1024, 1024, 1024], - num_classes=150, - channels=1024, - ), + in_channels=[1024, 1024, 1024, 1024], num_classes=150, channels=1024), auxiliary_head=dict(in_channels=1024, num_classes=150), test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) From 215842575900dd24cb0d4eb06a348f811ee2fb2d Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Sat, 26 Mar 2022 23:46:12 +0800 Subject: [PATCH 20/45] fix --- configs/_base_/models/upernet_beit.py | 4 +- ...pernet_beit_base_640x640_160k_ade20k_ms.py | 45 +------------------ ...ernet_beit_base_8x2_640x640_160k_ade20k.py | 19 +------- ..._beit_large_fp16_640x640_160k_ade20k_ms.py | 45 +------------------ ...beit_large_fp16_8x1_640x640_160k_ade20k.py | 5 +-- 5 files changed, 6 insertions(+), 112 deletions(-) diff --git a/configs/_base_/models/upernet_beit.py b/configs/_base_/models/upernet_beit.py index d83f3792b0..09bdba51d8 100644 --- a/configs/_base_/models/upernet_beit.py +++ b/configs/_base_/models/upernet_beit.py @@ -36,13 +36,13 @@ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', - in_channels=384, + in_channels=768, in_index=2, channels=256, num_convs=1, concat_input=False, dropout_ratio=0.1, - num_classes=19, + num_classes=150, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( diff --git a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py index b9f9de5d97..29ae554428 100644 --- a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py +++ 
b/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py @@ -1,47 +1,4 @@ -_base_ = [ - '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] -crop_size = (640, 640) - -model = dict( - pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k_new1.pth', - backbone=dict( - type='BEiT', - img_size=crop_size, - patch_size=16, - embed_dims=768, - num_layers=12, - num_heads=12, - mlp_ratio=4, - qv_bias=True, - init_values=0.1, - drop_path_rate=0.1, - out_indices=[3, 5, 7, 11]), - neck=dict(embed_dim=768, rescales=[4, 2, 1, 0.5]), - decode_head=dict( - in_channels=[768, 768, 768, 768], num_classes=150, channels=768), - auxiliary_head=dict(in_channels=768, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) - -optimizer = dict( - _delete_=True, - type='AdamW', - lr=3e-5, - betas=(0.9, 0.999), - weight_decay=0.05, - constructor='LayerDecayOptimizerConstructor', - paramwise_cfg=dict(num_layers=12, layer_decay_rate=0.9)) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) +_base_ = './upernet_beit_base_8x2_640x640_160k_ade20k.py' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) diff --git a/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py b/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py index 30525e5dde..b36adc3c0d 100644 --- a/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py +++ b/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py @@ -2,27 +2,10 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] -crop_size = (640, 640) model = dict( pretrained='pretrain/beit_base_patch16_224_pt22k_ft22k.pth', - backbone=dict( - type='BEiT', - img_size=crop_size, - patch_size=16, - embed_dims=768, - num_layers=12, - num_heads=12, - mlp_ratio=4, - qv_bias=True, - init_values=0.1, - drop_path_rate=0.1, - out_indices=[3, 5, 7, 11]), - neck=dict(embed_dim=768, rescales=[4, 2, 1, 0.5]), - decode_head=dict( - in_channels=[768, 768, 768, 768], num_classes=150, channels=768), - auxiliary_head=dict(in_channels=768, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) + test_cfg=dict(mode='slide', crop_size=(640, 640), stride=(426, 426))) optimizer = dict( _delete_=True, diff --git a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py index 95505177b2..c314b5856f 100644 --- a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py @@ -1,47 +1,4 @@ -_base_ = [ - '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' -] -crop_size = (640, 640) - -model = dict( - pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', - backbone=dict( - type='BEiT', - img_size=crop_size, - patch_size=16, - embed_dims=1024, - num_layers=24, - num_heads=16, - mlp_ratio=4, - qv_bias=True, - init_values=1e-6, - drop_path_rate=0.2, - out_indices=[7, 11, 15, 23]), - neck=dict(embed_dim=1024, rescales=[4, 2, 1, 0.5]), - decode_head=dict( - in_channels=[1024, 1024, 1024, 1024], num_classes=150, channels=1024), - auxiliary_head=dict(in_channels=1024, 
num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) - -optimizer = dict( - _delete_=True, - type='AdamW', - lr=2e-5, - betas=(0.9, 0.999), - weight_decay=0.05, - constructor='LayerDecayOptimizerConstructor', - paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.95)) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=3000, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) +_base_ = './upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) diff --git a/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py index a89c5fa390..e6247b7352 100644 --- a/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py +++ b/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py @@ -2,14 +2,11 @@ '../_base_/models/upernet_beit.py', '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_320k.py' ] -crop_size = (640, 640) model = dict( pretrained='pretrain/beit_large_patch16_224_pt22k_ft22k.pth', backbone=dict( type='BEiT', - img_size=crop_size, - patch_size=16, embed_dims=1024, num_layers=24, num_heads=16, @@ -22,7 +19,7 @@ decode_head=dict( in_channels=[1024, 1024, 1024, 1024], num_classes=150, channels=1024), auxiliary_head=dict(in_channels=1024, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426))) + test_cfg=dict(mode='slide', crop_size=(640, 640), stride=(426, 426))) optimizer = dict( _delete_=True, From 28c15ed4c105a81e305c7dcad0a47a8c6ec826ed Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Sat, 26 Mar 2022 23:48:53 +0800 Subject: [PATCH 21/45] fix --- configs/_base_/models/upernet_beit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/_base_/models/upernet_beit.py b/configs/_base_/models/upernet_beit.py index 09bdba51d8..186dbba48c 100644 --- a/configs/_base_/models/upernet_beit.py +++ b/configs/_base_/models/upernet_beit.py @@ -24,7 +24,7 @@ neck=dict(type='Feature2Pyramid', embed_dim=768, rescales=[4, 2, 1, 0.5]), decode_head=dict( type='UPerHead', - in_channels=[384, 384, 384, 384], + in_channels=[768, 768, 768, 768], in_index=[0, 1, 2, 3], pool_scales=(1, 2, 3, 6), channels=512, From 574b66f44868e18a3d429e7f388c5f85750be7a9 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Sat, 26 Mar 2022 23:54:45 +0800 Subject: [PATCH 22/45] fix --- configs/_base_/models/upernet_beit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/_base_/models/upernet_beit.py b/configs/_base_/models/upernet_beit.py index 186dbba48c..d0393287a8 100644 --- a/configs/_base_/models/upernet_beit.py +++ b/configs/_base_/models/upernet_beit.py @@ -27,9 +27,9 @@ in_channels=[768, 768, 768, 768], in_index=[0, 1, 2, 3], pool_scales=(1, 2, 3, 6), - channels=512, + channels=768, dropout_ratio=0.1, - num_classes=19, + num_classes=150, norm_cfg=norm_cfg, align_corners=False, loss_decode=dict( From 0744f637fadf832000814974d9a22990f743a871 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Sun, 27 Mar 2022 01:33:46 +0800 Subject: [PATCH 23/45] fix --- mmseg/core/layer_decay_optimizer_constructor.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mmseg/core/layer_decay_optimizer_constructor.py b/mmseg/core/layer_decay_optimizer_constructor.py index 4456fb4652..895c97811b 100644 --- 
a/mmseg/core/layer_decay_optimizer_constructor.py +++ b/mmseg/core/layer_decay_optimizer_constructor.py @@ -31,7 +31,7 @@ def get_num_layer_for_vit(var_name, num_max_layer): class LayerDecayOptimizerConstructor(DefaultOptimizerConstructor): """Different learning rates are set for different layers of backbone.""" - def add_params(self, params, module, prefix='', is_dcn_module=None): + def add_params(self, params, module): """Add all parameters of module to the params list. The parameters of the given module will be added to the list of param @@ -40,10 +40,6 @@ def add_params(self, params, module, prefix='', is_dcn_module=None): params (list[dict]): A list of param groups, it will be modified in place. module (nn.Module): The module to be added. - prefix (str): The prefix of the module - is_dcn_module (int|float|None): If the current module is a - submodule of DCN, `is_dcn_module` will be passed to - control conv_offset layer's learning rate. Defaults to None. """ parameter_groups = {} logger = get_root_logger() From 56a9d00dcaea3147febe5ad36f0e4973ad9f6a6e Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Sun, 27 Mar 2022 17:07:24 +0800 Subject: [PATCH 24/45] fix --- mmseg/models/backbones/beit.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 27da90da26..1c95690fc2 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -32,7 +32,7 @@ class BEiTAttention(BaseModule): embed_dims (int): Number of input channels. num_heads (int): Number of attention heads. window_size (tuple[int]): The height and width of the window. - qv_bias (bool, optional): If True, add a learnable bias to q, v. + qv_bias (bool): If True, add a learnable bias to q, v. Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. @@ -61,9 +61,14 @@ def __init__(self, if qv_bias: self.q_bias = nn.Parameter(torch.zeros(embed_dims)) self.v_bias = nn.Parameter(torch.zeros(embed_dims)) + self.qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) else: self.q_bias = None self.v_bias = None + self.qkv_bias = None self.window_size = window_size # cls to token & token 2 cls & cls to cls @@ -116,14 +121,8 @@ def forward(self, x): x (tensor): input features with shape of (num_windows*B, N, C). """ B, N, C = x.shape - qkv_bias = None - if self.q_bias is not None: - qkv_bias = torch.cat( - (self.q_bias, - torch.zeros_like(self.v_bias, - requires_grad=False), self.v_bias)) - qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=self.qkv_bias) qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] @@ -267,7 +266,7 @@ class BEiT(BaseModule): final_norm (bool): Whether to add a additional layer to normalize final feature map. Default: False. interpolate_mode (str): Select the interpolate mode for position - embeding vector resize. Default: bicubic. + embeding vector resize. Default: 'bicubic'. num_fcs (int): The number of fully-connected layers for FFNs. Default: 2. norm_eval (bool): Whether to set norm layers to eval mode, namely, @@ -387,6 +386,11 @@ def norm1(self): def resize_rel_pos_embed(self, checkpoint): """Resize relative pos_embed weights. 
+ This function is modified from + https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/checkpoint.py. # noqa: E501 + Copyright (c) Microsoft Corporation + Licensed under the MIT License + Args: checkpoint (dict): Key and value of the pretrain model. Returns: From 1cfee521a8e5bd4eb0b318bad7aef07f4a1ee6e6 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Sun, 27 Mar 2022 17:53:52 +0800 Subject: [PATCH 25/45] fix --- mmseg/models/backbones/beit.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 1c95690fc2..9cb63088d3 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -61,14 +61,9 @@ def __init__(self, if qv_bias: self.q_bias = nn.Parameter(torch.zeros(embed_dims)) self.v_bias = nn.Parameter(torch.zeros(embed_dims)) - self.qkv_bias = torch.cat( - (self.q_bias, - torch.zeros_like(self.v_bias, - requires_grad=False), self.v_bias)) else: self.q_bias = None self.v_bias = None - self.qkv_bias = None self.window_size = window_size # cls to token & token 2 cls & cls to cls @@ -121,8 +116,14 @@ def forward(self, x): x (tensor): input features with shape of (num_windows*B, N, C). """ B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) - qkv = F.linear(input=x, weight=self.qkv.weight, bias=self.qkv_bias) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] From ba9b840dcb189fba7e15079e5fb09c8cf5b36dc8 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 11:25:29 +0800 Subject: [PATCH 26/45] fix --- configs/beit/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index 3d56c5d3ea..d18791c449 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -67,7 +67,7 @@ configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py \ upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU ``` -For multi-scale inference: +Since relative position embedding requires the input length and width to be equal, the sliding window is adopted for multi-scale inference. So we set min_size=640, that is, the shortest edge is 640. So the multi-scale inference of config is performed separately, instead of '--aug-tes'. 
For multi-scale inference: ```shell sh tools/dist_test.sh \ From 8230e4d9879502ad0c980280e2a47cdbebd9f176 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 11:45:36 +0800 Subject: [PATCH 27/45] fix --- configs/_base_/models/upernet_beit.py | 1 - mmseg/models/backbones/beit.py | 20 +++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/configs/_base_/models/upernet_beit.py b/configs/_base_/models/upernet_beit.py index d0393287a8..36a12ccbb7 100644 --- a/configs/_base_/models/upernet_beit.py +++ b/configs/_base_/models/upernet_beit.py @@ -19,7 +19,6 @@ norm_cfg=dict(type='LN', eps=1e-6), act_cfg=dict(type='GELU'), norm_eval=False, - interpolate_mode='bicubic', init_values=0.1), neck=dict(type='Feature2Pyramid', embed_dim=768, rescales=[4, 2, 1, 0.5]), decode_head=dict( diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 9cb63088d3..a991a4b46c 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -118,10 +118,8 @@ def forward(self, x): B, N, C = x.shape qkv_bias = None if self.q_bias is not None: - qkv_bias = torch.cat( - (self.q_bias, - torch.zeros_like(self.v_bias, - requires_grad=False), self.v_bias)) + k_bias = torch.zeros_like(self.v_bias, requires_grad=False) + qkv_bias = torch.cat((self.q_bias, k_bias, self.v_bias)) qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) @@ -131,11 +129,11 @@ def forward(self, x): attn = (q @ k.transpose(-2, -1)) if self.relative_position_bias_table is not None: - relative_position_bias = \ - self.relative_position_bias_table[ - self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1) + Wh = self.window_size[0] + Ww = self.window_size[1] + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + Wh * Ww + 1, Wh * Ww + 1, -1) relative_position_bias = relative_position_bias.permute( 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) @@ -266,8 +264,6 @@ class BEiT(BaseModule): Default: False. final_norm (bool): Whether to add a additional layer to normalize final feature map. Default: False. - interpolate_mode (str): Select the interpolate mode for position - embeding vector resize. Default: 'bicubic'. num_fcs (int): The number of fully-connected layers for FFNs. Default: 2. 
norm_eval (bool): Whether to set norm layers to eval mode, namely, @@ -299,7 +295,6 @@ def __init__(self, act_cfg=dict(type='GELU'), patch_norm=False, final_norm=False, - interpolate_mode='bicubic', num_fcs=2, norm_eval=False, with_cp=False, @@ -328,7 +323,6 @@ def __init__(self, self.img_size = img_size self.patch_size = patch_size - self.interpolate_mode = interpolate_mode self.norm_eval = norm_eval self.with_cp = with_cp self.pretrained = pretrained From ef1a0e65fe47fe1bef311a7dd2e5e488c2ea9d9c Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 11:48:21 +0800 Subject: [PATCH 28/45] fix --- configs/beit/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index d18791c449..5d707b8347 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -67,7 +67,7 @@ configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py \ upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU ``` -Since relative position embedding requires the input length and width to be equal, the sliding window is adopted for multi-scale inference. So we set min_size=640, that is, the shortest edge is 640. So the multi-scale inference of config is performed separately, instead of '--aug-tes'. For multi-scale inference: +Since relative position embedding requires the input length and width to be equal, the sliding window is adopted for multi-scale inference. So we set min_size=640, that is, the shortest edge is 640. So the multi-scale inference of config is performed separately, instead of '--aug-test'. For multi-scale inference: ```shell sh tools/dist_test.sh \ From 95151e88fb3f224a4c090aae7025825c6dbb8304 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 12:37:46 +0800 Subject: [PATCH 29/45] fix --- configs/_base_/models/upernet_beit.py | 1 - mmseg/models/backbones/beit.py | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/configs/_base_/models/upernet_beit.py b/configs/_base_/models/upernet_beit.py index 36a12ccbb7..9c5bfa3310 100644 --- a/configs/_base_/models/upernet_beit.py +++ b/configs/_base_/models/upernet_beit.py @@ -13,7 +13,6 @@ mlp_ratio=4, out_indices=(3, 5, 7, 11), qv_bias=True, - drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.1, norm_cfg=dict(type='LN', eps=1e-6), diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index a991a4b46c..1577fd9b0d 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -251,8 +251,6 @@ class BEiT(BaseModule): out_indices (list | tuple | int): Output from which stages. Default: -1. qkv_bias (bool): enable bias for qkv if True. Default: True. - drop_rate (float): Probability of an element to be zeroed. - Default 0.0 attn_drop_rate (float): The drop out rate for attention layer. Default 0.0 drop_path_rate (float): stochastic depth rate. Default 0.0. @@ -269,8 +267,6 @@ class BEiT(BaseModule): norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save - some memory while slowing down the training speed. Default: False. pretrained (str, optional): model pretrained path. Default: None. init_values (float): Initialize the values of BEiTAttention and FFN with learnable scaling. 
@@ -288,7 +284,6 @@ def __init__(self, mlp_ratio=4, out_indices=-1, qv_bias=True, - drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_cfg=dict(type='LN'), @@ -297,7 +292,6 @@ def __init__(self, final_norm=False, num_fcs=2, norm_eval=False, - with_cp=False, pretrained=None, init_values=0.1, init_cfg=None): @@ -324,7 +318,6 @@ def __init__(self, self.img_size = img_size self.patch_size = patch_size self.norm_eval = norm_eval - self.with_cp = with_cp self.pretrained = pretrained self.patch_embed = PatchEmbed( @@ -340,7 +333,6 @@ def __init__(self, window_size = (img_size[0] // patch_size, img_size[1] // patch_size) self.patch_shape = window_size self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) - self.drop_after_pos = nn.Dropout(p=drop_rate) if isinstance(out_indices, int): if out_indices == -1: @@ -493,6 +485,8 @@ def _init_weights(m): else: # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + # Copyright 2019 Ross Wightman + # Licensed under the Apache License, Version 2.0 trunc_normal_(self.cls_token, std=.02) for n, m in self.named_modules(): if isinstance(m, nn.Linear): From decf9d25257ffe63626d48772e2ad2d8c322dadf Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 12:42:20 +0800 Subject: [PATCH 30/45] fix --- mmseg/models/backbones/beit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 1577fd9b0d..70289c30ab 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -486,7 +486,7 @@ def _init_weights(m): # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 # Copyright 2019 Ross Wightman - # Licensed under the Apache License, Version 2.0 + # Licensed under the Apache License, Version 2.0 (the "License") trunc_normal_(self.cls_token, std=.02) for n, m in self.named_modules(): if isinstance(m, nn.Linear): From 4aafb7eef01b7966b1c8eee0d056309f11b5c27d Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 12:56:07 +0800 Subject: [PATCH 31/45] fix test_beit.py --- tests/test_models/test_backbones/test_beit.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index 9a8f0d327a..bc38f0f870 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -95,12 +95,6 @@ def test_beit_backbone(): feat = model(imgs) assert feat[-1].shape == (1, 768, 14, 21) - # Test with_cp=True - model = BEiT(with_cp=True) - imgs = torch.randn(1, 3, 224, 224) - feat = model(imgs) - assert feat[-1].shape == (1, 768, 14, 14) - # Test init_values=0 model = BEiT(init_values=0) imgs = torch.randn(1, 3, 224, 224) From 4ca514db031793bc1bd8106cfd6ac134f3f39696 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 17:08:23 +0800 Subject: [PATCH 32/45] fix --- mmseg/models/backbones/beit.py | 119 +++++++++++++++------------ mmseg/models/necks/featurepyramid.py | 4 +- 2 files changed, 68 insertions(+), 55 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 70289c30ab..c487544f77 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -156,10 +156,10 @@ class 
TransformerEncoderLayer(BaseModule): feedforward_channels (int): The hidden dimension for FFNs. attn_drop_rate (float): The drop out rate for attention layer. Default: 0.0. - drop_path_rate (float): stochastic depth rate. Default 0.0. + drop_path_rate (float): Stochastic depth rate. Default 0.0. num_fcs (int): The number of fully-connected layers for FFNs. Default: 2. - qkv_bias (bool): enable bias for qkv if True. Default: True + qkv_bias (bool): Enable bias for qkv if True. Default: True act_cfg (dict): The activation config for FFNs. Default: dict(type='GELU'). norm_cfg (dict): Config dict for normalization layer. @@ -243,17 +243,17 @@ class BEiT(BaseModule): img_size (int | tuple): Input image size. Default: 224. patch_size (int): The patch size. Default: 16. in_channels (int): Number of input channels. Default: 3. - embed_dims (int): embedding dimension. Default: 768. - num_layers (int): depth of transformer. Default: 12. - num_heads (int): number of attention heads. Default: 12. - mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + embed_dims (int): Embedding dimension. Default: 768. + num_layers (int): Depth of transformer. Default: 12. + num_heads (int): Number of attention heads. Default: 12. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. Default: 4. out_indices (list | tuple | int): Output from which stages. Default: -1. - qkv_bias (bool): enable bias for qkv if True. Default: True. + qkv_bias (bool): Enable bias for qkv if True. Default: True. attn_drop_rate (float): The drop out rate for attention layer. Default 0.0 - drop_path_rate (float): stochastic depth rate. Default 0.0. + drop_path_rate (float): Stochastic depth rate. Default 0.0. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN') act_cfg (dict): The activation config for FFNs. @@ -267,7 +267,7 @@ class BEiT(BaseModule): norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. Default: False. - pretrained (str, optional): model pretrained path. Default: None. + pretrained (str, optional): Model pretrained path. Default: None. init_values (float): Initialize the values of BEiTAttention and FFN with learnable scaling. init_cfg (dict or list[dict], optional): Initialization config dict. @@ -370,6 +370,60 @@ def __init__(self, def norm1(self): return getattr(self, self.norm1_name) + def _get_new_rel_pos_bias(self, src_size, dst_size, extra_tokens, + rel_pos_bias, num_attn_heads): + """Get new relative position bias. + + Args: + src_size (int): Number of pos_embedding in pre-trained model. + dst_size (int): Number of pos_embedding in the current model. + extra_tokens (tensor): The bias of the extra tokens. + rel_pos_bias (tensor): The relative position bias of the pretrain + model after removing the extra tokens. + num_attn_heads (int): Number of attention heads. + Returns: + new_rel_pos_bias (tensor): Interpolate the pre-trained relative + position bias to the size of the current model. + """ + + # Geometric sequence interpolation. + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + # Here is a binary function. + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + # The position of each interpolated point is determined + # by the ratio obtained by dichotomy. 
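+        # Concretely, the k-th point sits at distance
+        # 1 + q + ... + q**(k - 1) = (1 - q**k) / (1 - q) from the center, so the
+        # outermost of the src_size // 2 points lands near dst_size // 2 and the
+        # source grid is stretched geometrically onto the target grid.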
+ dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q**(i + 1) + r_ids = [-_ for _ in reversed(dis)] + x = r_ids + [0] + dis + y = r_ids + [0] + dis + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + # Interpolation functions are being executed and called. + all_rel_pos_bias = [] + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to( + rel_pos_bias.device)) + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0) + return new_rel_pos_bias + def resize_rel_pos_embed(self, checkpoint): """Resize relative pos_embed weights. @@ -412,50 +466,9 @@ def resize_rel_pos_embed(self, checkpoint): if src_size != dst_size: extra_tokens = rel_pos_bias[-num_extra_tokens:, :] rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] - - # Geometric sequence interpolation. - def geometric_progression(a, r, n): - return a * (1.0 - r**n) / (1.0 - r) - - # Here is a binary function. - left, right = 1.01, 1.5 - while right - left > 1e-6: - q = (left + right) / 2.0 - gp = geometric_progression(1, q, src_size // 2) - if gp > dst_size // 2: - right = q - else: - left = q - # The position of each interpolated point is determined - # by the ratio obtained by dichotomy. - dis = [] - cur = 1 - for i in range(src_size // 2): - dis.append(cur) - cur += q**(i + 1) - - r_ids = [-_ for _ in reversed(dis)] - - x = r_ids + [0] + dis - y = r_ids + [0] + dis - - t = dst_size // 2.0 - dx = np.arange(-t, t + 0.1, 1.0) - dy = np.arange(-t, t + 0.1, 1.0) - # Interpolation functions are being executed and called. - all_rel_pos_bias = [] - - for i in range(num_attn_heads): - z = rel_pos_bias[:, i].view(src_size, - src_size).float().numpy() - f = interpolate.interp2d(x, y, z, kind='cubic') - all_rel_pos_bias.append( - torch.Tensor(f(dx, dy)).contiguous().view( - -1, 1).to(rel_pos_bias.device)) - - rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) - new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), - dim=0) + new_rel_pos_bias = self._get_new_rel_pos_bias( + src_size, dst_size, extra_tokens, rel_pos_bias, + num_attn_heads) state_dict[key] = new_rel_pos_bias return state_dict diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py index 1111c4d2b5..062e897528 100644 --- a/mmseg/models/necks/featurepyramid.py +++ b/mmseg/models/necks/featurepyramid.py @@ -12,8 +12,8 @@ class Feature2Pyramid(nn.Module): A neck structure connect ViT backbone and decoder_heads. Args: - embed_dims (int): embedding dimension. - rescales (list[float]): different sampling multiples were + embed_dims (int): Embedding dimension. + rescales (list[float]): Different sampling multiples were used to obtain pyramid features. Default: (4, 2, 1, 0.5). norm_cfg (dict): Config dict for normalization layer. Default: dict(type='SyncBN', requires_grad=True). 
From 34339fa01e96bb48c8509b52a77d3728b0added8 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Mon, 28 Mar 2022 18:47:08 +0800 Subject: [PATCH 33/45] fix --- mmseg/models/backbones/beit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index c487544f77..6befa87d3b 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -237,7 +237,7 @@ def forward(self, x): @BACKBONES.register_module() class BEiT(BaseModule): - """VisionTransformer with support for patch. + """BERT Pre-Training of Image Transformers. Args: img_size (int | tuple): Input image size. Default: 224. From f7dc33e2052b01c028a94d99b9dc92480e359a73 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Tue, 29 Mar 2022 01:03:54 +0800 Subject: [PATCH 34/45] fix --- mmseg/models/backbones/beit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 6befa87d3b..910e7d3c11 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -36,9 +36,9 @@ class BEiTAttention(BaseModule): Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. + attn_drop_rate (float): Dropout ratio of attention weight. Default: 0.0 - proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + proj_drop_rate (float): Dropout ratio of output. Default: 0. init_cfg (dict | None, optional): The Config for initialization. Default: None. """ From 2e5a973121ba39c20c695e697bc935447c13812a Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Tue, 29 Mar 2022 12:49:11 +0800 Subject: [PATCH 35/45] fix --- configs/beit/README.md | 4 ++-- configs/beit/beit.yml | 4 ++-- mmseg/models/necks/featurepyramid.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index 5d707b8347..be9f63d434 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -81,5 +81,5 @@ upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k/upernet_beit_base_8x2_640x640_160k_ade20k.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K 
| 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index a6ac4c0494..4fcfaf4550 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -20,7 +20,7 @@ Models: mIoU: 53.08 mIoU(ms+flip): 53.84 Config: configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth - Name: upernet_beit_large_fp16_8x1_640x640_160k_ade20k In Collection: UperNet Metadata: @@ -42,4 +42,4 @@ Models: mIoU: 56.33 mIoU(ms+flip): 56.84 Config: configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py index 062e897528..82a00ceb1c 100644 --- a/mmseg/models/necks/featurepyramid.py +++ b/mmseg/models/necks/featurepyramid.py @@ -14,14 +14,14 @@ class Feature2Pyramid(nn.Module): Args: embed_dims (int): Embedding dimension. rescales (list[float]): Different sampling multiples were - used to obtain pyramid features. Default: (4, 2, 1, 0.5). + used to obtain pyramid features. Default: [4, 2, 1, 0.5]. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='SyncBN', requires_grad=True). 
""" def __init__(self, embed_dim, - rescales, + rescales=[4, 2, 1, 0.5], norm_cfg=dict(type='SyncBN', requires_grad=True)): super(Feature2Pyramid, self).__init__() self.rescales = rescales From 7ca0f7ecb87fb502f2945717ad1fe3519e54d9ba Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Tue, 29 Mar 2022 14:08:07 +0800 Subject: [PATCH 36/45] fix --- mmseg/models/backbones/beit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 910e7d3c11..308cf4b565 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -159,7 +159,7 @@ class TransformerEncoderLayer(BaseModule): drop_path_rate (float): Stochastic depth rate. Default 0.0. num_fcs (int): The number of fully-connected layers for FFNs. Default: 2. - qkv_bias (bool): Enable bias for qkv if True. Default: True + qv_bias (bool): Enable bias for qv if True. Default: True act_cfg (dict): The activation config for FFNs. Default: dict(type='GELU'). norm_cfg (dict): Config dict for normalization layer. @@ -250,7 +250,7 @@ class BEiT(BaseModule): Default: 4. out_indices (list | tuple | int): Output from which stages. Default: -1. - qkv_bias (bool): Enable bias for qkv if True. Default: True. + qv_bias (bool): Enable bias for qv if True. Default: True. attn_drop_rate (float): The drop out rate for attention layer. Default 0.0 drop_path_rate (float): Stochastic depth rate. Default 0.0. From 19b5e283d3eccfbf01cda3e08f383b04e310cbe7 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Tue, 29 Mar 2022 23:02:27 +0800 Subject: [PATCH 37/45] fix --- configs/beit/README.md | 23 +++++++++---------- configs/beit/beit.yml | 12 +++++----- ...ernet_beit-base_640x640_160k_ade20k_ms.py} | 2 +- ...rnet_beit-base_8x2_640x640_160k_ade20k.py} | 0 ...beit-large_fp16_640x640_160k_ade20k_ms.py} | 2 +- ...eit-large_fp16_8x1_640x640_160k_ade20k.py} | 0 .../core/layer_decay_optimizer_constructor.py | 11 +-------- mmseg/models/backbones/beit.py | 13 ----------- tests/test_models/test_backbones/test_beit.py | 2 +- .../test_necks/test_feature2pyramid.py | 2 +- 10 files changed, 22 insertions(+), 45 deletions(-) rename configs/beit/{upernet_beit_base_640x640_160k_ade20k_ms.py => upernet_beit-base_640x640_160k_ade20k_ms.py} (92%) rename configs/beit/{upernet_beit_base_8x2_640x640_160k_ade20k.py => upernet_beit-base_8x2_640x640_160k_ade20k.py} (100%) rename configs/beit/{upernet_beit_large_fp16_640x640_160k_ade20k_ms.py => upernet_beit-large_fp16_640x640_160k_ade20k_ms.py} (91%) rename configs/beit/{upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py => upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py} (100%) diff --git a/configs/beit/README.md b/configs/beit/README.md index be9f63d434..0618c708e2 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -24,13 +24,12 @@ We introduce a self-supervised vision representation model BEiT, which stands fo ## Citation ```bibtex -@article{beit, +@inproceedings{beit, title={{BEiT}: {BERT} Pre-Training of Image Transformers}, - author={Hangbo Bao and Li Dong and Furu Wei}, - year={2021}, - eprint={2106.08254}, - archivePrefix={arXiv}, - primaryClass={cs.CV} + author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei}, + booktitle={International Conference on Learning Representations}, + year={2022}, + url={https://openreview.net/forum?id=p-BhZSz59o4} } ``` @@ -63,16 +62,16 @@ Verify the single-scale results of the model: ```shell sh tools/dist_test.sh \ 
-configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py \ -upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU +configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py \ +upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU ``` Since relative position embedding requires the input length and width to be equal, the sliding window is adopted for multi-scale inference. So we set min_size=640, that is, the shortest edge is 640. So the multi-scale inference of config is performed separately, instead of '--aug-test'. For multi-scale inference: ```shell sh tools/dist_test.sh \ -configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py \ -upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU +configs/beit/upernet_beit-large_fp16_640x640_160k_ade20k_ms.py \ +upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU ``` ## Results and models @@ -81,5 +80,5 @@ upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index 4fcfaf4550..86530ec83b 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -1,5 +1,5 @@ Models: -- Name: upernet_beit_base_8x2_640x640_160k_ade20k +- Name: upernet_beit-base_8x2_640x640_160k_ade20k In Collection: UperNet Metadata: backbone: BEiT-B @@ -19,9 +19,9 @@ Models: Metrics: mIoU: 53.08 mIoU(ms+flip): 53.84 - Config: configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_base_8x2_640x640_160k_ade20k-eead221d.pth -- Name: upernet_beit_large_fp16_8x1_640x640_160k_ade20k + Config: configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth +- Name: upernet_beit-large_fp16_8x1_640x640_160k_ade20k In Collection: UperNet Metadata: backbone: BEiT-L @@ -41,5 +41,5 @@ Models: Metrics: mIoU: 56.33 mIoU(ms+flip): 56.84 - Config: configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth + Config: configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth diff --git a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit-base_640x640_160k_ade20k_ms.py similarity index 92% rename from configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py rename to configs/beit/upernet_beit-base_640x640_160k_ade20k_ms.py index 29ae554428..f764c92c11 100644 --- a/configs/beit/upernet_beit_base_640x640_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit-base_640x640_160k_ade20k_ms.py @@ -1,4 +1,4 @@ -_base_ = './upernet_beit_base_8x2_640x640_160k_ade20k.py' +_base_ = './upernet_beit-base_8x2_640x640_160k_ade20k.py' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) diff --git a/configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py b/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py similarity index 100% rename from configs/beit/upernet_beit_base_8x2_640x640_160k_ade20k.py rename to configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py diff --git a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit-large_fp16_640x640_160k_ade20k_ms.py similarity index 91% rename from configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py rename to configs/beit/upernet_beit-large_fp16_640x640_160k_ade20k_ms.py index c314b5856f..fd4d9477d4 100644 --- a/configs/beit/upernet_beit_large_fp16_640x640_160k_ade20k_ms.py +++ b/configs/beit/upernet_beit-large_fp16_640x640_160k_ade20k_ms.py @@ -1,4 +1,4 @@ -_base_ = './upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py' +_base_ = './upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) diff --git a/configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py b/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py similarity index 100% rename from 
configs/beit/upernet_beit_large_fp16_8x1_640x640_160k_ade20k.py rename to configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py diff --git a/mmseg/core/layer_decay_optimizer_constructor.py b/mmseg/core/layer_decay_optimizer_constructor.py index 895c97811b..30a09ba08e 100644 --- a/mmseg/core/layer_decay_optimizer_constructor.py +++ b/mmseg/core/layer_decay_optimizer_constructor.py @@ -44,15 +44,11 @@ def add_params(self, params, module): parameter_groups = {} logger = get_root_logger() logger.info(self.paramwise_cfg) - num_layers = self.paramwise_cfg.get('num_layers') + 2 layer_decay_rate = self.paramwise_cfg.get('layer_decay_rate') - logger.info(f'Build LayerDecayOptimizerConstructor ' f'{layer_decay_rate} - {num_layers}') - weight_decay = self.base_wd - for name, param in module.named_parameters(): if not param.requires_grad: continue # frozen weights @@ -63,22 +59,18 @@ def add_params(self, params, module): else: group_name = 'decay' this_weight_decay = weight_decay - layer_id = get_num_layer_for_vit(name, num_layers) group_name = f'layer_{layer_id}_{group_name}' - if group_name not in parameter_groups: scale = layer_decay_rate**(num_layers - layer_id - 1) - parameter_groups[group_name] = { 'weight_decay': this_weight_decay, 'params': [], 'param_names': [], 'lr_scale': scale, 'group_name': group_name, - 'lr': scale * self.base_lr, + 'lr': scale * self.base_lr } - parameter_groups[group_name]['params'].append(param) parameter_groups[group_name]['param_names'].append(name) rank, _ = get_dist_info() @@ -92,5 +84,4 @@ def add_params(self, params, module): 'weight_decay': parameter_groups[key]['weight_decay'] } logger.info(f'Param groups ={to_display}') - params.extend(parameter_groups.values()) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 308cf4b565..808582a0a8 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -52,7 +52,6 @@ def __init__(self, attn_drop_rate=0., proj_drop_rate=0., init_cfg=None): - super().__init__(init_cfg=init_cfg) self.embed_dims = embed_dims self.num_heads = num_heads @@ -92,7 +91,6 @@ def __init__(self, relative_position_index = torch.zeros( size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) - # relative_position_index shape is (Wh*Ww, Wh*Ww) relative_position_index[1:, 1:] = relative_coords.sum(-1) relative_position_index[0, 0:] = self.num_relative_distance - 3 @@ -101,7 +99,6 @@ def __init__(self, self.register_buffer('relative_position_index', relative_position_index) - self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=False) self.attn_drop = nn.Dropout(attn_drop_rate) self.proj = nn.Linear(embed_dims, embed_dims) @@ -124,10 +121,8 @@ def forward(self, x): qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] - q = q * self.scale attn = (q @ k.transpose(-2, -1)) - if self.relative_position_bias_table is not None: Wh = self.window_size[0] Ww = self.window_size[1] @@ -137,10 +132,8 @@ def forward(self, x): relative_position_bias = relative_position_bias.permute( 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) - attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) - x = (attn @ v).transpose(1, 2).reshape(B, N, C) x = self.proj(x) x = self.proj_drop(x) @@ -183,11 +176,9 @@ def __init__(self, window_size=None, init_values=None): super(TransformerEncoderLayer, self).__init__() - self.norm1_name, norm1 = 
build_norm_layer( norm_cfg, embed_dims, postfix=1) self.add_module(self.norm1_name, norm1) - self.attn = BEiTAttention( embed_dims=embed_dims, num_heads=num_heads, @@ -205,17 +196,14 @@ def __init__(self, dropout_layer=None, act_cfg=act_cfg, add_identity=False) - self.norm2_name, norm2 = build_norm_layer( norm_cfg, embed_dims, postfix=2) self.add_module(self.norm2_name, norm2) - # NOTE: drop path for stochastic depth, we shall see if # this is better than dropout here dropout_layer = dict(type='DropPath', drop_prob=drop_path_rate) self.drop_path = build_dropout( dropout_layer) if dropout_layer else nn.Identity() - self.gamma_1 = nn.Parameter( init_values * torch.ones((embed_dims)), requires_grad=True) self.gamma_2 = nn.Parameter( @@ -296,7 +284,6 @@ def __init__(self, init_values=0.1, init_cfg=None): super(BEiT, self).__init__(init_cfg=init_cfg) - if isinstance(img_size, int): img_size = to_2tuple(img_size) elif isinstance(img_size, tuple): diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index bc38f0f870..cf3960894d 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -58,7 +58,7 @@ def test_beit_backbone(): assert check_norm_state(model.modules(), True) - # Test normal size input image + # Test image size = (224, 224) imgs = torch.randn(1, 3, 224, 224) feat = model(imgs) assert feat[-1].shape == (1, 768, 14, 14) diff --git a/tests/test_models/test_necks/test_feature2pyramid.py b/tests/test_models/test_necks/test_feature2pyramid.py index d62708d767..44fd02c489 100644 --- a/tests/test_models/test_necks/test_feature2pyramid.py +++ b/tests/test_models/test_necks/test_feature2pyramid.py @@ -5,7 +5,7 @@ from mmseg.models import Feature2Pyramid -def test_Feature2Pyramid(): +def test_feature2pyramid(): # test rescales = [4, 2, 1, 0.5] embed_dim = 64 From 783dcfb02e9e29c95c8e2de1c5457db2dd168f01 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Tue, 29 Mar 2022 23:03:49 +0800 Subject: [PATCH 38/45] fix --- configs/beit/beit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index 86530ec83b..b4f554bf48 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -26,7 +26,7 @@ Models: Metadata: backbone: BEiT-L crop size: (640,640) - lr schd: 320000 + lr schd: 160000 inference time (ms/im): - value: 1041.67 hardware: V100 From b556b697b256b54606613a4ca0f95815f2522673 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Tue, 29 Mar 2022 23:22:12 +0800 Subject: [PATCH 39/45] fix --- mmseg/models/backbones/beit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 808582a0a8..170282a529 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -372,7 +372,6 @@ def _get_new_rel_pos_bias(self, src_size, dst_size, extra_tokens, new_rel_pos_bias (tensor): Interpolate the pre-trained relative position bias to the size of the current model. """ - # Geometric sequence interpolation. 
def geometric_progression(a, r, n): return a * (1.0 - r**n) / (1.0 - r) From aab40633108de927d39abb2edde221cd058d52d4 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 30 Mar 2022 00:07:13 +0800 Subject: [PATCH 40/45] fix --- configs/beit/README.md | 4 ++-- configs/beit/beit.yml | 6 +++--- mmseg/models/backbones/beit.py | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index 0618c708e2..ff27084f0f 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -80,5 +80,5 @@ upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | -| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json) | diff --git a/configs/beit/beit.yml 
b/configs/beit/beit.yml index b4f554bf48..c0eb9c2d4e 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -20,13 +20,13 @@ Models: mIoU: 53.08 mIoU(ms+flip): 53.84 Config: configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth - Name: upernet_beit-large_fp16_8x1_640x640_160k_ade20k In Collection: UperNet Metadata: backbone: BEiT-L crop size: (640,640) - lr schd: 160000 + lr schd: 320000 inference time (ms/im): - value: 1041.67 hardware: V100 @@ -42,4 +42,4 @@ Models: mIoU: 56.33 mIoU(ms+flip): 56.84 Config: configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 170282a529..808582a0a8 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -372,6 +372,7 @@ def _get_new_rel_pos_bias(self, src_size, dst_size, extra_tokens, new_rel_pos_bias (tensor): Interpolate the pre-trained relative position bias to the size of the current model. """ + # Geometric sequence interpolation. def geometric_progression(a, r, n): return a * (1.0 - r**n) / (1.0 - r) From f161d0d72c82e84c16de1d24d0c36def63d62700 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 30 Mar 2022 00:31:46 +0800 Subject: [PATCH 41/45] fix --- mmseg/models/backbones/beit.py | 37 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 808582a0a8..2aab9d9920 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -357,23 +357,21 @@ def __init__(self, def norm1(self): return getattr(self, self.norm1_name) - def _get_new_rel_pos_bias(self, src_size, dst_size, extra_tokens, - rel_pos_bias, num_attn_heads): + def _geometric_sequence_interpolation(self, src_size, dst_size, sequence, + num): """Get new relative position bias. Args: src_size (int): Number of pos_embedding in pre-trained model. dst_size (int): Number of pos_embedding in the current model. - extra_tokens (tensor): The bias of the extra tokens. - rel_pos_bias (tensor): The relative position bias of the pretrain + sequence (tensor): The relative position bias of the pretrain model after removing the extra tokens. - num_attn_heads (int): Number of attention heads. + num (int): Number of attention heads. Returns: - new_rel_pos_bias (tensor): Interpolate the pre-trained relative + new_sequence (tensor): Interpolate the pre-trained relative position bias to the size of the current model. """ - # Geometric sequence interpolation. def geometric_progression(a, r, n): return a * (1.0 - r**n) / (1.0 - r) @@ -400,16 +398,16 @@ def geometric_progression(a, r, n): dx = np.arange(-t, t + 0.1, 1.0) dy = np.arange(-t, t + 0.1, 1.0) # Interpolation functions are being executed and called. 
- all_rel_pos_bias = [] - for i in range(num_attn_heads): - z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() + new_sequence = [] + for i in range(num): + z = sequence[:, i].view(src_size, src_size).float().numpy() f = interpolate.interp2d(x, y, z, kind='cubic') - all_rel_pos_bias.append( - torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to( - rel_pos_bias.device)) - rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) - new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0) - return new_rel_pos_bias + new_sequence.append( + torch.Tensor(f(dx, + dy)).contiguous().view(-1, + 1).to(sequence.device)) + new_sequence = torch.cat(new_sequence, dim=-1) + return new_sequence def resize_rel_pos_embed(self, checkpoint): """Resize relative pos_embed weights. @@ -453,9 +451,10 @@ def resize_rel_pos_embed(self, checkpoint): if src_size != dst_size: extra_tokens = rel_pos_bias[-num_extra_tokens:, :] rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] - new_rel_pos_bias = self._get_new_rel_pos_bias( - src_size, dst_size, extra_tokens, rel_pos_bias, - num_attn_heads) + new_rel_pos_bias = self._geometric_sequence_interpolation( + src_size, dst_size, rel_pos_bias, num_attn_heads) + new_rel_pos_bias = torch.cat( + (new_rel_pos_bias, extra_tokens), dim=0) state_dict[key] = new_rel_pos_bias return state_dict From c48c8f0f4b327431c55a697fe09fab79de64dea1 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 30 Mar 2022 00:41:52 +0800 Subject: [PATCH 42/45] fix --- mmseg/models/backbones/beit.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 2aab9d9920..4e2f9adeca 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -359,17 +359,18 @@ def norm1(self): def _geometric_sequence_interpolation(self, src_size, dst_size, sequence, num): - """Get new relative position bias. + """Get new sequence via geometric sequence interpolation. Args: - src_size (int): Number of pos_embedding in pre-trained model. - dst_size (int): Number of pos_embedding in the current model. + src_size (int): Pos_embedding size in pre-trained model. + dst_size (int): Pos_embedding size in the current model. sequence (tensor): The relative position bias of the pretrain model after removing the extra tokens. num (int): Number of attention heads. Returns: - new_sequence (tensor): Interpolate the pre-trained relative - position bias to the size of the current model. + new_sequence (tensor): Geometric sequence interpolate the + pre-trained relative position bias to the size of + the current model. 
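To make the sizes in this docstring concrete, the table shapes involved for the configs in this series can be worked out directly. The numbers below are a hedged illustration: they assume 16×16 patches, 224×224 pretraining, 640×640 fine-tuning and 3 extra cls-related rows, as in the BEiT checkpoints these configs target.

```python
# Hedged worked example: relative position bias table sizes before and
# after interpolation for a 224x224-pretrained, 640x640-finetuned BEiT.
src_window, dst_window = 224 // 16, 640 // 16                  # 14 and 40 patches per side
src_size, dst_size = 2 * src_window - 1, 2 * dst_window - 1    # 27 and 79
num_extra_tokens = 3          # cls-to-token, token-to-cls, cls-to-cls rows
print(src_size * src_size + num_extra_tokens)   # 732 rows in the pretrained table
print(dst_size * dst_size + num_extra_tokens)   # 6244 rows after resizing
```

Only the `src_size * src_size` token-to-token rows are interpolated; the extra rows are split off beforehand and concatenated back afterwards, which is exactly the step the `resize_rel_pos_embed` hunk above moves out of the helper and into the caller.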
""" def geometric_progression(a, r, n): From 9fa14a3bf2c9e97a70a42d8c79f1e89c776a3cdd Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 30 Mar 2022 00:52:47 +0800 Subject: [PATCH 43/45] fix --- mmseg/models/backbones/beit.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 4e2f9adeca..26be3156fe 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -404,9 +404,7 @@ def geometric_progression(a, r, n): z = sequence[:, i].view(src_size, src_size).float().numpy() f = interpolate.interp2d(x, y, z, kind='cubic') new_sequence.append( - torch.Tensor(f(dx, - dy)).contiguous().view(-1, - 1).to(sequence.device)) + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(sequence)) new_sequence = torch.cat(new_sequence, dim=-1) return new_sequence From 7bb7dd92a84a175ddc6fefbb072ccf1eae6108dd Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 30 Mar 2022 02:21:16 +0800 Subject: [PATCH 44/45] fix --- configs/beit/README.md | 2 +- configs/beit/beit.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index ff27084f0f..55b60be8d1 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -80,5 +80,5 @@ upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | | UperNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index c0eb9c2d4e..721332e1ef 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -20,7 +20,7 @@ Models: mIoU: 53.08 mIoU(ms+flip): 53.84 Config: configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth - Name: upernet_beit-large_fp16_8x1_640x640_160k_ade20k In Collection: UperNet Metadata: From d080d84854dc5f3948c2c63c09d56b0b96420417 Mon Sep 17 00:00:00 2001 From: linfangjian01 Date: Wed, 30 Mar 2022 12:35:36 +0800 Subject: [PATCH 45/45] fix --- configs/beit/README.md | 2 +- configs/beit/beit.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/beit/README.md b/configs/beit/README.md index 55b60be8d1..31bf285356 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -80,5 +80,5 @@ upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU | Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------ | -------- | --------- | ---------- | ------- | -------- | --- | --- | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | +| UperNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | | UperNet | BEiT-L | 
640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json) | diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml index 721332e1ef..6f3cee3edd 100644 --- a/configs/beit/beit.yml +++ b/configs/beit/beit.yml @@ -20,7 +20,7 @@ Models: mIoU: 53.08 mIoU(ms+flip): 53.84 Config: configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth - Name: upernet_beit-large_fp16_8x1_640x640_160k_ade20k In Collection: UperNet Metadata: