
[Feature] Support Real-time model ERFNet #960

Merged
merged 16 commits into from
Dec 2, 2021
28 changes: 28 additions & 0 deletions configs/_base_/models/erfnet.py
@@ -0,0 +1,28 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained=None,
backbone=dict(
type='ERFNet',
in_channels=3,
enc_downsample_channels=(16, 64, 128),
enc_num_stages_non_bottleneck=(5, 8),
enc_non_bottleneck_dilations=(2, 4, 8, 16),
enc_non_bottleneck_channels=(64, 128),
dec_upsample_channels=(64, 16),
dec_num_stages_non_bottleneck=(2, 2),
dec_non_bottleneck_channels=(64, 16),
dropout_ratio=0.1,
init_cfg=None),
decode_head=dict(
type='ERFHead',
in_channels=16,
channels=19,
num_classes=19,
norm_cfg=norm_cfg,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
4 changes: 4 additions & 0 deletions configs/erfnet/erfnet_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,4 @@
_base_ = [
'../_base_/models/erfnet.py', '../_base_/datasets/cityscapes_1024x1024.py',
'../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
]
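As a quick sanity check of the two configs above, the full segmentor can be built straight from the Cityscapes config. A minimal sketch, not part of the diff, assuming an mmcv/mmseg 0.x environment run from the repo root and that the ERFHead referenced by the base config is registered elsewhere in this PR:

import torch
from mmcv import Config

from mmseg.models import build_segmentor

cfg = Config.fromfile(
    'configs/erfnet/erfnet_4x4_1024x1024_160k_cityscapes.py')
# SyncBN needs a distributed process group; swap in plain BN for a
# single-process smoke test (only the decode head sets SyncBN here).
cfg.model.decode_head.norm_cfg = dict(type='BN', requires_grad=True)
model = build_segmentor(cfg.model)
model.eval()
with torch.no_grad():
    # forward_dummy (the helper used for FLOPs counting) runs
    # encode_decode and returns per-class logits resized back to the
    # input resolution.
    seg_logit = model.forward_dummy(torch.randn(1, 3, 512, 512))
assert seg_logit.shape == (1, 19, 512, 512)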
3 changes: 2 additions & 1 deletion mmseg/models/backbones/__init__.py
@@ -2,6 +2,7 @@
from .bisenetv1 import BiSeNetV1
from .bisenetv2 import BiSeNetV2
from .cgnet import CGNet
from .erfnet import ERFNet
from .fast_scnn import FastSCNN
from .hrnet import HRNet
from .icnet import ICNet
@@ -19,5 +20,5 @@
'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN',
'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3',
'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer',
'BiSeNetV1', 'BiSeNetV2', 'ICNet'
'BiSeNetV1', 'BiSeNetV2', 'ICNet', 'ERFNet'
]
345 changes: 345 additions & 0 deletions mmseg/models/backbones/erfnet.py
@@ -0,0 +1,345 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from mmcv.cnn import build_activation_layer, build_conv_layer, build_norm_layer
from mmcv.runner import BaseModule

from mmseg.ops import resize
from ..builder import BACKBONES


class DownsamplerBlock(BaseModule):
"""Downsampler block of ERFNet.

This module is a little different from a basic ConvModule: the
outputs of the Conv and MaxPool branches are concatenated before
Batch Norm.

Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
conv_cfg (dict | None): Config of conv layers.
Default: None.
norm_cfg (dict | None): Config of norm layers.
Default: dict(type='BN', eps=1e-3).
act_cfg (dict): Config of activation layers.
Default: dict(type='ReLU').
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""

def __init__(self,
in_channels,
out_channels,
conv_cfg=None,
norm_cfg=dict(type='BN', eps=1e-3),
act_cfg=dict(type='ReLU'),
init_cfg=None):
super(DownsamplerBlock, self).__init__(init_cfg=init_cfg)
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg

self.conv = build_conv_layer(
self.conv_cfg,
in_channels,
out_channels - in_channels,
kernel_size=3,
stride=2,
padding=1,
bias=True)
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
self.bn = build_norm_layer(self.norm_cfg, out_channels)[1]
self.act = build_activation_layer(self.act_cfg)

def forward(self, input):
conv_out = self.conv(input)
pool_out = self.pool(input)
pool_out = resize(
input=pool_out,
size=conv_out.size()[2:],
mode='bilinear',
align_corners=False)
output = torch.cat([conv_out, pool_out], 1)
output = self.bn(output)
output = self.act(output)
return output
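For intuition, a minimal shape check of this block (a sketch, not part of the diff; it assumes the module path added by this PR). The bilinear resize above is what keeps the two branches compatible on odd input sizes, where the stride-2 conv and the 2x2 max-pool would otherwise disagree by one pixel:

import torch

from mmseg.models.backbones.erfnet import DownsamplerBlock

block = DownsamplerBlock(in_channels=3, out_channels=16).eval()
x = torch.randn(1, 3, 65, 65)  # odd spatial size on purpose
with torch.no_grad():
    out = block(x)
# 13 conv channels + 3 pooled channels concatenate to 16 channels, at
# half resolution (rounded up by the stride-2, padding-1 conv).
assert out.shape == (1, 16, 33, 33)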


class NonBottleneck1d(BaseModule):
"""Non-bottleneck block of ERFNet.

Args:
channels (int): Number of channels in Non-bottleneck block.
drop_rate (float): Probability of an element to be zeroed.
Default 0.
dilation (int): Dilation rate of the last two conv layers.
Default 1.
conv_cfg (dict | None): Config of conv layers.
Default: None.
norm_cfg (dict | None): Config of norm layers.
Default: dict(type='BN', eps=1e-3).
act_cfg (dict): Config of activation layers.
Default: dict(type='ReLU').
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""

def __init__(self,
channels,
drop_rate=0,
dilation=1,
conv_cfg=None,
norm_cfg=dict(type='BN', eps=1e-3),
act_cfg=dict(type='ReLU'),
init_cfg=None):
super(NonBottleneck1d, self).__init__(init_cfg=init_cfg)

self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.conv3x1_1 = build_conv_layer(
self.conv_cfg,
channels,
channels,
kernel_size=(3, 1),
stride=1,
padding=(1, 0),
bias=True)

self.conv1x3_1 = build_conv_layer(
self.conv_cfg,
channels,
channels,
kernel_size=(1, 3),
stride=1,
padding=(0, 1),
bias=True)

self.bn1 = build_norm_layer(self.norm_cfg, channels)[1]

self.conv3x1_2 = build_conv_layer(
self.conv_cfg,
channels,
channels,
kernel_size=(3, 1),
stride=1,
padding=(1 * dilation, 0),
bias=True,
dilation=(dilation, 1))

self.conv1x3_2 = build_conv_layer(
self.conv_cfg,
channels,
channels,
kernel_size=(1, 3),
stride=1,
padding=(0, 1 * dilation),
bias=True,
dilation=(1, dilation))

self.bn2 = build_norm_layer(self.norm_cfg, channels)[1]
self.act = build_activation_layer(self.act_cfg)
self.dropout = nn.Dropout(p=drop_rate)

def forward(self, input):
output = self.conv3x1_1(input)
output = self.act(output)
output = self.conv1x3_1(output)
output = self.bn1(output)
output = self.act(output)

output = self.conv3x1_2(output)
output = self.act(output)
output = self.conv1x3_2(output)
output = self.bn2(output)

output = self.dropout(output)
output = self.act(output + input)
return output
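Each 3x3 convolution of a standard residual block is factorized here into a 3x1 plus a 1x3 convolution, which preserves the spatial size for any dilation, so the residual addition `output + input` always lines up. A small shape check (a sketch, not part of the diff, using the class defined above):

import torch

from mmseg.models.backbones.erfnet import NonBottleneck1d

block = NonBottleneck1d(channels=64, drop_rate=0, dilation=2).eval()
x = torch.randn(1, 64, 32, 32)
with torch.no_grad():
    out = block(x)
# Padding of (1 * dilation) exactly offsets the dilated 3x1/1x3
# kernels, so input and output shapes match for the residual sum.
assert out.shape == x.shape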


class UpsamplerBlock(BaseModule):
"""Upsampler block of ERFNet.

Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
conv_cfg (dict | None): Config of conv layers.
Default: None.
norm_cfg (dict | None): Config of norm layers.
Default: dict(type='BN', eps=1e-3).
act_cfg (dict): Config of activation layers.
Default: dict(type='ReLU').
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""

def __init__(self,
in_channels,
out_channels,
conv_cfg=None,
norm_cfg=dict(type='BN', eps=1e-3),
act_cfg=dict(type='ReLU'),
init_cfg=None):
super(UpsamplerBlock, self).__init__(init_cfg=init_cfg)
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg

self.conv = nn.ConvTranspose2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=2,
padding=1,
output_padding=1,
bias=True)
self.bn = build_norm_layer(self.norm_cfg, out_channels)[1]
self.act = build_activation_layer(self.act_cfg)

def forward(self, input):
output = self.conv(input)
output = self.bn(output)
output = self.act(output)
return output
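With kernel_size=3, stride=2, padding=1 and output_padding=1, the transposed convolution satisfies H_out = (H_in - 1) * 2 - 2 * 1 + 3 + 1 = 2 * H_in, so every UpsamplerBlock exactly doubles both spatial dimensions. A one-line check (a sketch, not part of the diff):

import torch

from mmseg.models.backbones.erfnet import UpsamplerBlock

up = UpsamplerBlock(in_channels=128, out_channels=64).eval()
with torch.no_grad():
    out = up(torch.randn(1, 128, 16, 32))
assert out.shape == (1, 64, 32, 64)  # 16x32 doubled to 32x64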


@BACKBONES.register_module()
class ERFNet(BaseModule):
"""ERFNet backbone.

This backbone is the implementation of `ERFNet: Efficient Residual
Factorized ConvNet for Real-time Semantic Segmentation
<https://ieeexplore.ieee.org/document/8063438>`_.

Args:
in_channels (int): Number of channels of the input
image. Default: 3.
enc_downsample_channels (Tuple[int]): Output channels of each
DownsamplerBlock in the encoder.
Default: (16, 64, 128).
enc_num_stages_non_bottleneck (Tuple[int]): Number of
Non-bottleneck blocks in each encoder stage.
Default: (5, 8).
enc_non_bottleneck_dilations (Tuple[int]): Dilation rates of the
Non-bottleneck blocks in the last encoder stage.
Default: (2, 4, 8, 16).
enc_non_bottleneck_channels (Tuple[int]): Channels of the
Non-bottleneck blocks in each encoder stage.
Default: (64, 128).
dec_upsample_channels (Tuple[int]): Output channels of each
UpsamplerBlock in the decoder.
Default: (64, 16).
dec_num_stages_non_bottleneck (Tuple[int]): Number of
Non-bottleneck blocks in each decoder stage.
Default: (2, 2).
dec_non_bottleneck_channels (Tuple[int]): Channels of the
Non-bottleneck blocks in each decoder stage.
Default: (64, 16).
dropout_ratio (float): Probability of an element to be zeroed.
Default 0.1.
conv_cfg (dict | None): Config of conv layers.
Default: None.
norm_cfg (dict | None): Config of norm layers.
Default: dict(type='BN', requires_grad=True).
act_cfg (dict): Config of activation layers.
Default: dict(type='ReLU').
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None.
"""

def __init__(self,
in_channels=3,
enc_downsample_channels=(16, 64, 128),
enc_num_stages_non_bottleneck=(5, 8),
enc_non_bottleneck_dilations=(2, 4, 8, 16),
enc_non_bottleneck_channels=(64, 128),
dec_upsample_channels=(64, 16),
dec_num_stages_non_bottleneck=(2, 2),
dec_non_bottleneck_channels=(64, 16),
dropout_ratio=0.1,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='ReLU'),
init_cfg=None):

super(ERFNet, self).__init__(init_cfg=init_cfg)
assert len(enc_downsample_channels) \
== len(dec_upsample_channels)+1, 'Number of downsample \
blocks of encoder does not \
match number of upsample blocks of decoder!'
assert len(enc_downsample_channels) \
== len(enc_num_stages_non_bottleneck)+1, 'Number of \
downsample blocks of encoder does not match \
number of Non-bottleneck stages of encoder!'
assert len(enc_downsample_channels) \
== len(enc_non_bottleneck_channels)+1, 'Number of \
downsample blocks of encoder does not match \
number of channels of Non-bottleneck blocks of encoder!'
assert enc_num_stages_non_bottleneck[-1] \
% len(enc_non_bottleneck_dilations) == 0, 'Number of \
Non-bottleneck blocks in the last encoder stage must be \
a multiple of the number of Non-bottleneck dilations!'
assert len(dec_upsample_channels) \
== len(dec_num_stages_non_bottleneck), 'Number of \
upsample blocks of decoder does not match \
number of Non-bottleneck stages of decoder!'
assert len(dec_num_stages_non_bottleneck) \
== len(dec_non_bottleneck_channels), 'Number of \
Non-bottleneck stages of decoder does not match \
number of channels of Non-bottleneck blocks of decoder!'

self.in_channels = in_channels
self.enc_downsample_channels = enc_downsample_channels
self.enc_num_stages_non_bottleneck = enc_num_stages_non_bottleneck
self.enc_non_bottleneck_dilations = enc_non_bottleneck_dilations
self.enc_non_bottleneck_channels = enc_non_bottleneck_channels
self.dec_upsample_channels = dec_upsample_channels
self.dec_num_stages_non_bottleneck = dec_num_stages_non_bottleneck
self.dec_non_bottleneck_channels = dec_non_bottleneck_channels
self.dropout_ratio = dropout_ratio

self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()

self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg

self.encoder.append(
DownsamplerBlock(self.in_channels, enc_downsample_channels[0]))

for i in range(len(enc_downsample_channels) - 1):
self.encoder.append(
DownsamplerBlock(enc_downsample_channels[i],
enc_downsample_channels[i + 1]))
# Last part of encoder is some dilated non_bottleneck_1d blocks.
if i == len(enc_downsample_channels) - 2:
iteration_times = int(enc_num_stages_non_bottleneck[-1] /
len(enc_non_bottleneck_dilations))
for j in range(iteration_times):
for k in range(len(enc_non_bottleneck_dilations)):
self.encoder.append(
NonBottleneck1d(enc_downsample_channels[-1],
self.dropout_ratio,
enc_non_bottleneck_dilations[k]))
else:
for j in range(enc_num_stages_non_bottleneck[i]):
self.encoder.append(
NonBottleneck1d(enc_downsample_channels[i + 1],
self.dropout_ratio))

for i in range(len(dec_upsample_channels)):
if i == 0:
self.decoder.append(
UpsamplerBlock(enc_downsample_channels[-1],
dec_non_bottleneck_channels[i]))
for j in range(dec_num_stages_non_bottleneck[i]):
self.decoder.append(
NonBottleneck1d(dec_non_bottleneck_channels[i]))
else:
self.decoder.append(
UpsamplerBlock(dec_non_bottleneck_channels[i - 1],
dec_non_bottleneck_channels[i]))
for j in range(dec_num_stages_non_bottleneck[i]):
self.decoder.append(
NonBottleneck1d(dec_non_bottleneck_channels[i]))

def forward(self, x):
for enc in self.encoder:
x = enc(x)
for dec in self.decoder:
x = dec(x)
return [x]
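Putting the stages together: three DownsamplerBlocks divide the input resolution by 8 and two UpsamplerBlocks multiply it by 4, so the single output sits at half the input resolution. A backbone-level smoke test (a sketch, not part of the diff, using the default arguments above):

import torch

from mmseg.models.backbones.erfnet import ERFNet

# Defaults mirror configs/_base_/models/erfnet.py.
backbone = ERFNet().eval()
with torch.no_grad():
    outs = backbone(torch.randn(1, 3, 512, 1024))
# 512x1024 -> /8 via three DownsamplerBlocks -> x4 via two
# UpsamplerBlocks = half resolution, with dec_non_bottleneck_channels[-1]
# = 16 channels, matching in_channels=16 of the decode head config.
assert len(outs) == 1
assert outs[0].shape == (1, 16, 256, 512)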