diff --git a/configs/enet/README.md b/configs/enet/README.md
new file mode 100644
index 0000000000..e098df2a7a
--- /dev/null
+++ b/configs/enet/README.md
@@ -0,0 +1,13 @@
+# ENet: A Deep Neural Network Architecture for Real-Time Semantic Segmentation
+
+## Reference
+> Adam Paszke, Abhishek Chaurasia, Sangpil Kim, Eugenio Culurciello. "ENet: A Deep Neural Network Architecture for
+Real-Time Semantic Segmentation." arXiv preprint arXiv:1606.02147 (2016).
+
+## Performance
+
+### Cityscapes
+
+| Model | Backbone | Resolution | Training Iters | mIoU | mIoU (flip) | mIoU (ms+flip) | Links |
+|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
+|ENet|-|1024x512|80000|67.42%|68.11%|67.99%|[model](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/bisenetv1_cityscapes_1024x512_160k/model.pdparams)\|[log](https://bj.bcebos.com/paddleseg/dygraph/cityscapes/bisenetv1_cityscapes_1024x512_160k/train.log)\|[vdl](https://paddlepaddle.org.cn/paddle/visualdl/service/app?id=5d57386cdfcdb6a6bcb5135af134a0f2)|
diff --git a/configs/enet/enet_cityscapes_1024x512_adam_0.002_80k.yml b/configs/enet/enet_cityscapes_1024x512_adam_0.002_80k.yml
new file mode 100644
index 0000000000..7abfa2f89b
--- /dev/null
+++ b/configs/enet/enet_cityscapes_1024x512_adam_0.002_80k.yml
@@ -0,0 +1,31 @@
+_base_: '../_base_/cityscapes.yml'
+batch_size: 8
+
+train_dataset:
+  type: Cityscapes
+  dataset_root: data/cityscapes
+  transforms:
+    - type: RandomPaddingCrop
+      crop_size: [1024, 512]
+    - type: RandomDistort
+      brightness_range: 0.4
+      contrast_range: 0.4
+      saturation_range: 0.4
+    - type: Normalize
+  mode: train
+
+model:
+  type: ENet
+  num_classes: 19
+  pretrained: Null
+
+optimizer:
+  _inherited_: False
+  type: adam
+  weight_decay: 0.0002
+
+lr_scheduler:
+  end_lr: 0
+  learning_rate: 0.001
+  power: 0.9
+  type: PolynomialDecay
diff --git a/paddleseg/models/__init__.py b/paddleseg/models/__init__.py
index dd69d1838e..62b3cc64e8
--- a/paddleseg/models/__init__.py
+++ b/paddleseg/models/__init__.py
@@ -51,6 +51,7 @@
 from .espnet import ESPNetV2
 from .dmnet import DMNet
 from .espnetv1 import ESPNetV1
+from .enet import ENet
 from .bisenetv1 import BiseNetV1
 from .fastfcn import FastFCN
-from .pfpnnet import PFPNNet
+from .pfpnnet import PFPNNet
\ No newline at end of file
diff --git a/paddleseg/models/enet.py b/paddleseg/models/enet.py
new file mode 100644
index 0000000000..3971a48e41
--- /dev/null
+++ b/paddleseg/models/enet.py
@@ -0,0 +1,622 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleseg import utils
+from paddleseg.models import layers
+from paddleseg.cvlibs import manager, param_init
+
+__all__ = ['ENet']
+
+
+@manager.MODELS.add_component
+class ENet(nn.Layer):
+    """
+    The ENet implementation based on PaddlePaddle.
+ + The original article refers to + Adam Paszke, Abhishek Chaurasia, Sangpil Kim, Eugenio Culurciello, et al."ENet: A Deep Neural Network Architecture for Real-Time Semantic Segmentation" + (https://arxiv.org/abs/1606.02147). + + Args: + num_classes (int): The unique number of target classes. + pretrained (str, optional): The path or url of pretrained model. Default: None. + encoder_relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: False. + decoder_relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, + num_classes, + pretrained=None, + encoder_relu=False, + decoder_relu=True): + super(ENet, self).__init__() + + self.numclasses = num_classes + self.initial_block = InitialBlock(3, 16, relu=encoder_relu) + + self.downsample1_0 = DownsamplingBottleneck(16, + 64, + return_indices=True, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_1 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_2 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_3 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + self.regular1_4 = RegularBottleneck(64, + padding=1, + dropout_prob=0.01, + relu=encoder_relu) + + self.downsample2_0 = DownsamplingBottleneck(64, + 128, + return_indices=True, + dropout_prob=0.1, + relu=encoder_relu) + self.regular2_1 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_2 = RegularBottleneck(128, + dilation=2, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric2_3 = RegularBottleneck(128, + kernel_size=5, + padding=2, + asymmetric=True, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_4 = RegularBottleneck(128, + dilation=4, + padding=4, + dropout_prob=0.1, + relu=encoder_relu) + self.regular2_5 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_6 = RegularBottleneck(128, + dilation=8, + padding=8, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric2_7 = RegularBottleneck(128, + kernel_size=5, + asymmetric=True, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated2_8 = RegularBottleneck(128, + dilation=16, + padding=16, + dropout_prob=0.1, + relu=encoder_relu) + + self.regular3_0 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_1 = RegularBottleneck(128, + dilation=2, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric3_2 = RegularBottleneck(128, + kernel_size=5, + padding=2, + asymmetric=True, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_3 = RegularBottleneck(128, + dilation=4, + padding=4, + dropout_prob=0.1, + relu=encoder_relu) + self.regular3_4 = RegularBottleneck(128, + padding=1, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_5 = RegularBottleneck(128, + dilation=8, + padding=8, + dropout_prob=0.1, + relu=encoder_relu) + self.asymmetric3_6 = RegularBottleneck(128, + kernel_size=5, + asymmetric=True, + padding=2, + dropout_prob=0.1, + relu=encoder_relu) + self.dilated3_7 = RegularBottleneck(128, + dilation=16, + padding=16, + dropout_prob=0.1, + relu=encoder_relu) + + self.upsample4_0 = UpsamplingBottleneck(128, + 64, + dropout_prob=0.1, + relu=decoder_relu) + self.regular4_1 = RegularBottleneck(64, + padding=1, + dropout_prob=0.1, + relu=decoder_relu) + self.regular4_2 = 
RegularBottleneck(64, + padding=1, + dropout_prob=0.1, + relu=decoder_relu) + + self.upsample5_0 = UpsamplingBottleneck(64, + 16, + dropout_prob=0.1, + relu=decoder_relu) + self.regular5_1 = RegularBottleneck(16, + padding=1, + dropout_prob=0.1, + relu=decoder_relu) + self.transposed_conv = nn.Conv2DTranspose(16, + num_classes, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False) + + self.pretrained = pretrained + self.init_weight() + + def forward(self, x): + + input_size = x.shape + x = self.initial_block(x) + + stage1_input_size = x.shape + x, max_indices1_0 = self.downsample1_0(x) + x = self.regular1_1(x) + x = self.regular1_2(x) + x = self.regular1_3(x) + x = self.regular1_4(x) + + stage2_input_size = x.shape + x, max_indices2_0 = self.downsample2_0(x) + x = self.regular2_1(x) + x = self.dilated2_2(x) + x = self.asymmetric2_3(x) + x = self.dilated2_4(x) + x = self.regular2_5(x) + x = self.dilated2_6(x) + x = self.asymmetric2_7(x) + x = self.dilated2_8(x) + + x = self.regular3_0(x) + x = self.dilated3_1(x) + x = self.asymmetric3_2(x) + x = self.dilated3_3(x) + x = self.regular3_4(x) + x = self.dilated3_5(x) + x = self.asymmetric3_6(x) + x = self.dilated3_7(x) + + x = self.upsample4_0(x, max_indices2_0, output_size=stage2_input_size) + x = self.regular4_1(x) + x = self.regular4_2(x) + + x = self.upsample5_0(x, max_indices1_0, output_size=stage1_input_size) + x = self.regular5_1(x) + x = self.transposed_conv(x, output_size=input_size[2:]) + return [x] + + def init_weight(self): + if self.pretrained is not None: + utils.load_pretrained_model(self, self.pretrained) + + +class InitialBlock(nn.Layer): + """ + The initial block is composed of two branches: + 1. a main branch which performs a regular convolution with stride 2; + 2. an extension branch which performs max-pooling. + Doing both operations in parallel and concatenating their results + allows for efficient downsampling and expansion. The main branch + outputs 13 feature maps while the extension branch outputs 3, for a + total of 16 feature maps after concatenation. + + Args: + in_channels (int): the number of input channels. + out_channels (int): the number output channels. + kernel_size (int, optional): the kernel size of the filters used in + the convolution layer. Default: 3. + padding (int, optional): zero-padding added to both sides of the + input. Default: 0. + bias (bool, optional): Adds a learnable bias to the output if + ``True``. Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, in_channels, out_channels, bias=False, relu=True): + super(InitialBlock, self).__init__() + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.main_branch = nn.Conv2D(in_channels, + out_channels - 3, + kernel_size=3, + stride=2, + padding=1, + bias_attr=bias) + + self.ext_branch = nn.MaxPool2D(3, stride=2, padding=1) + + self.batch_norm = layers.SyncBatchNorm(out_channels) + + self.out_activation = activation() + + def forward(self, x): + main = self.main_branch(x) + ext = self.ext_branch(x) + + out = paddle.concat((main, ext), 1) + + out = self.batch_norm(out) + + return self.out_activation(out) + + +class RegularBottleneck(nn.Layer): + """ + Regular bottlenecks are the main building block of ENet. + Main branch: + 1. Shortcut connection. + Extension branch: + 1. 1x1 convolution which decreases the number of channels by + ``internal_ratio``, also called a projection; + 2. 
regular, dilated or asymmetric convolution; + 3. 1x1 convolution which increases the number of channels back to + ``channels``, also called an expansion; + 4. dropout as a regularizer. + + Args: + channels (int): the number of input and output channels. + internal_ratio (int, optional): a scale factor applied to + ``channels`` used to compute the number of + channels after the projection. eg. given ``channels`` equal to 128 and + internal_ratio equal to 2 the number of channels after the projection + is 64. Default: 4. + kernel_size (int, optional): the kernel size of the filters used in + the convolution layer described above in item 2 of the extension + branch. Default: 3. + padding (int, optional): zero-padding added to both sides of the + input. Default: 0. + dilation (int, optional): spacing between kernel elements for the + convolution described in item 2 of the extension branch. Default: 1. + asymmetric (bool, optional): flags if the convolution described in + item 2 of the extension branch is asymmetric or not. Default: False. + dropout_prob (float, optional): probability of an element to be + zeroed. Default: 0 (no dropout). + bias (bool, optional): Adds a learnable bias to the output if + ``True``. Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, + channels, + internal_ratio=4, + kernel_size=3, + padding=0, + dilation=1, + asymmetric=False, + dropout_prob=0, + bias=False, + relu=True): + super(RegularBottleneck, self).__init__() + + if internal_ratio <= 1 or internal_ratio > channels: + raise RuntimeError( + "Value out of range. Expected value in the " + "interval [1, {0}], got internal_scale={1}.".format( + channels, internal_ratio)) + + internal_channels = channels // internal_ratio + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.ext_conv1 = nn.Sequential( + nn.Conv2D(channels, + internal_channels, + kernel_size=1, + stride=1, + bias_attr=bias), layers.SyncBatchNorm(internal_channels), + activation()) + + if asymmetric: + self.ext_conv2 = nn.Sequential( + nn.Conv2D(internal_channels, + internal_channels, + kernel_size=(kernel_size, 1), + stride=1, + padding=(padding, 0), + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), activation(), + nn.Conv2D(internal_channels, + internal_channels, + kernel_size=(1, kernel_size), + stride=1, + padding=(0, padding), + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), activation()) + else: + self.ext_conv2 = nn.Sequential( + nn.Conv2D(internal_channels, + internal_channels, + kernel_size=kernel_size, + stride=1, + padding=padding, + dilation=dilation, + bias_attr=bias), + layers.SyncBatchNorm(internal_channels), activation()) + + self.ext_conv3 = nn.Sequential( + nn.Conv2D(internal_channels, + channels, + kernel_size=1, + stride=1, + bias_attr=bias), layers.SyncBatchNorm(channels), + activation()) + + self.ext_regul = nn.Dropout2D(p=dropout_prob) + + self.out_activation = activation() + + def forward(self, x): + main = x + + ext = self.ext_conv1(x) + ext = self.ext_conv2(ext) + ext = self.ext_conv3(ext) + ext = self.ext_regul(ext) + + out = main + ext + + return self.out_activation(out) + + +class DownsamplingBottleneck(nn.Layer): + """ + Downsampling bottlenecks further downsample the feature map size. + Main branch: + 1. max pooling with stride 2; indices are saved to be used for + unpooling later. + Extension branch: + 1. 
2x2 convolution with stride 2 that decreases the number of channels + by ``internal_ratio``, also called a projection; + 2. regular convolution (by default, 3x3); + 3. 1x1 convolution which increases the number of channels to + ``out_channels``, also called an expansion; + 4. dropout as a regularizer. + + Args: + in_channels (int): the number of input channels. + out_channels (int): the number of output channels. + internal_ratio (int, optional): a scale factor applied to ``channels`` + used to compute the number of channels after the projection. eg. given + ``channels`` equal to 128 and internal_ratio equal to 2 the number of + channels after the projection is 64. Default: 4. + return_indices (bool, optional): if ``True``, will return the max + indices along with the outputs. Useful when unpooling later. + dropout_prob (float, optional): probability of an element to be + zeroed. Default: 0 (no dropout). + bias (bool, optional): Adds a learnable bias to the output if + ``True``. Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, + in_channels, + out_channels, + internal_ratio=4, + return_indices=False, + dropout_prob=0, + bias=False, + relu=True): + super(DownsamplingBottleneck, self).__init__() + + self.return_indices = return_indices + + if internal_ratio <= 1 or internal_ratio > in_channels: + raise RuntimeError( + "Value out of range. Expected value in the " + "interval [1, {0}], got internal_scale={1}. ".format( + in_channels, internal_ratio)) + + internal_channels = in_channels // internal_ratio + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.main_max1 = nn.MaxPool2D(2, stride=2, return_mask=return_indices) + + self.ext_conv1 = nn.Sequential( + nn.Conv2D(in_channels, + internal_channels, + kernel_size=2, + stride=2, + bias_attr=bias), layers.SyncBatchNorm(internal_channels), + activation()) + + self.ext_conv2 = nn.Sequential( + nn.Conv2D(internal_channels, + internal_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=bias), layers.SyncBatchNorm(internal_channels), + activation()) + + self.ext_conv3 = nn.Sequential( + nn.Conv2D(internal_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=bias), layers.SyncBatchNorm(out_channels), + activation()) + + self.ext_regul = nn.Dropout2D(p=dropout_prob) + + self.out_activation = activation() + + def forward(self, x): + if self.return_indices: + main, max_indices = self.main_max1(x) + else: + main = self.main_max1(x) + + ext = self.ext_conv1(x) + ext = self.ext_conv2(ext) + ext = self.ext_conv3(ext) + ext = self.ext_regul(ext) + + n, ch_ext, h, w = ext.shape + ch_main = main.shape[1] + padding = paddle.zeros((n, ch_ext - ch_main, h, w)) + + main = paddle.concat((main, padding), 1) + + out = main + ext + + return self.out_activation(out), max_indices + + +class UpsamplingBottleneck(nn.Layer): + """ + The upsampling bottlenecks upsample the feature map resolution using max + pooling indices stored from the corresponding downsampling bottleneck. + Main branch: + 1. 1x1 convolution with stride 1 that decreases the number of channels by + ``internal_ratio``, also called a projection; + 2. max unpool layer using the max pool indices from the corresponding + downsampling max pool layer. + Extension branch: + 1. 1x1 convolution with stride 1 that decreases the number of channels by + ``internal_ratio``, also called a projection; + 2. transposed convolution (by default, 3x3); + 3. 
1x1 convolution which increases the number of channels to + ``out_channels``, also called an expansion; + 4. dropout as a regularizer. + + Args: + in_channels (int): the number of input channels. + out_channels (int): the number of output channels. + internal_ratio (int, optional): a scale factor applied to ``in_channels`` + used to compute the number of channels after the projection. eg. given + ``in_channels`` equal to 128 and ``internal_ratio`` equal to 2 the number + of channels after the projection is 64. Default: 4. + dropout_prob (float, optional): probability of an element to be zeroed. + Default: 0 (no dropout). + bias (bool, optional): Adds a learnable bias to the output if ``True``. + Default: False. + relu (bool, optional): When ``True`` ReLU is used as the activation + function; otherwise, PReLU is used. Default: True. + """ + def __init__(self, + in_channels, + out_channels, + internal_ratio=4, + dropout_prob=0, + bias=False, + relu=True): + super(UpsamplingBottleneck, self).__init__() + + if internal_ratio <= 1 or internal_ratio > in_channels: + raise RuntimeError( + "Value out of range. Expected value in the " + "interval [1, {0}], got internal_scale={1}. ".format( + in_channels, internal_ratio)) + + internal_channels = in_channels // internal_ratio + + if relu: + activation = nn.ReLU + else: + activation = nn.PReLU + + self.main_conv1 = nn.Sequential( + nn.Conv2D(in_channels, out_channels, kernel_size=1, bias_attr=bias), + layers.SyncBatchNorm(out_channels)) + + self.ext_conv1 = nn.Sequential( + nn.Conv2D(in_channels, + internal_channels, + kernel_size=1, + bias_attr=bias), layers.SyncBatchNorm(internal_channels), + activation()) + + self.ext_tconv1 = nn.Conv2DTranspose(internal_channels, + internal_channels, + kernel_size=2, + stride=2, + bias_attr=bias) + self.ext_tconv1_bnorm = layers.SyncBatchNorm(internal_channels) + self.ext_tconv1_activation = activation() + + self.ext_conv2 = nn.Sequential( + nn.Conv2D(internal_channels, + out_channels, + kernel_size=1, + bias_attr=bias), layers.SyncBatchNorm(out_channels)) + + self.ext_regul = nn.Dropout2D(p=dropout_prob) + + self.out_activation = activation() + + def forward(self, x, max_indices, output_size): + main = self.main_conv1(x) + main = F.max_unpool2d(main, + max_indices, + kernel_size=2, + output_size=output_size) + + ext = self.ext_conv1(x) + ext = self.ext_tconv1(ext, output_size=output_size[2:]) + ext = self.ext_tconv1_bnorm(ext) + ext = self.ext_tconv1_activation(ext) + ext = self.ext_conv2(ext) + ext = self.ext_regul(ext) + + out = main + ext + + return self.out_activation(out)
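For a quick sanity check of the new component, the following minimal sketch (not part of this diff) instantiates `ENet` through the `paddleseg.models` export added in `__init__.py` and runs one forward pass on a random tensor. It assumes the snippet is executed from the repository root with a PaddlePaddle build recent enough to provide `paddle.nn.functional.max_unpool2d`, which the upsampling bottlenecks rely on; the file name and tensor sizes are illustrative only.

```python
# check_enet.py -- hypothetical smoke test, not part of this PR.
import paddle

from paddleseg.models import ENet  # exported via the __init__.py change above

# Cityscapes-style setup: 19 classes, no pretrained weights.
model = ENet(num_classes=19, pretrained=None)
model.eval()

# The encoder halves the resolution three times (initial block plus two
# downsampling bottlenecks), so height and width should be divisible by 8;
# 512x1024 matches the training crop in the config.
x = paddle.randn([1, 3, 512, 1024])

with paddle.no_grad():
    logits = model(x)[0]  # forward() returns a single-element list

print(logits.shape)  # expected: [1, 19, 512, 1024]
```

If the shapes check out, the config should be exercisable end to end with PaddleSeg's usual launcher, e.g. `python train.py --config configs/enet/enet_cityscapes_1024x512_adam_0.002_80k.yml` (exact flags depend on the PaddleSeg version in use).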