From 46dce780fa2e4d6cf71c3de57c649f5d924dce2c Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 17 Jun 2021 16:53:51 +0800 Subject: [PATCH 01/38] add DPT head --- mmseg/models/decode_heads/dpt.py | 194 +++++++++++++++++++++++ mmseg/models/utils/__init__.py | 4 +- mmseg/models/utils/post_process_layer.py | 66 ++++++++ 3 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 mmseg/models/decode_heads/dpt.py create mode 100644 mmseg/models/utils/post_process_layer.py diff --git a/mmseg/models/decode_heads/dpt.py b/mmseg/models/decode_heads/dpt.py new file mode 100644 index 0000000000..d5f3a1c58f --- /dev/null +++ b/mmseg/models/decode_heads/dpt.py @@ -0,0 +1,194 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (Conv2d, ConvModule, ConvTranspose2d, + build_activation_layer, build_norm_layer) + +from ..builder import HEADS +from ..utils import Transpose, _make_readout_ops +from .decode_head import BaseDecodeHead + + +class ViTPostProcessBlock(nn.Module): + + def __init__(self, + channels=768, + out_channels=[96, 192, 384, 768], + img_size=[384, 384], + readout_type='ignore', + start_index=1, + kernel_sizes=[4, 2, 1, 3], + strides=[4, 2, 1, 2], + paddings=[0, 0, 0, 1]): + super(ViTPostProcessBlock, self).__init__() + + self.readout_ops = _make_readout_ops(channels, out_channels, + readout_type, start_index) + + self.unflatten_size = torch.Size(img_size[0] // 16, img_size[1] // 16) + + self.post_process_ops = [] + for idx, out_channels in enumerate(out_channels): + self.post_process_ops.append( + nn.Sequential( + self.readout_ops[idx], Transpose(1, 2), + nn.Unflatten(2, self.unflatten_size), + Conv2d(channels, out_channels, kernel_size=1), + ConvTranspose2d( + out_channels, + out_channels, + kernel_size=kernel_sizes[idx], + stride=strides[idx], + padding=paddings[idx]))) + + def forward(self, inputs): + assert len(inputs) == len(self.readout_ops) + for idx, x in enumerate(inputs): + inputs[idx] = self.post_process_ops[idx](x) + return inputs + + +class ResidualConvUnit(nn.Module): + + def __init__(self, + in_channels, + act_cfg=dict(type='ReLU'), + norm_cfg=dict(type='BN')): + super(ResidualConvUnit, self).__init__() + self.channels = in_channels + + self.activation = build_activation_layer(act_cfg) + self.bn = False if norm_cfg is None else True + self.bias = not self.bn + + self.conv1 = Conv2d( + self.channels, + self.channels, + kernel_size=3, + padding=1, + bias=self.bias) + + self.conv2 = Conv2d( + self.channels, + self.channels, + kernel_size=3, + padding=1, + bias=self.bias) + + if self.bn: + self.bn1 = build_norm_layer(norm_cfg, self.channels) + self.bn2 = build_norm_layer(norm_cfg, self.channels) + + def forward(self, inputs): + x = self.activation(inputs) + x = self.conv1(x) + if self.bn: + x = self.bn1(x) + + x = self.activation(x) + x = self.conv2(x) + if self.bn: + x = self.bn2(x) + + return x + inputs + + +class FeatureFusionBlock(nn.Module): + + def __init__(self, + in_channels, + act_cfg=None, + norm_cfg=None, + deconv=False, + expand=False, + align_corners=True): + super(FeatureFusionBlock, self).__init__() + + self.in_channels = in_channels + self.expand = expand + self.deconv = deconv + self.align_corners = align_corners + + self.out_channels = in_channels + if self.expand: + self.out_channels = in_channels // 2 + + self.out_conv = Conv2d( + self.in_channels, self.out_channels, kernel_size=1) + + self.res_conv_unit1 = ResidualConvUnit(self.in_channels, act_cfg, + norm_cfg) + self.res_conv_unit2 = 
ResidualConvUnit(self.in_channels, act_cfg, + norm_cfg) + + def forward(self, *inputs): + x = inputs[0] + if len(inputs) == 2: + x = x + self.res_conv_unit1(inputs[1]) + x = self.res_conv_unit2(x) + x = F.interpolate( + x, + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners) + return self.out_conv(x) + + +@HEADS.register_module() +class DPTHead(BaseDecodeHead): + + def __init__(self, + num_classes, + in_channels=256, + img_size=[384, 384], + channels=[96, 192, 384, 768], + readout_type='ignore', + patch_start_index=1, + post_process_kernel_size=[4, 2, 1, 3], + post_process_strides=[4, 2, 1, 2], + post_process_paddings=[0, 0, 0, 1], + expand_channels=False, + act_cfg=None, + norm_cfg=None): + super(DPTHead, self).__init__() + + self.in_channels = in_channels + self.num_classes = num_classes + self.channels = channels + self.expand_channels = expand_channels + self.post_process_block = ViTPostProcessBlock( + in_channels, channels, img_size, readout_type, patch_start_index, + post_process_kernel_size, post_process_strides, + post_process_paddings) + + out_channels = [ + channel * math.pow(2, idx) if expand_channels else channel + for idx, channel in enumerate(channels) + ] + self.convs = [] + for idx, channel in enumerate(channels): + self.convs.append( + Conv2d(channel, out_channels[idx], kernel_size=3, padding=1)) + + self.refinenet0 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg) + self.refinenet1 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg) + self.refinenet2 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg) + self.refinenet3 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg) + + self.conv = ConvModule( + self.in_channels, self.in_channels, kernel_size=3, padding=1) + + def forward(self, inputs): + x = self.post_process_block(self._transform_inputs(inputs)) + x = [self.convs[idx](feature) for idx, feature in enumerate(x)] + + path_3 = self.refinenet3(x[3]) + path_2 = self.refinenet2(path_3, x[2]) + path_1 = self.refinenet1(path_2, x[1]) + path_0 = self.refinenet0(path_1, x[0]) + + x = self.conv(path_0) + output = self.cls_seg(x) + return output diff --git a/mmseg/models/utils/__init__.py b/mmseg/models/utils/__init__.py index 3d3bdd349b..b7141aab27 100644 --- a/mmseg/models/utils/__init__.py +++ b/mmseg/models/utils/__init__.py @@ -1,6 +1,7 @@ from .drop import DropPath from .inverted_residual import InvertedResidual, InvertedResidualV3 from .make_divisible import make_divisible +from .post_process_layer import Transpose, _make_readout_ops from .res_layer import ResLayer from .se_layer import SELayer from .self_attention_block import SelfAttentionBlock @@ -9,5 +10,6 @@ __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', - 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' + 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', + 'trunc_normal_', '_make_readout_ops', 'Transpose' ] diff --git a/mmseg/models/utils/post_process_layer.py b/mmseg/models/utils/post_process_layer.py new file mode 100644 index 0000000000..8e4fa57be8 --- /dev/null +++ b/mmseg/models/utils/post_process_layer.py @@ -0,0 +1,66 @@ +import torch +import torch.nn as nn + + +class Readout(nn.Module): + + def __init__(self, start_index=1): + super(Readout, self).__init__() + self.start_index = start_index + + +class Slice(Readout): + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(Readout): + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + 
readout = x[:, 0]
+        return x[:, self.start_index:] + readout.unsqueeze(1)
+
+
+class ProjectReadout(Readout):
+
+    def __init__(self, in_channels, start_index=1):
+        super().__init__(start_index=start_index)
+        self.project = nn.Sequential(
+            nn.Linear(2 * in_channels, in_channels), nn.GELU)
+
+    def forward(self, x):
+        readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:])
+        features = torch.cat((x[:, self.start_index:], readout), -1)
+        return self.project(features)
+
+
+def _make_readout_ops(channels, out_channels, readout_type, start_index):
+    if readout_type == 'ignore':
+        readout_ops = [Slice(start_index) for _ in out_channels]
+    elif readout_type == 'add':
+        readout_ops = [AddReadout(start_index) for _ in out_channels]
+    elif readout_type == 'project':
+        readout_ops = [
+            ProjectReadout(channels, start_index) for _ in out_channels
+        ]
+    else:
+        assert f"unexpected readout operation type, expected 'ignore',\
+            'add' or 'project', but got {readout_type}"
+
+    return readout_ops
+
+
+class Transpose(nn.Module):
+
+    def __init__(self, dim0, dim1):
+        super(Transpose, self).__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x):
+        x = x.transpose(self.dim0, self.dim1)
+        return x
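The readout ops committed above decide what happens to the ViT cls_token before the token sequence is reshaped into a feature map. A minimal sketch of how the 'ignore' and 'add' variants behave on a toy token tensor (shapes are illustrative only; this assumes post_process_layer.py is importable as committed in PATCH 01):

    import torch
    from mmseg.models.utils.post_process_layer import AddReadout, Slice

    # [batch, cls_token + 8x8 patch tokens, embed_dim]
    tokens = torch.randn(2, 1 + 64, 768)

    # 'ignore': drop the cls_token and keep only the patch tokens
    assert Slice(start_index=1)(tokens).shape == (2, 64, 768)
    # 'add': broadcast-add the cls_token onto every patch token
    assert AddReadout(start_index=1)(tokens).shape == (2, 64, 768)

The 'project' variant is left out of the sketch: as committed, ProjectReadout passes the class nn.GELU (rather than an nn.GELU() instance) to nn.Sequential, so constructing it raises a TypeError; PATCH 15 below ("fix gelu") fixes the same pitfall in the rewritten head.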
From 7b80fd0a0e9a65b948648f3eafc78f614d05e9fa Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Thu, 17 Jun 2021 19:49:41 +0800
Subject: [PATCH 02/38] [fix] fix init error

---
 mmseg/models/decode_heads/__init__.py | 3 +-
 mmseg/models/decode_heads/dpt.py | 61 +++++++++++++++------------
 2 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py
index 662aae3c00..f93c5cc49f 100644
--- a/mmseg/models/decode_heads/__init__.py
+++ b/mmseg/models/decode_heads/__init__.py
@@ -5,6 +5,7 @@
 from .da_head import DAHead
 from .dm_head import DMHead
 from .dnl_head import DNLHead
+from .dpt import DPTHead
 from .ema_head import EMAHead
 from .enc_head import EncHead
 from .fcn_head import FCNHead
@@ -24,5 +25,5 @@
     'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead',
     'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead',
     'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead',
-    'PointHead', 'APCHead', 'DMHead', 'LRASPPHead'
+    'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'DPTHead'
 ]

diff --git a/mmseg/models/decode_heads/dpt.py b/mmseg/models/decode_heads/dpt.py
index d5f3a1c58f..162b095943 100644
--- a/mmseg/models/decode_heads/dpt.py
+++ b/mmseg/models/decode_heads/dpt.py
@@ -14,7 +14,7 @@ class ViTPostProcessBlock(nn.Module):
 
     def __init__(self,
-                 channels=768,
+                 in_channels=768,
                  out_channels=[96, 192, 384, 768],
                  img_size=[384, 384],
                  readout_type='ignore',
@@ -24,10 +24,11 @@ def __init__(self,
                  paddings=[0, 0, 0, 1]):
         super(ViTPostProcessBlock, self).__init__()
 
-        self.readout_ops = _make_readout_ops(channels, out_channels,
+        self.readout_ops = _make_readout_ops(in_channels, out_channels,
                                              readout_type, start_index)
 
-        self.unflatten_size = torch.Size(img_size[0] // 16, img_size[1] // 16)
+        self.unflatten_size = torch.Size(
+            [img_size[0] // 16, img_size[1] // 16])
 
         self.post_process_ops = []
         for idx, out_channels in enumerate(out_channels):
@@ -35,7 +36,7 @@ def __init__(self,
                 nn.Sequential(
                     self.readout_ops[idx], Transpose(1, 2),
                     nn.Unflatten(2, self.unflatten_size),
-                    Conv2d(channels, out_channels, kernel_size=1),
+                    Conv2d(in_channels, out_channels, kernel_size=1),
                     ConvTranspose2d(
                         out_channels,
                         out_channels,
@@ -52,10 +53,7 @@ def forward(self, inputs):
 
 class ResidualConvUnit(nn.Module):
 
-    def __init__(self,
-                 in_channels,
-                 act_cfg=dict(type='ReLU'),
-                 norm_cfg=dict(type='BN')):
+    def __init__(self, in_channels, act_cfg=None, norm_cfg=None):
         super(ResidualConvUnit, self).__init__()
         self.channels = in_channels
 
@@ -138,50 +136,57 @@ def forward(self, *inputs):
 
 @HEADS.register_module()
 class DPTHead(BaseDecodeHead):
+    """Vision Transformers for Dense Prediction.
+
+    This head is implemented of `DPT <https://arxiv.org/abs/2103.13413>`_.
+
+    Args:
+    """
 
     def __init__(self,
-                 num_classes,
-                 in_channels=256,
                  img_size=[384, 384],
-                 channels=[96, 192, 384, 768],
+                 out_channels=[96, 192, 384, 768],
                  readout_type='ignore',
                  patch_start_index=1,
                  post_process_kernel_size=[4, 2, 1, 3],
                  post_process_strides=[4, 2, 1, 2],
                  post_process_paddings=[0, 0, 0, 1],
                  expand_channels=False,
-                 act_cfg=None,
-                 norm_cfg=None):
-        super(DPTHead, self).__init__()
+                 act_cfg=dict(type='ReLU'),
+                 norm_cfg=dict(type='BN'),
+                 **kwards):
+        super(DPTHead, self).__init__(**kwards)
 
-        self.in_channels = in_channels
-        self.num_classes = num_classes
-        self.channels = channels
+        self.in_channels = self.in_channels
+        self.out_channels = out_channels
         self.expand_channels = expand_channels
         self.post_process_block = ViTPostProcessBlock(
-            in_channels, channels, img_size, readout_type, patch_start_index,
-            post_process_kernel_size, post_process_strides,
+            self.channels, out_channels, img_size, readout_type,
+            patch_start_index, post_process_kernel_size, post_process_strides,
             post_process_paddings)
 
         out_channels = [
             channel * math.pow(2, idx) if expand_channels else channel
-            for idx, channel in enumerate(channels)
+            for idx, channel in enumerate(self.out_channels)
         ]
         self.convs = []
-        for idx, channel in enumerate(channels):
+        for idx, channel in enumerate(self.out_channels):
             self.convs.append(
-                Conv2d(channel, out_channels[idx], kernel_size=3, padding=1))
+                Conv2d(
+                    channel, self.out_channels[idx], kernel_size=3, padding=1))
 
-        self.refinenet0 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg)
-        self.refinenet1 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg)
-        self.refinenet2 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg)
-        self.refinenet3 = FeatureFusionBlock(in_channels, act_cfg, norm_cfg)
+        self.refinenet0 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg)
+        self.refinenet1 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg)
+        self.refinenet2 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg)
+        self.refinenet3 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg)
 
         self.conv = ConvModule(
-            self.in_channels, self.in_channels, kernel_size=3, padding=1)
+            self.channels, self.channels, kernel_size=3, padding=1)
 
     def forward(self, inputs):
-        x = self.post_process_block(self._transform_inputs(inputs))
+        x = self._transform_inputs(inputs)
+        x = self.post_process_block(x)
+
         x = [self.convs[idx](feature) for idx, feature in enumerate(x)]
 
         path_3 = self.refinenet3(x[3])

From 01b3da266e3ed36869c3fcdfa642d13b47f0ac78 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Fri, 18 Jun 2021 18:57:09 +0800
Subject: [PATCH 03/38] use mmcv function

---
 mmseg/models/decode_heads/dpt.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/mmseg/models/decode_heads/dpt.py b/mmseg/models/decode_heads/dpt.py
index 162b095943..b834c75e9c 100644
--- a/mmseg/models/decode_heads/dpt.py
+++ b/mmseg/models/decode_heads/dpt.py
@@ -2,12 +2,12 @@
 
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import (Conv2d, ConvModule, ConvTranspose2d,
-                      build_activation_layer, 
build_norm_layer) +from mmcv.cnn import (Conv2d, ConvModule, build_activation_layer, + build_norm_layer, build_upsample_layer) +from mmseg.ops import resize from ..builder import HEADS -from ..utils import Transpose, _make_readout_ops +from ..utils import _make_readout_ops from .decode_head import BaseDecodeHead @@ -34,10 +34,10 @@ def __init__(self, for idx, out_channels in enumerate(out_channels): self.post_process_ops.append( nn.Sequential( - self.readout_ops[idx], Transpose(1, 2), - nn.Unflatten(2, self.unflatten_size), - Conv2d(in_channels, out_channels, kernel_size=1), - ConvTranspose2d( + self.readout_ops[idx], torch.Tensor.transpose(1, 2), + torch.Tensor.view(self.unflatten_size), + ConvModule(in_channels, out_channels, kernel_size=1), + build_upsample_layer( out_channels, out_channels, kernel_size=kernel_sizes[idx], @@ -61,14 +61,14 @@ def __init__(self, in_channels, act_cfg=None, norm_cfg=None): self.bn = False if norm_cfg is None else True self.bias = not self.bn - self.conv1 = Conv2d( + self.conv1 = ConvModule( self.channels, self.channels, kernel_size=3, padding=1, bias=self.bias) - self.conv2 = Conv2d( + self.conv2 = ConvModule( self.channels, self.channels, kernel_size=3, @@ -76,8 +76,8 @@ def __init__(self, in_channels, act_cfg=None, norm_cfg=None): bias=self.bias) if self.bn: - self.bn1 = build_norm_layer(norm_cfg, self.channels) - self.bn2 = build_norm_layer(norm_cfg, self.channels) + _, self.bn1 = build_norm_layer(norm_cfg, self.channels) + _, self.bn2 = build_norm_layer(norm_cfg, self.channels) def forward(self, inputs): x = self.activation(inputs) @@ -126,7 +126,7 @@ def forward(self, *inputs): if len(inputs) == 2: x = x + self.res_conv_unit1(inputs[1]) x = self.res_conv_unit2(x) - x = F.interpolate( + x = resize( x, scale_factor=2, mode='bilinear', From e9df435f81032cf2a1eb3d6282141842513f8847 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Sat, 19 Jun 2021 14:36:04 +0800 Subject: [PATCH 04/38] delete code --- mmseg/models/decode_heads/dpt.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/mmseg/models/decode_heads/dpt.py b/mmseg/models/decode_heads/dpt.py index b834c75e9c..11e0c8cec7 100644 --- a/mmseg/models/decode_heads/dpt.py +++ b/mmseg/models/decode_heads/dpt.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn from mmcv.cnn import (Conv2d, ConvModule, build_activation_layer, - build_norm_layer, build_upsample_layer) + build_norm_layer) from mmseg.ops import resize from ..builder import HEADS @@ -19,9 +19,7 @@ def __init__(self, img_size=[384, 384], readout_type='ignore', start_index=1, - kernel_sizes=[4, 2, 1, 3], - strides=[4, 2, 1, 2], - paddings=[0, 0, 0, 1]): + scale_factors=[4, 2, 1, 0.5]): super(ViTPostProcessBlock, self).__init__() self.readout_ops = _make_readout_ops(in_channels, out_channels, @@ -30,24 +28,9 @@ def __init__(self, self.unflatten_size = torch.Size( [img_size[0] // 16, img_size[1] // 16]) - self.post_process_ops = [] - for idx, out_channels in enumerate(out_channels): - self.post_process_ops.append( - nn.Sequential( - self.readout_ops[idx], torch.Tensor.transpose(1, 2), - torch.Tensor.view(self.unflatten_size), - ConvModule(in_channels, out_channels, kernel_size=1), - build_upsample_layer( - out_channels, - out_channels, - kernel_size=kernel_sizes[idx], - stride=strides[idx], - padding=paddings[idx]))) - def forward(self, inputs): assert len(inputs) == len(self.readout_ops) - for idx, x in enumerate(inputs): - inputs[idx] = self.post_process_ops[idx](x) + return inputs From 
b21ea1505910e2d84d7d9aad61bfe050a2f73264 Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Sat, 19 Jun 2021 15:33:19 +0800
Subject: [PATCH 05/38] remove transpose class

---
 mmseg/models/utils/post_process_layer.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/mmseg/models/utils/post_process_layer.py b/mmseg/models/utils/post_process_layer.py
index 8e4fa57be8..f341c43f9b 100644
--- a/mmseg/models/utils/post_process_layer.py
+++ b/mmseg/models/utils/post_process_layer.py
@@ -52,15 +52,3 @@ def _make_readout_ops(channels, out_channels, readout_type, start_index):
             'add' or 'project', but got {readout_type}"
 
     return readout_ops
-
-
-class Transpose(nn.Module):
-
-    def __init__(self, dim0, dim1):
-        super(Transpose, self).__init__()
-        self.dim0 = dim0
-        self.dim1 = dim1
-
-    def forward(self, x):
-        x = x.transpose(self.dim0, self.dim1)
-        return x

From 2efb2eb0476e9b7624089b1c588944a45efc1c0d Mon Sep 17 00:00:00 2001
From: xiexinch
Date: Sat, 19 Jun 2021 15:47:26 +0800
Subject: [PATCH 06/38] support NLC output shape

---
 mmseg/models/backbones/vit.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py
index 774f555c49..ed48bdc48b 100644
--- a/mmseg/models/backbones/vit.py
+++ b/mmseg/models/backbones/vit.py
@@ -187,6 +187,8 @@ class VisionTransformer(BaseModule):
             Defalut: dict(type='GELU').
         final_norm (bool): Whether to add a additional layer to normalize
             final feature map. Default: False.
+        out_shape (str): Select the output format of feature information.
+            Default: NCHW.
         interpolate_mode (str): Select the interpolate mode for position
             embeding vector resize. Default: bicubic.
         num_fcs (int): The number of fully-connected layers for FFNs.
@@ -217,6 +219,7 @@ def __init__(self,
                  norm_cfg=dict(type='LN'),
                  act_cfg=dict(type='GELU'),
                  final_norm=False,
+                 out_shape='NCHW',
                  interpolate_mode='bicubic',
                  num_fcs=2,
                  norm_eval=False,
@@ -234,8 +237,10 @@ def __init__(self,
             f'but got {len(img_size)}'
 
         assert pretrain_style in ['timm', 'mmcls']
-
+        assert out_shape in ['NLC',
+                             'NCHW'], 'output shape must be "NLC" or "NCHW".'
self.pretrain_style = pretrain_style + self.out_shape = out_shape self.img_size = img_size self.patch_size = patch_size @@ -431,15 +436,18 @@ def forward(self, inputs): if self.final_norm: x = self.norm1(x) if i in self.out_indices: - if self.with_cls_token: - # Remove class token and reshape token for decoder head - out = x[:, 1:] + if self.out_shape == 'NCHW': + if self.with_cls_token: + # Remove class token and reshape token for decoder head + out = x[:, 1:] + else: + out = x + B, _, C = out.shape + out = out.reshape(B, inputs.shape[2] // self.patch_size, + inputs.shape[3] // self.patch_size, + C).permute(0, 3, 1, 2) else: out = x - B, _, C = out.shape - out = out.reshape(B, inputs.shape[2] // self.patch_size, - inputs.shape[3] // self.patch_size, - C).permute(0, 3, 1, 2) outs.append(out) return tuple(outs) From 5f877e1ff920f5510fe50819acfa7aa48309d807 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E6=98=95=E8=BE=B0?= Date: Tue, 22 Jun 2021 11:32:01 +0800 Subject: [PATCH 07/38] Delete post_process_layer.py --- mmseg/models/utils/post_process_layer.py | 54 ------------------------ 1 file changed, 54 deletions(-) delete mode 100644 mmseg/models/utils/post_process_layer.py diff --git a/mmseg/models/utils/post_process_layer.py b/mmseg/models/utils/post_process_layer.py deleted file mode 100644 index f341c43f9b..0000000000 --- a/mmseg/models/utils/post_process_layer.py +++ /dev/null @@ -1,54 +0,0 @@ -import torch -import torch.nn as nn - - -class Readout(nn.Module): - - def __init__(self, start_index=1): - super(Readout, self).__init__() - self.start_index = start_index - - -class Slice(Readout): - - def forward(self, x): - return x[:, self.start_index:] - - -class AddReadout(Readout): - - def forward(self, x): - if self.start_index == 2: - readout = (x[:, 0] + x[:, 1]) / 2 - else: - readout = x[:, 0] - return x[:, self.start_index:] + readout.unsqueeze(1) - - -class ProjectReadout(Readout): - - def __init__(self, in_channels, start_index=1): - super().__init__(start_index=start_index) - self.project = nn.Sequential( - nn.Linear(2 * in_channels, in_channels), nn.GELU) - - def forward(self, x): - readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) - features = torch.cat((x[:, self.start_index:], readout), -1) - return self.project(features) - - -def _make_readout_ops(channels, out_channels, readout_type, start_index): - if readout_type == 'ignore': - readout_ops = [Slice(start_index) for _ in out_channels] - elif readout_type == 'add': - readout_ops = [AddReadout(start_index) for _ in out_channels] - elif readout_type == 'project': - readout_ops = [ - ProjectReadout(channels, start_index) for _ in out_channels - ] - else: - assert f"unexpected readout operation type, expected 'ignore',\ - 'add' or 'project', but got {readout_type}" - - return readout_ops From 5ce02d32aa83469b00aedf17f380acf840ce410c Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 22 Jun 2021 19:26:27 +0800 Subject: [PATCH 08/38] add unittest and docstring --- mmseg/models/backbones/vit.py | 9 +- mmseg/models/decode_heads/dpt.py | 151 +++++++++++++----- mmseg/models/utils/post_process_layer.py | 54 ------- tests/test_models/test_heads/test_dpt_head.py | 26 +++ 4 files changed, 146 insertions(+), 94 deletions(-) delete mode 100644 mmseg/models/utils/post_process_layer.py create mode 100644 tests/test_models/test_heads/test_dpt_head.py diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py index ed48bdc48b..e367f9b878 100644 --- a/mmseg/models/backbones/vit.py +++ 
b/mmseg/models/backbones/vit.py @@ -224,7 +224,8 @@ def __init__(self, num_fcs=2, norm_eval=False, with_cp=False, - pretrain_style='timm'): + pretrain_style='timm', + with_image_shape=False): super(VisionTransformer, self).__init__() if isinstance(img_size, int): @@ -294,6 +295,7 @@ def __init__(self, self.norm_eval = norm_eval self.with_cp = with_cp + self.with_image_shape = with_image_shape @property def norm1(self): @@ -447,7 +449,10 @@ def forward(self, inputs): inputs.shape[3] // self.patch_size, C).permute(0, 3, 1, 2) else: - out = x + if self.with_image_shape: + out = [x, [inputs.shape[2], inputs.shape[3]]] + else: + out = x outs.append(out) return tuple(outs) diff --git a/mmseg/models/decode_heads/dpt.py b/mmseg/models/decode_heads/dpt.py index 11e0c8cec7..cf8b27199c 100644 --- a/mmseg/models/decode_heads/dpt.py +++ b/mmseg/models/decode_heads/dpt.py @@ -7,36 +7,95 @@ from mmseg.ops import resize from ..builder import HEADS -from ..utils import _make_readout_ops from .decode_head import BaseDecodeHead class ViTPostProcessBlock(nn.Module): + """ViTPostProcessBlock, process cls_token in ViT backbone output and resize + the feature vector to feature map. + + Args: + in_channels (int): ViT feature channels. Default: 768. + out_channels (List): output channels of each stage. + Default: [96, 192, 384, 768]. + readout_type (str): Type of readout operation. Default: 'ignore'. + start_index (int): Start index of feature vector. Default: 1. + patch_size (int): The patch size. Default: 16. + """ def __init__(self, in_channels=768, out_channels=[96, 192, 384, 768], - img_size=[384, 384], readout_type='ignore', start_index=1, - scale_factors=[4, 2, 1, 0.5]): + patch_size=16): super(ViTPostProcessBlock, self).__init__() - self.readout_ops = _make_readout_ops(in_channels, out_channels, - readout_type, start_index) - - self.unflatten_size = torch.Size( - [img_size[0] // 16, img_size[1] // 16]) - - def forward(self, inputs): - assert len(inputs) == len(self.readout_ops) + assert readout_type in ['ignore', 'add', 'project'] + self.readout_type = readout_type + self.start_index = start_index + self.patch_size = patch_size + + self.convs = [ + ConvModule( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + ) for out_channel in out_channels + ] + self.resize_layers = nn.ModuleList([ + nn.ConvTranspose2d( + in_channels=out_channels[0], + out_channels=out_channels[0], + kernel_size=4, + stride=4, + padding=0), + nn.ConvTranspose2d( + in_channels=out_channels[1], + out_channels=out_channels[1], + kernel_size=2, + stride=2, + padding=0), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], + out_channels=out_channels[3], + kernel_size=3, + stride=2, + padding=1) + ]) + + def forward(self, inputs, img_size): + for i, x in enumerate(inputs): + if self.readout_type == 'ignore': + x = x[:, self.start_index:] + elif self.readout_type == 'add': + x = x[:, self.start_index] + x[:, 0].unsqueeze(1) + else: + readout = x[:, 0].unsqueeze(1).expand_as(x[:, + self.start_index]) + x = torch.cat((x[:, self.start_indx], readout), -1) + B, _, C = x.shape + x = x.reshape(B, img_size[0] // self.patch_size, + img_size[1] // self.patch_size, + C).permute(0, 3, 1, 2) + x = self.convs[i](x) + x = self.resize_layers[i](x) + inputs[i] = x return inputs class ResidualConvUnit(nn.Module): + """ResidualConvUnit, pre-activate residual unit. + + Args: + in_channels (int): Input channels. + act_cfg (dict): The activation config before conv. + norm_cfg (dict): Config dict for normalization layer. 
+ """ - def __init__(self, in_channels, act_cfg=None, norm_cfg=None): + def __init__(self, in_channels, act_cfg, norm_cfg): super(ResidualConvUnit, self).__init__() self.channels = in_channels @@ -77,19 +136,28 @@ def forward(self, inputs): class FeatureFusionBlock(nn.Module): + """FeatureFusionBlock, merge feature map from different stage. + + Args: + in_channels (int): Input channels. + act_cfg (dict): The activation config for ResidualConvUnit. + norm_cfg (dict): Config dict for normalization layer. + expand (bool): Whether expand the channels in post process block. + Default: False. + align_corners (bool): align_corner setting for bilinear upsample. + Default: True. + """ def __init__(self, in_channels, - act_cfg=None, - norm_cfg=None, - deconv=False, + act_cfg, + norm_cfg, expand=False, align_corners=True): super(FeatureFusionBlock, self).__init__() self.in_channels = in_channels self.expand = expand - self.deconv = deconv self.align_corners = align_corners self.out_channels = in_channels @@ -124,16 +192,25 @@ class DPTHead(BaseDecodeHead): This head is implemented of `DPT `_. Args: + embed_dims (int): The embed dimension of the ViT backbone. + post_process_channels (List): Out channels of post process conv + layers. Default: [96, 192, 384, 768]. + readout_type (str): Type of readout operation. Default: 'ignore'. + patch_start_index (int): Start index of feature vector. + patch_size (int): The patch size. Default: 16. + expand_channels (bool): Whether expand the channels in post process + block. Default: False. + act_cfg (dict): The activation config for residual conv unit. + Defalut 'ReLU'. + norm_cfg (dict): Config dict for normalization layer. Default 'BN'. """ def __init__(self, - img_size=[384, 384], - out_channels=[96, 192, 384, 768], + embed_dims=768, + post_process_channels=[96, 192, 384, 768], readout_type='ignore', patch_start_index=1, - post_process_kernel_size=[4, 2, 1, 3], - post_process_strides=[4, 2, 1, 2], - post_process_paddings=[0, 0, 0, 1], + patch_size=16, expand_channels=False, act_cfg=dict(type='ReLU'), norm_cfg=dict(type='BN'), @@ -141,22 +218,21 @@ def __init__(self, super(DPTHead, self).__init__(**kwards) self.in_channels = self.in_channels - self.out_channels = out_channels self.expand_channels = expand_channels - self.post_process_block = ViTPostProcessBlock( - self.channels, out_channels, img_size, readout_type, - patch_start_index, post_process_kernel_size, post_process_strides, - post_process_paddings) - - out_channels = [ - channel * math.pow(2, idx) if expand_channels else channel - for idx, channel in enumerate(self.out_channels) + self.post_process_block = ViTPostProcessBlock(embed_dims, + post_process_channels, + readout_type, + patch_start_index, + patch_size) + + self.post_process_channels = [ + channel * math.pow(2, i) if expand_channels else channel + for i, channel in enumerate(post_process_channels) ] - self.convs = [] - for idx, channel in enumerate(self.out_channels): + self.convs = nn.ModuleList() + for _, channel in enumerate(self.post_process_channels): self.convs.append( - Conv2d( - channel, self.out_channels[idx], kernel_size=3, padding=1)) + Conv2d(channel, self.channels, kernel_size=3, padding=1)) self.refinenet0 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg) self.refinenet1 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg) @@ -168,15 +244,14 @@ def __init__(self, def forward(self, inputs): x = self._transform_inputs(inputs) - x = self.post_process_block(x) - - x = [self.convs[idx](feature) for idx, feature in 
enumerate(x)] + x, img_size = [i[0] for i in x], x[0][1] + x = self.post_process_block(x, img_size) + x = [self.convs[i](feature) for i, feature in enumerate(x)] path_3 = self.refinenet3(x[3]) path_2 = self.refinenet2(path_3, x[2]) path_1 = self.refinenet1(path_2, x[1]) path_0 = self.refinenet0(path_1, x[0]) - x = self.conv(path_0) output = self.cls_seg(x) return output diff --git a/mmseg/models/utils/post_process_layer.py b/mmseg/models/utils/post_process_layer.py deleted file mode 100644 index f341c43f9b..0000000000 --- a/mmseg/models/utils/post_process_layer.py +++ /dev/null @@ -1,54 +0,0 @@ -import torch -import torch.nn as nn - - -class Readout(nn.Module): - - def __init__(self, start_index=1): - super(Readout, self).__init__() - self.start_index = start_index - - -class Slice(Readout): - - def forward(self, x): - return x[:, self.start_index:] - - -class AddReadout(Readout): - - def forward(self, x): - if self.start_index == 2: - readout = (x[:, 0] + x[:, 1]) / 2 - else: - readout = x[:, 0] - return x[:, self.start_index:] + readout.unsqueeze(1) - - -class ProjectReadout(Readout): - - def __init__(self, in_channels, start_index=1): - super().__init__(start_index=start_index) - self.project = nn.Sequential( - nn.Linear(2 * in_channels, in_channels), nn.GELU) - - def forward(self, x): - readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) - features = torch.cat((x[:, self.start_index:], readout), -1) - return self.project(features) - - -def _make_readout_ops(channels, out_channels, readout_type, start_index): - if readout_type == 'ignore': - readout_ops = [Slice(start_index) for _ in out_channels] - elif readout_type == 'add': - readout_ops = [AddReadout(start_index) for _ in out_channels] - elif readout_type == 'project': - readout_ops = [ - ProjectReadout(channels, start_index) for _ in out_channels - ] - else: - assert f"unexpected readout operation type, expected 'ignore',\ - 'add' or 'project', but got {readout_type}" - - return readout_ops diff --git a/tests/test_models/test_heads/test_dpt_head.py b/tests/test_models/test_heads/test_dpt_head.py new file mode 100644 index 0000000000..5f5cd84e5e --- /dev/null +++ b/tests/test_models/test_heads/test_dpt_head.py @@ -0,0 +1,26 @@ +import pytest +import torch + +from mmseg.models.decode_heads import DPTHead + + +def test_dpt_head(): + + with pytest.raises(AssertionError): + # input_transform must be 'multiple_select' + head = DPTHead( + in_channels=[768, 768, 768, 768], + channels=256, + num_classes=19, + in_index=[0, 1, 2, 3]) + + head = DPTHead( + in_channels=[768, 768, 768, 768], + channels=256, + num_classes=19, + in_index=[0, 1, 2, 3], + input_transform='multiple_select') + + inputs = [[torch.randn(4, 5, 768), [32, 32]] for _ in range(4)] + output = head(inputs) + assert output.shape == torch.Size((4, 19, 16, 16)) From 31c42bdb84f4cfe7ecf0ec3632ae3c12c761dc25 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 5 Jul 2021 17:27:56 +0800 Subject: [PATCH 09/38] rename variables --- mmseg/models/backbones/vit.py | 8 +- mmseg/models/decode_heads/__init__.py | 2 +- .../decode_heads/{dpt.py => dpt_head.py} | 159 +++++++++--------- 3 files changed, 88 insertions(+), 81 deletions(-) rename mmseg/models/decode_heads/{dpt.py => dpt_head.py} (63%) diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py index f7bdfdfa0b..b53c51bf55 100644 --- a/mmseg/models/backbones/vit.py +++ b/mmseg/models/backbones/vit.py @@ -141,6 +141,8 @@ class VisionTransformer(BaseModule): some memory while slowing down the training 
speed. Default: False. pretrain_style (str): Choose to use timm or mmcls pretrain weights. Default: timm. + with_spatial_size (bool): Whether append input image shape to output + feature vector when out_shape is not 'NCHW'. Default: False. pretrained (str, optional): model pretrained path. Default: None. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. @@ -170,7 +172,7 @@ def __init__(self, norm_eval=False, with_cp=False, pretrain_style='timm', - with_image_shape=False, + with_spatial_size=False, pretrained=None, init_cfg=None): super(VisionTransformer, self).__init__() @@ -204,7 +206,7 @@ def __init__(self, self.pretrain_style = pretrain_style self.pretrained = pretrained self.init_cfg = init_cfg - self.with_image_shape = with_image_shape + self.with_spatial_size = with_spatial_size self.patch_embed = PatchEmbed( in_channels=in_channels, @@ -413,7 +415,7 @@ def forward(self, inputs): inputs.shape[3] // self.patch_size, C).permute(0, 3, 1, 2) else: - if self.with_image_shape: + if self.with_spatial_size: out = [x, [inputs.shape[2], inputs.shape[3]]] else: out = x diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py index 52ef540cf9..f837fabb32 100644 --- a/mmseg/models/decode_heads/__init__.py +++ b/mmseg/models/decode_heads/__init__.py @@ -5,7 +5,7 @@ from .da_head import DAHead from .dm_head import DMHead from .dnl_head import DNLHead -from .dpt import DPTHead +from .dpt_head import DPTHead from .ema_head import EMAHead from .enc_head import EncHead from .fcn_head import FCNHead diff --git a/mmseg/models/decode_heads/dpt.py b/mmseg/models/decode_heads/dpt_head.py similarity index 63% rename from mmseg/models/decode_heads/dpt.py rename to mmseg/models/decode_heads/dpt_head.py index cf8b27199c..6c4e1db647 100644 --- a/mmseg/models/decode_heads/dpt.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -2,17 +2,18 @@ import torch import torch.nn as nn -from mmcv.cnn import (Conv2d, ConvModule, build_activation_layer, +from mmcv.cnn import (ConvModule, build_activation_layer, build_conv_layer, build_norm_layer) +from mmcv.runner import BaseModule from mmseg.ops import resize from ..builder import HEADS from .decode_head import BaseDecodeHead -class ViTPostProcessBlock(nn.Module): - """ViTPostProcessBlock, process cls_token in ViT backbone output and resize - the feature vector to feature map. +class ReassembleBlocks(BaseModule): + """ViTPostProcessBlock, process cls_token in ViT backbone output and + rearrange the feature vector to feature map. Args: in_channels (int): ViT feature channels. Default: 768. 
@@ -29,14 +30,14 @@ def __init__(self, readout_type='ignore', start_index=1, patch_size=16): - super(ViTPostProcessBlock, self).__init__() + super(ReassembleBlocks, self).__init__() assert readout_type in ['ignore', 'add', 'project'] self.readout_type = readout_type self.start_index = start_index self.patch_size = patch_size - self.convs = [ + self.projects = [ ConvModule( in_channels=in_channels, out_channels=out_channel, @@ -71,71 +72,76 @@ def forward(self, inputs, img_size): if self.readout_type == 'ignore': x = x[:, self.start_index:] elif self.readout_type == 'add': - x = x[:, self.start_index] + x[:, 0].unsqueeze(1) + x = x[:, self.start_index:] + x[:, 0].unsqueeze(1) else: readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index]) - x = torch.cat((x[:, self.start_indx], readout), -1) + x = torch.cat((x[:, self.start_indx:], readout), -1) B, _, C = x.shape x = x.reshape(B, img_size[0] // self.patch_size, img_size[1] // self.patch_size, C).permute(0, 3, 1, 2) - x = self.convs[i](x) + x = self.projects[i](x) x = self.resize_layers[i](x) inputs[i] = x return inputs -class ResidualConvUnit(nn.Module): - """ResidualConvUnit, pre-activate residual unit. +class PreActResidualConvUnit(BaseModule): + """ResidualConvUnit, pre-activate residual unit.""" - Args: - in_channels (int): Input channels. - act_cfg (dict): The activation config before conv. - norm_cfg (dict): Config dict for normalization layer. - """ - - def __init__(self, in_channels, act_cfg, norm_cfg): - super(ResidualConvUnit, self).__init__() - self.channels = in_channels - - self.activation = build_activation_layer(act_cfg) - self.bn = False if norm_cfg is None else True - self.bias = not self.bn - - self.conv1 = ConvModule( - self.channels, - self.channels, - kernel_size=3, - padding=1, - bias=self.bias) - - self.conv2 = ConvModule( - self.channels, - self.channels, - kernel_size=3, - padding=1, - bias=self.bias) - - if self.bn: - _, self.bn1 = build_norm_layer(norm_cfg, self.channels) - _, self.bn2 = build_norm_layer(norm_cfg, self.channels) + def __init__(self, + in_channels, + act_cfg, + norm_cfg, + conv_cfg=None, + stride=1, + dilation=1, + init_cfg=None): + super(PreActResidualConvUnit, self).__init__(init_cfg) + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, in_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, in_channels, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + in_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, in_channels, in_channels, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + self.activate = build_activation_layer(act_cfg) + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) def forward(self, inputs): - x = self.activation(inputs) + x = self.activate(inputs) x = self.conv1(x) - if self.bn: - x = self.bn1(x) - - x = self.activation(x) + x = self.norm1(x) + x = self.activate(x) x = self.conv2(x) - if self.bn: - x = self.bn2(x) + x = self.norm2(x) return x + inputs -class FeatureFusionBlock(nn.Module): +class FeatureFusionBlock(BaseModule): """FeatureFusionBlock, merge feature map from different stage. 
Args: @@ -164,13 +170,13 @@ def __init__(self, if self.expand: self.out_channels = in_channels // 2 - self.out_conv = Conv2d( + self.project = ConvModule( self.in_channels, self.out_channels, kernel_size=1) - self.res_conv_unit1 = ResidualConvUnit(self.in_channels, act_cfg, - norm_cfg) - self.res_conv_unit2 = ResidualConvUnit(self.in_channels, act_cfg, - norm_cfg) + self.res_conv_unit1 = PreActResidualConvUnit( + in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.res_conv_unit2 = PreActResidualConvUnit( + in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) def forward(self, *inputs): x = inputs[0] @@ -182,7 +188,7 @@ def forward(self, *inputs): scale_factor=2, mode='bilinear', align_corners=self.align_corners) - return self.out_conv(x) + return self.project(x) @HEADS.register_module() @@ -214,16 +220,16 @@ def __init__(self, expand_channels=False, act_cfg=dict(type='ReLU'), norm_cfg=dict(type='BN'), - **kwards): - super(DPTHead, self).__init__(**kwards) + **kwargs): + super(DPTHead, self).__init__(**kwargs) self.in_channels = self.in_channels self.expand_channels = expand_channels - self.post_process_block = ViTPostProcessBlock(embed_dims, - post_process_channels, - readout_type, - patch_start_index, - patch_size) + self.reassemble_blocks = ReassembleBlocks(embed_dims, + post_process_channels, + readout_type, + patch_start_index, + patch_size) self.post_process_channels = [ channel * math.pow(2, i) if expand_channels else channel @@ -232,26 +238,25 @@ def __init__(self, self.convs = nn.ModuleList() for _, channel in enumerate(self.post_process_channels): self.convs.append( - Conv2d(channel, self.channels, kernel_size=3, padding=1)) + ConvModule(channel, self.channels, kernel_size=3, padding=1)) - self.refinenet0 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg) - self.refinenet1 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg) - self.refinenet2 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg) - self.refinenet3 = FeatureFusionBlock(self.channels, act_cfg, norm_cfg) + self.fusion_blocks = nn.ModuleList() + for _ in range(len(self.convs)): + self.fusion_blocks.append( + FeatureFusionBlock(self.channels, act_cfg, norm_cfg)) - self.conv = ConvModule( + self.project = ConvModule( self.channels, self.channels, kernel_size=3, padding=1) def forward(self, inputs): x = self._transform_inputs(inputs) x, img_size = [i[0] for i in x], x[0][1] - x = self.post_process_block(x, img_size) + x = self.reassemble_blocks(x, img_size) x = [self.convs[i](feature) for i, feature in enumerate(x)] - path_3 = self.refinenet3(x[3]) - path_2 = self.refinenet2(path_3, x[2]) - path_1 = self.refinenet1(path_2, x[1]) - path_0 = self.refinenet0(path_1, x[0]) - x = self.conv(path_0) - output = self.cls_seg(x) - return output + out = self.fusion_blocks[3](x[3]) + for i in range(2, -1, -1): + out = self.fusion_blocks[i](out, x[i]) + out = self.project(out) + out = self.cls_seg(out) + return out From bf900b6161e6351f1c4138b20454f502ccc1ffd0 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 5 Jul 2021 18:07:25 +0800 Subject: [PATCH 10/38] fix project error and add unittest --- mmseg/models/decode_heads/dpt_head.py | 11 ++++++++-- tests/test_models/test_heads/test_dpt_head.py | 21 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 6c4e1db647..5b6a37fdfc 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -66,6 +66,12 
@@ def __init__(self, stride=2, padding=1) ]) + if self.readout_type == 'project': + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append( + nn.Sequential( + nn.Linear(2 * in_channels, in_channels), nn.GELU())) def forward(self, inputs, img_size): for i, x in enumerate(inputs): @@ -75,8 +81,9 @@ def forward(self, inputs, img_size): x = x[:, self.start_index:] + x[:, 0].unsqueeze(1) else: readout = x[:, 0].unsqueeze(1).expand_as(x[:, - self.start_index]) - x = torch.cat((x[:, self.start_indx:], readout), -1) + self.start_index:]) + x = torch.cat((x[:, self.start_index:], readout), -1) + x = self.readout_projects[i](x) B, _, C = x.shape x = x.reshape(B, img_size[0] // self.patch_size, img_size[1] // self.patch_size, diff --git a/tests/test_models/test_heads/test_dpt_head.py b/tests/test_models/test_heads/test_dpt_head.py index 5f5cd84e5e..1c730c61c2 100644 --- a/tests/test_models/test_heads/test_dpt_head.py +++ b/tests/test_models/test_heads/test_dpt_head.py @@ -24,3 +24,24 @@ def test_dpt_head(): inputs = [[torch.randn(4, 5, 768), [32, 32]] for _ in range(4)] output = head(inputs) assert output.shape == torch.Size((4, 19, 16, 16)) + + # test readout operation + head = DPTHead( + in_channels=[768, 768, 768, 768], + channels=256, + num_classes=19, + in_index=[0, 1, 2, 3], + input_transform='multiple_select', + readout_type='add') + output = head(inputs) + assert output.shape == torch.Size((4, 19, 16, 16)) + + head = DPTHead( + in_channels=[768, 768, 768, 768], + channels=256, + num_classes=19, + in_index=[0, 1, 2, 3], + input_transform='multiple_select', + readout_type='project') + output = head(inputs) + assert output.shape == torch.Size((4, 19, 16, 16)) From 716863bfa9d644d39d5daa7f4761d211e4b2e4ba Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 6 Jul 2021 17:23:00 +0800 Subject: [PATCH 11/38] match dpt weights --- mmseg/models/decode_heads/dpt_head.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 5b6a37fdfc..ce19073133 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -37,13 +37,13 @@ def __init__(self, self.start_index = start_index self.patch_size = patch_size - self.projects = [ + self.projects = nn.ModuleList([ ConvModule( in_channels=in_channels, out_channels=out_channel, kernel_size=1, ) for out_channel in out_channels - ] + ]) self.resize_layers = nn.ModuleList([ nn.ConvTranspose2d( @@ -245,7 +245,12 @@ def __init__(self, self.convs = nn.ModuleList() for _, channel in enumerate(self.post_process_channels): self.convs.append( - ConvModule(channel, self.channels, kernel_size=3, padding=1)) + ConvModule( + channel, + self.channels, + kernel_size=3, + padding=1, + bias=False)) self.fusion_blocks = nn.ModuleList() for _ in range(len(self.convs)): @@ -253,7 +258,11 @@ def __init__(self, FeatureFusionBlock(self.channels, act_cfg, norm_cfg)) self.project = ConvModule( - self.channels, self.channels, kernel_size=3, padding=1) + self.channels, + self.channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg) def forward(self, inputs): x = self._transform_inputs(inputs) From 94bf93563891090514fad444313b3b5bb249ad81 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 6 Jul 2021 17:37:28 +0800 Subject: [PATCH 12/38] add configs --- configs/_base_/models/dpt_vit-l16.py | 43 +++++++++++++++++++ .../dpt/dpt_vit-l16_512x512_160k_ade20k.py | 7 +++ 2 files changed, 50 
insertions(+) create mode 100644 configs/_base_/models/dpt_vit-l16.py create mode 100644 configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-l16.py new file mode 100644 index 0000000000..534c0ade1a --- /dev/null +++ b/configs/_base_/models/dpt_vit-l16.py @@ -0,0 +1,43 @@ +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz', # noqa + backbone=dict( + type='VisionTransformer', + img_size=384, + embed_dims=1024, + num_heads=16, + num_layers=24, + out_indices=(2, 5, 8, 11), + out_shape='NLC', + final_norm=True, + with_spatial_size=True), + decode_head=dict( + type='DPTHead', + in_channels=(1024, 1024, 1024, 1024), + channels=256, + embed_dims=1024, + post_process_channels=[256, 512, 1024, 1024], + num_classes=19, + readout_type='project', + input_transform='multiple_select', + in_index=(0, 1, 2, 3), + norm_cfg=norm_cfg, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=768, + in_index=3, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) # yapf: disable diff --git a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py new file mode 100644 index 0000000000..d3e45dfd04 --- /dev/null +++ b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py @@ -0,0 +1,7 @@ +_base_ = [ + '../_base_/models/dpt_vit-16.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +model = dict( + decode_head=dict(num_classes=150), auxiliary_head=dict(num_classes=150)) From d4cd92408fc324828b33df89a041f1ea6f4ba4fb Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 7 Jul 2021 15:58:23 +0800 Subject: [PATCH 13/38] fix vit pos_embed bug and dpt feature fusion bug --- configs/_base_/models/dpt_vit-l16.py | 2 +- mmseg/models/backbones/vit.py | 57 ++++++++++++++++++++++----- mmseg/models/decode_heads/dpt_head.py | 9 ++++- mmseg/models/utils/embed.py | 2 + 4 files changed, 58 insertions(+), 12 deletions(-) diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-l16.py index 534c0ade1a..00eee459d3 100644 --- a/configs/_base_/models/dpt_vit-l16.py +++ b/configs/_base_/models/dpt_vit-l16.py @@ -8,7 +8,7 @@ embed_dims=1024, num_heads=16, num_layers=24, - out_indices=(2, 5, 8, 11), + out_indices=(5, 11, 17, 23), out_shape='NLC', final_norm=True, with_spatial_size=True), diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py index b53c51bf55..45995706ff 100644 --- a/mmseg/models/backbones/vit.py +++ b/mmseg/models/backbones/vit.py @@ -4,8 +4,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import (build_norm_layer, constant_init, kaiming_init, - normal_init, trunc_normal_init) +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + kaiming_init, normal_init, trunc_normal_init) from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention from mmcv.runner import BaseModule, ModuleList, _load_checkpoint from 
torch.nn.modules.batchnorm import _BatchNorm @@ -13,7 +13,7 @@ from mmseg.utils import get_root_logger from ..builder import BACKBONES -from ..utils import PatchEmbed, vit_convert +from ..utils import vit_convert class TransformerEncoderLayer(BaseModule): @@ -93,6 +93,49 @@ def forward(self, x): return x +# Modified from pytorch-image-models +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + Args: + patch_size (int): The size of one patch + in_channels (int): The num of input channels. + embed_dims (int): The dimensions of embedding. + norm_cfg (dict, optional): Config dict for normalization layer. + conv_cfg (dict, optional): The config dict for conv layers. + Default: None. + """ + + def __init__(self, + patch_size=16, + in_channels=3, + embed_dims=768, + norm_cfg=None, + conv_cfg=None): + super(PatchEmbed, self).__init__() + + # Use conv layer to embed + self.projection = build_conv_layer( + conv_cfg, + in_channels, + embed_dims, + kernel_size=patch_size, + stride=patch_size) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + def forward(self, x): + x = self.projection(x).flatten(2).transpose(1, 2) + + if self.norm is not None: + x = self.norm(x) + + return x + + @BACKBONES.register_module() class VisionTransformer(BaseModule): """Vision Transformer. @@ -209,14 +252,10 @@ def __init__(self, self.with_spatial_size = with_spatial_size self.patch_embed = PatchEmbed( + patch_size=patch_size, in_channels=in_channels, embed_dims=embed_dims, - conv_type='Conv2d', - kernel_size=patch_size, - stride=patch_size, - norm_cfg=norm_cfg if patch_norm else None, - init_cfg=None, - ) + norm_cfg=norm_cfg if patch_norm else None) num_patches = (img_size[0] // patch_size) * \ (img_size[1] // patch_size) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index ce19073133..695317f4a1 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -144,7 +144,6 @@ def forward(self, inputs): x = self.activate(x) x = self.conv2(x) x = self.norm2(x) - return x + inputs @@ -188,7 +187,13 @@ def __init__(self, def forward(self, *inputs): x = inputs[0] if len(inputs) == 2: - x = x + self.res_conv_unit1(inputs[1]) + if x.shape != inputs[1].shape: + x_ = resize( + inputs[1], + size=(x.shape[2], x.shape[3]), + mode='bilinear', + align_corners=False) + x = x + self.res_conv_unit1(x_) x = self.res_conv_unit2(x) x = resize( x, diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py index 3bbb45b37a..48aa9262dd 100644 --- a/mmseg/models/utils/embed.py +++ b/mmseg/models/utils/embed.py @@ -73,12 +73,14 @@ def __init__(self, def forward(self, x): H, W = x.shape[2], x.shape[3] + print(f'img size {x.shape}') if H % self.patch_size[0] != 0: x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) if W % self.patch_size[1] != 0: x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) + print(f'after pad {x.shape}') x = self.projection(x) self.DH, self.DW = x.shape[2], x.shape[3] x = x.flatten(2).transpose(1, 2) From f147aa9e7aa7bb31e137b4cfdd9a2dfe8d5fa822 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 20 Jul 2021 14:56:08 +0800 Subject: [PATCH 14/38] match vit output --- mmseg/models/backbones/vit.py | 54 ++----------------- mmseg/models/decode_heads/dpt_head.py | 76 +++++++++++++++------------ 2 files changed, 46 insertions(+), 84 deletions(-) diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py index 
b5432263a1..021bf09331 100644 --- a/mmseg/models/backbones/vit.py +++ b/mmseg/models/backbones/vit.py @@ -4,8 +4,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, - kaiming_init, normal_init, trunc_normal_init) +from mmcv.cnn import (build_norm_layer, constant_init, kaiming_init, + normal_init, trunc_normal_init) from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention from mmcv.runner import BaseModule, ModuleList, _load_checkpoint from torch.nn.modules.batchnorm import _BatchNorm @@ -13,7 +13,7 @@ from mmseg.utils import get_root_logger from ..builder import BACKBONES -from ..utils import vit_convert +from ..utils import PatchEmbed, vit_convert class TransformerEncoderLayer(BaseModule): @@ -93,49 +93,6 @@ def forward(self, x): return x -# Modified from pytorch-image-models -class PatchEmbed(BaseModule): - """Image to Patch Embedding. - - Args: - patch_size (int): The size of one patch - in_channels (int): The num of input channels. - embed_dims (int): The dimensions of embedding. - norm_cfg (dict, optional): Config dict for normalization layer. - conv_cfg (dict, optional): The config dict for conv layers. - Default: None. - """ - - def __init__(self, - patch_size=16, - in_channels=3, - embed_dims=768, - norm_cfg=None, - conv_cfg=None): - super(PatchEmbed, self).__init__() - - # Use conv layer to embed - self.projection = build_conv_layer( - conv_cfg, - in_channels, - embed_dims, - kernel_size=patch_size, - stride=patch_size) - - if norm_cfg is not None: - self.norm = build_norm_layer(norm_cfg, embed_dims)[1] - else: - self.norm = None - - def forward(self, x): - x = self.projection(x).flatten(2).transpose(1, 2) - - if self.norm is not None: - x = self.norm(x) - - return x - - @BACKBONES.register_module() class VisionTransformer(BaseModule): """Vision Transformer. @@ -184,8 +141,6 @@ class VisionTransformer(BaseModule): some memory while slowing down the training speed. Default: False. pretrain_style (str): Choose to use timm or mmcls pretrain weights. Default: timm. - with_spatial_size (bool): Whether append input image shape to output - feature vector when out_shape is not 'NCHW'. Default: False. pretrained (str, optional): model pretrained path. Default: None. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. @@ -215,7 +170,6 @@ def __init__(self, norm_eval=False, with_cp=False, pretrain_style='timm', - with_spatial_size=False, pretrained=None, init_cfg=None): super(VisionTransformer, self).__init__() @@ -249,10 +203,8 @@ def __init__(self, self.pretrain_style = pretrain_style self.pretrained = pretrained self.init_cfg = init_cfg - self.with_spatial_size = with_spatial_size self.patch_embed = PatchEmbed( - patch_size=patch_size, in_channels=in_channels, embed_dims=embed_dims, conv_type='Conv2d', diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 695317f4a1..f502825a4b 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -20,7 +20,6 @@ class ReassembleBlocks(BaseModule): out_channels (List): output channels of each stage. Default: [96, 192, 384, 768]. readout_type (str): Type of readout operation. Default: 'ignore'. - start_index (int): Start index of feature vector. Default: 1. patch_size (int): The patch size. Default: 16. 
""" @@ -28,13 +27,11 @@ def __init__(self, in_channels=768, out_channels=[96, 192, 384, 768], readout_type='ignore', - start_index=1, patch_size=16): super(ReassembleBlocks, self).__init__() assert readout_type in ['ignore', 'add', 'project'] self.readout_type = readout_type - self.start_index = start_index self.patch_size = patch_size self.projects = nn.ModuleList([ @@ -73,29 +70,41 @@ def __init__(self, nn.Sequential( nn.Linear(2 * in_channels, in_channels), nn.GELU())) - def forward(self, inputs, img_size): + def forward(self, inputs): + assert isinstance(inputs, list) + out = [] for i, x in enumerate(inputs): - if self.readout_type == 'ignore': - x = x[:, self.start_index:] + x, cls_token = x[0], x[1] + feature_shape = x.shape + if self.readout_type == 'project': + x = x.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + x = x.permute(0, 2, 1).reshape(feature_shape) elif self.readout_type == 'add': - x = x[:, self.start_index:] + x[:, 0].unsqueeze(1) + x = x.flatten(2) + cls_token.unsqueeze(-1) + x = x.reshape(feature_shape) else: - readout = x[:, 0].unsqueeze(1).expand_as(x[:, - self.start_index:]) - x = torch.cat((x[:, self.start_index:], readout), -1) - x = self.readout_projects[i](x) - B, _, C = x.shape - x = x.reshape(B, img_size[0] // self.patch_size, - img_size[1] // self.patch_size, - C).permute(0, 3, 1, 2) + pass x = self.projects[i](x) x = self.resize_layers[i](x) - inputs[i] = x - return inputs + out.append(x) + return out class PreActResidualConvUnit(BaseModule): - """ResidualConvUnit, pre-activate residual unit.""" + """ResidualConvUnit, pre-activate residual unit. + + Args: + in_channels (int): number of channels in the input feature map. + act_cfg (dict): dictionary to construct and config norm layer. + norm_cfg (dict): dictionary to construct and config norm layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None. + stride (int): stride of the first block. Default: 1 + dilation (int): dilation rate for convs layers. Default: 1. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ def __init__(self, in_channels, @@ -188,12 +197,12 @@ def forward(self, *inputs): x = inputs[0] if len(inputs) == 2: if x.shape != inputs[1].shape: - x_ = resize( + inputs[1] = resize( inputs[1], size=(x.shape[2], x.shape[3]), mode='bilinear', align_corners=False) - x = x + self.res_conv_unit1(x_) + x = x + self.res_conv_unit1(inputs[1]) x = self.res_conv_unit2(x) x = resize( x, @@ -214,20 +223,19 @@ class DPTHead(BaseDecodeHead): post_process_channels (List): Out channels of post process conv layers. Default: [96, 192, 384, 768]. readout_type (str): Type of readout operation. Default: 'ignore'. - patch_start_index (int): Start index of feature vector. patch_size (int): The patch size. Default: 16. expand_channels (bool): Whether expand the channels in post process block. Default: False. act_cfg (dict): The activation config for residual conv unit. - Defalut 'ReLU'. - norm_cfg (dict): Config dict for normalization layer. Default 'BN'. + Defalut dict(type='ReLU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). 
""" def __init__(self, embed_dims=768, post_process_channels=[96, 192, 384, 768], readout_type='ignore', - patch_start_index=1, patch_size=16, expand_channels=False, act_cfg=dict(type='ReLU'), @@ -239,16 +247,14 @@ def __init__(self, self.expand_channels = expand_channels self.reassemble_blocks = ReassembleBlocks(embed_dims, post_process_channels, - readout_type, - patch_start_index, - patch_size) + readout_type, patch_size) self.post_process_channels = [ channel * math.pow(2, i) if expand_channels else channel for i, channel in enumerate(post_process_channels) ] self.convs = nn.ModuleList() - for _, channel in enumerate(self.post_process_channels): + for channel in self.post_process_channels: self.convs.append( ConvModule( channel, @@ -268,15 +274,19 @@ def __init__(self, kernel_size=3, padding=1, norm_cfg=norm_cfg) + self.num_fusion_blocks = len(self.fusion_blocks) + self.num_reassemble_blocks = len(self.reassemble_blocks.resize_layers) + self.num_post_process_channels = len(self.post_process_channels) + assert self.num_fusion_blocks == self.num_reassemble_blocks + assert self.num_reassemble_blocks == self.num_post_process_channels def forward(self, inputs): + assert len(inputs) == self.num_reassemble_blocks x = self._transform_inputs(inputs) - x, img_size = [i[0] for i in x], x[0][1] - x = self.reassemble_blocks(x, img_size) + x = self.reassemble_blocks(x) x = [self.convs[i](feature) for i, feature in enumerate(x)] - - out = self.fusion_blocks[3](x[3]) - for i in range(2, -1, -1): + out = self.fusion_blocks[-1](x[-1]) + for i in reversed(range(len(self.fusion_blocks) - 1)): out = self.fusion_blocks[i](out, x[i]) out = self.project(out) out = self.cls_seg(out) From 0e4fb4fa0e79b057b030a4beffc59ff089a193ef Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 20 Jul 2021 15:06:02 +0800 Subject: [PATCH 15/38] fix gelu --- mmseg/models/decode_heads/dpt_head.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index f502825a4b..97fe6cce4c 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -2,8 +2,8 @@ import torch import torch.nn as nn -from mmcv.cnn import (ConvModule, build_activation_layer, build_conv_layer, - build_norm_layer) +from mmcv.cnn import (ConvModule, Linear, build_activation_layer, + build_conv_layer, build_norm_layer) from mmcv.runner import BaseModule from mmseg.ops import resize @@ -68,12 +68,14 @@ def __init__(self, for _ in range(len(self.projects)): self.readout_projects.append( nn.Sequential( - nn.Linear(2 * in_channels, in_channels), nn.GELU())) + Linear(2 * in_channels, in_channels), + build_activation_layer(dict(type='GELU')))) def forward(self, inputs): assert isinstance(inputs, list) out = [] for i, x in enumerate(inputs): + assert len(x) == 2 x, cls_token = x[0], x[1] feature_shape = x.shape if self.readout_type == 'project': From 6073dfaaee1a97723a25b4093aaa70868eabdba1 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 20 Jul 2021 15:08:35 +0800 Subject: [PATCH 16/38] minor change --- mmseg/models/decode_heads/dpt_head.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 97fe6cce4c..f4aa66bb5a 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -99,7 +99,7 @@ class PreActResidualConvUnit(BaseModule): Args: in_channels (int): number of channels in the input feature map. 
- act_cfg (dict): dictionary to construct and config norm layer. + act_cfg (dict): dictionary to construct and config activation layer. norm_cfg (dict): dictionary to construct and config norm layer. conv_cfg (dict): dictionary to construct and config conv layer. Default: None. From 1ebb5586cc8266ec55b384e56395d12725413d43 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 20 Jul 2021 15:43:40 +0800 Subject: [PATCH 17/38] update unittest --- mmseg/models/decode_heads/dpt_head.py | 6 ++++-- tests/test_models/test_heads/test_dpt_head.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index f4aa66bb5a..aea8e44897 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -199,12 +199,14 @@ def forward(self, *inputs): x = inputs[0] if len(inputs) == 2: if x.shape != inputs[1].shape: - inputs[1] = resize( + x_ = resize( inputs[1], size=(x.shape[2], x.shape[3]), mode='bilinear', align_corners=False) - x = x + self.res_conv_unit1(inputs[1]) + else: + x_ = inputs[1] + x = x + self.res_conv_unit1(x_) x = self.res_conv_unit2(x) x = resize( x, diff --git a/tests/test_models/test_heads/test_dpt_head.py b/tests/test_models/test_heads/test_dpt_head.py index 1c730c61c2..5b0e9ebc4c 100644 --- a/tests/test_models/test_heads/test_dpt_head.py +++ b/tests/test_models/test_heads/test_dpt_head.py @@ -21,7 +21,8 @@ def test_dpt_head(): in_index=[0, 1, 2, 3], input_transform='multiple_select') - inputs = [[torch.randn(4, 5, 768), [32, 32]] for _ in range(4)] + inputs = [[torch.randn(4, 768, 2, 2), + torch.randn(4, 768)] for _ in range(4)] output = head(inputs) assert output.shape == torch.Size((4, 19, 16, 16)) From b3903caaba235bbeebdae935b8c67cf2b77c362f Mon Sep 17 00:00:00 2001 From: xiexinch Date: Tue, 20 Jul 2021 15:58:32 +0800 Subject: [PATCH 18/38] fix configs error --- configs/_base_/models/dpt_vit-l16.py | 8 ++++---- configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-l16.py index 00eee459d3..a5e0fe8cce 100644 --- a/configs/_base_/models/dpt_vit-l16.py +++ b/configs/_base_/models/dpt_vit-l16.py @@ -9,22 +9,22 @@ num_heads=16, num_layers=24, out_indices=(5, 11, 17, 23), - out_shape='NLC', final_norm=True, - with_spatial_size=True), + final_norm=True, + with_cls_token=True, + output_cls_token=True), decode_head=dict( type='DPTHead', in_channels=(1024, 1024, 1024, 1024), channels=256, embed_dims=1024, post_process_channels=[256, 512, 1024, 1024], - num_classes=19, + num_classes=150, readout_type='project', input_transform='multiple_select', in_index=(0, 1, 2, 3), norm_cfg=norm_cfg, loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), auxiliary_head=dict( type='FCNHead', in_channels=768, diff --git a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py index d3e45dfd04..6792d6305a 100644 --- a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py +++ b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py @@ -1,5 +1,5 @@ _base_ = [ - '../_base_/models/dpt_vit-16.py', '../_base_/datasets/ade20k.py', + '../_base_/models/dpt_vit-l16.py', '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] From ef87aa584aa74aff8e5ad4d9b42643fa49440a32 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 22 Jul 
2021 14:57:47 +0800 Subject: [PATCH 19/38] inference test --- configs/_base_/datasets/ade20k.py | 4 +++- configs/_base_/models/dpt_vit-l16.py | 2 +- mmseg/models/decode_heads/dpt_head.py | 24 +++++++++++++++--------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/configs/_base_/datasets/ade20k.py b/configs/_base_/datasets/ade20k.py index efc8b4bb20..344fa03830 100644 --- a/configs/_base_/datasets/ade20k.py +++ b/configs/_base_/datasets/ade20k.py @@ -1,8 +1,10 @@ # dataset settings dataset_type = 'ADE20KDataset' data_root = 'data/ade/ADEChallengeData2016' +# img_norm_cfg = dict( +# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) crop_size = (512, 512) train_pipeline = [ dict(type='LoadImageFromFile'), diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-l16.py index a5e0fe8cce..f9d6872078 100644 --- a/configs/_base_/models/dpt_vit-l16.py +++ b/configs/_base_/models/dpt_vit-l16.py @@ -9,7 +9,7 @@ num_heads=16, num_layers=24, out_indices=(5, 11, 17, 23), - final_norm=True, + final_norm=False, with_cls_token=True, output_cls_token=True), decode_head=dict( diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index aea8e44897..3b4ea0e649 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -39,6 +39,7 @@ def __init__(self, in_channels=in_channels, out_channels=out_channel, kernel_size=1, + act_cfg=None, ) for out_channel in out_channels ]) @@ -188,7 +189,11 @@ def __init__(self, self.out_channels = in_channels // 2 self.project = ConvModule( - self.in_channels, self.out_channels, kernel_size=1) + self.in_channels, + self.out_channels, + kernel_size=1, + act_cfg=None, + bias=True) self.res_conv_unit1 = PreActResidualConvUnit( in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) @@ -199,21 +204,22 @@ def forward(self, *inputs): x = inputs[0] if len(inputs) == 2: if x.shape != inputs[1].shape: - x_ = resize( + res = resize( inputs[1], size=(x.shape[2], x.shape[3]), mode='bilinear', align_corners=False) else: - x_ = inputs[1] - x = x + self.res_conv_unit1(x_) + res = inputs[1] + x = x + self.res_conv_unit1(res) x = self.res_conv_unit2(x) x = resize( x, scale_factor=2, mode='bilinear', align_corners=self.align_corners) - return self.project(x) + x = self.project(x) + return x @HEADS.register_module() @@ -265,8 +271,8 @@ def __init__(self, self.channels, kernel_size=3, padding=1, + act_cfg=None, bias=False)) - self.fusion_blocks = nn.ModuleList() for _ in range(len(self.convs)): self.fusion_blocks.append( @@ -289,9 +295,9 @@ def forward(self, inputs): x = self._transform_inputs(inputs) x = self.reassemble_blocks(x) x = [self.convs[i](feature) for i, feature in enumerate(x)] - out = self.fusion_blocks[-1](x[-1]) - for i in reversed(range(len(self.fusion_blocks) - 1)): - out = self.fusion_blocks[i](out, x[i]) + out = self.fusion_blocks[0](x[-1]) + for i in range(1, len(self.fusion_blocks)): + out = self.fusion_blocks[i](out, x[-(i + 1)]) out = self.project(out) out = self.cls_seg(out) return out From 9669d54d1ff6e369ba6ebdbaafe241e5fdfdced3 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 22 Jul 2021 17:36:02 +0800 Subject: [PATCH 20/38] remove auxiliary --- configs/_base_/datasets/ade20k.py | 4 +--- configs/_base_/models/dpt_vit-l16.py | 16 ++--------------
configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py | 3 +-- 3 files changed, 4 insertions(+), 19 deletions(-) diff --git a/configs/_base_/datasets/ade20k.py b/configs/_base_/datasets/ade20k.py index 344fa03830..efc8b4bb20 100644 --- a/configs/_base_/datasets/ade20k.py +++ b/configs/_base_/datasets/ade20k.py @@ -1,10 +1,8 @@ # dataset settings dataset_type = 'ADE20KDataset' data_root = 'data/ade/ADEChallengeData2016' -# img_norm_cfg = dict( -# mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) img_norm_cfg = dict( - mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (512, 512) train_pipeline = [ dict(type='LoadImageFromFile'), diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-l16.py index f9d6872078..4cf2931341 100644 --- a/configs/_base_/models/dpt_vit-l16.py +++ b/configs/_base_/models/dpt_vit-l16.py @@ -1,7 +1,7 @@ norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', - pretrained='https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz', # noqa + pretrained='https://download.openmmlab.com/mmclassification/v0/vit/vit_large_patch16_384.pth', # noqa backbone=dict( type='VisionTransformer', img_size=384, @@ -25,19 +25,7 @@ norm_cfg=norm_cfg, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - auxiliary_head=dict( - type='FCNHead', - in_channels=768, - in_index=3, - channels=256, - num_convs=1, - concat_input=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + auxiliary_head=None, # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='whole')) # yapf: disable diff --git a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py index 6792d6305a..d74aafdd6c 100644 --- a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py +++ b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py @@ -3,5 +3,4 @@ '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] -model = dict( - decode_head=dict(num_classes=150), auxiliary_head=dict(num_classes=150)) +model = dict(decode_head=dict(num_classes=150)) From 03637460283ce1b7bf8a68cd1b149e06fc6cad5f Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 29 Jul 2021 16:40:16 +0800 Subject: [PATCH 21/38] use local pretrain --- configs/_base_/models/dpt_vit-l16.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-l16.py index 4cf2931341..0aa0a78f1b 100644 --- a/configs/_base_/models/dpt_vit-l16.py +++ b/configs/_base_/models/dpt_vit-l16.py @@ -1,7 +1,7 @@ norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', - pretrained='https://download.openmmlab.com/mmclassification/v0/vit/vit_large_patch16_384.pth', # noqa + pretrained='pretrain/vit-l_timm.pth', # noqa backbone=dict( type='VisionTransformer', img_size=384, @@ -11,7 +11,12 @@ out_indices=(5, 11, 17, 23), final_norm=False, with_cls_token=True, - output_cls_token=True), + output_cls_token=True, + # init_cfg=dict( + # type='Pretrained', + # checkpoint='pretrain/vit-l_timm.pth', + # prefix='backbone') + ), decode_head=dict( type='DPTHead', in_channels=(1024, 1024, 1024, 1024), From 
e1ecf6aa9e0ad3a416b6ec52c68b9303b0f7f735 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 11 Aug 2021 16:00:52 +0800 Subject: [PATCH 22/38] update training results --- configs/_base_/models/dpt_vit-l16.py | 7 +-- configs/dpt/README.md | 30 ++++++++++ .../dpt/dpt_vit-b16_512x512_160k_ade20k.py | 58 +++++++++++++++++++ .../dpt/dpt_vit-l16_512x512_160k_ade20k.py | 30 ++++++++++ mmseg/models/decode_heads/dpt_head.py | 2 +- 5 files changed, 120 insertions(+), 7 deletions(-) create mode 100644 configs/dpt/README.md create mode 100644 configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-l16.py index 0aa0a78f1b..5e12c1cedd 100644 --- a/configs/_base_/models/dpt_vit-l16.py +++ b/configs/_base_/models/dpt_vit-l16.py @@ -11,12 +11,7 @@ out_indices=(5, 11, 17, 23), final_norm=False, with_cls_token=True, - output_cls_token=True, - # init_cfg=dict( - # type='Pretrained', - # checkpoint='pretrain/vit-l_timm.pth', - # prefix='backbone') - ), + output_cls_token=True), decode_head=dict( type='DPTHead', in_channels=(1024, 1024, 1024, 1024), diff --git a/configs/dpt/README.md b/configs/dpt/README.md new file mode 100644 index 0000000000..26667d74d5 --- /dev/null +++ b/configs/dpt/README.md @@ -0,0 +1,30 @@ +# Vision Transformers for Dense Prediction + +## Introduction + + + +```latex +@article{dosovitskiy2020, + title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, + author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, + journal={arXiv preprint arXiv:2010.11929}, + year={2020} +} + +@article{Ranftl2021, + author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun}, + title = {Vision Transformers for Dense Prediction}, + journal = {ArXiv preprint}, + year = {2021}, +} +``` + +## Results and models + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DPT | ViT-B | 512x512 | 160000 | | | 46.97 | 48.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json) | +| DPT | ViT-L | 512x512 | 160000 | | | 46.19 | 46.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-7b753ca6.pth) | 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-20210809_172025.log.json) | diff --git a/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py new file mode 100644 index 0000000000..cc8c1cc711 --- /dev/null +++ b/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/dpt_vit-l16.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] + +model = dict( + pretrained='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth', # noqa + backbone=dict( + type='VisionTransformer', + img_size=224, + embed_dims=768, + num_layers=12, + num_heads=12, + out_indices=(2, 5, 8, 11), + final_norm=False, + with_cls_token=True, + with_cp=True, + output_cls_token=True), + decode_head=dict( + type='DPTHead', + in_channels=(768, 768, 768, 768), + channels=256, + embed_dims=768, + post_process_channels=[96, 192, 384, 768], + num_classes=150), + auxiliary_head=None, + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) # yapf: disable + + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) diff --git a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py index d74aafdd6c..6459ad353b 100644 --- a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py +++ b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py @@ -4,3 +4,33 @@ ] model = dict(decode_head=dict(num_classes=150)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optimizer = dict( + _delete_=True, + type='AdamW', + lr=0.00006, + betas=(0.9, 0.999), + weight_decay=0.01, + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ })) + +lr_config = dict( + _delete_=True, + policy='poly', + warmup='linear', + warmup_iters=1500, + warmup_ratio=1e-6, + power=1.0, + min_lr=0.0, + by_epoch=False) + +# By default, models are trained on 8 GPUs with 2 images per GPU +data = dict(samples_per_gpu=2) + +cudnn_benchmark = False diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 3b4ea0e649..dbb1c27f90 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -277,7 +277,7 @@ def __init__(self, for _ in range(len(self.convs)): self.fusion_blocks.append( FeatureFusionBlock(self.channels, act_cfg, norm_cfg)) - + self.fusion_blocks[0].res_conv_unit1 = None self.project = ConvModule( self.channels, self.channels, From 7726d2b7d270478f5ab5c8287e7b84d812afef13 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 11 Aug 2021 16:02:13 +0800 Subject: [PATCH 23/38] update yml --- configs/dpt/dpt.yml | 34 ++++++++++++++++++++++++++++++++++ model-index.yml | 1 + 2 files changed, 35 insertions(+) create mode 100644 configs/dpt/dpt.yml diff --git a/configs/dpt/dpt.yml b/configs/dpt/dpt.yml new file mode 100644 index 0000000000..a921248da3 --- /dev/null +++ b/configs/dpt/dpt.yml @@ -0,0 +1,34 @@ +Collections: +- Name: dpt + Metadata: + Training Data: + - ADE20K +Models: +- Name: dpt_vit-b16_512x512_160k_ade20k + In Collection: dpt + Metadata: + backbone: ViT-B + crop size: (512,512) + lr schd: 160000 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.97 + mIoU(ms+flip): 48.34 + Config: configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth +- Name: dpt_vit-l16_512x512_160k_ade20k + In Collection: dpt + Metadata: + backbone: ViT-L + crop size: (512,512) + lr schd: 160000 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.19 + mIoU(ms+flip): 46.97 + Config: configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-7b753ca6.pth diff --git a/model-index.yml b/model-index.yml index f834162e26..ce533c7af9 100644 --- a/model-index.yml +++ b/model-index.yml @@ -8,6 +8,7 @@ Import: - configs/deeplabv3plus/deeplabv3plus.yml - configs/dmnet/dmnet.yml - configs/dnlnet/dnlnet.yml +- configs/dpt/dpt.yml - configs/emanet/emanet.yml - configs/encnet/encnet.yml - configs/fastscnn/fastscnn.yml From c5593af327c2dea5bd25db67309d00ddfa885066 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 12 Aug 2021 11:07:29 +0800 Subject: [PATCH 24/38] update fps and memory test --- configs/dpt/README.md | 4 ++-- configs/dpt/dpt.yml | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/configs/dpt/README.md b/configs/dpt/README.md index 26667d74d5..62d568c85d 100644 --- a/configs/dpt/README.md +++ b/configs/dpt/README.md @@ -26,5 +26,5 @@ | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DPT | ViT-B | 512x512 | 160000 | | | 46.97 | 48.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json) | -| DPT | ViT-L | 512x512 | 160000 | | | 46.19 | 46.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-7b753ca6.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-20210809_172025.log.json) | +| DPT | ViT-B | 512x512 | 160000 | 8.09 | 10.41 | 46.97 | 48.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json) | +| DPT | ViT-L | 512x512 | 160000 | 18.37 | 4.36 | 46.19 | 46.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-7b753ca6.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-20210809_172025.log.json) | diff --git a/configs/dpt/dpt.yml b/configs/dpt/dpt.yml index a921248da3..dad2b5c8df 100644 --- a/configs/dpt/dpt.yml +++ b/configs/dpt/dpt.yml @@ -10,6 +10,14 @@ Models: backbone: ViT-B crop size: (512,512) lr schd: 160000 + inference time (ms/im): + - value: 96.06 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + memory (GB): 8.09 Results: Task: Semantic Segmentation Dataset: ADE20K @@ -24,6 +32,14 @@ Models: backbone: ViT-L crop size: (512,512) lr schd: 160000 + inference time (ms/im): + - value: 229.36 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (512,512) + memory (GB): 18.37 Results: Task: Semantic Segmentation Dataset: ADE20K From 30aabc4e5493d27c8820e8744744b763a3aee6a5 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 19 Aug 2021 10:56:12 +0800 Subject: [PATCH 25/38] update doc --- .../models/{dpt_vit-l16.py => dpt_vit-b16.py} | 19 +++--- .../dpt/dpt_vit-b16_512x512_160k_ade20k.py | 30 +--------- .../dpt/dpt_vit-l16_512x512_160k_ade20k.py | 58 ++++++++----------- 3 files changed, 35 insertions(+), 72 deletions(-) rename configs/_base_/models/{dpt_vit-l16.py => dpt_vit-b16.py} (68%) diff --git a/configs/_base_/models/dpt_vit-l16.py b/configs/_base_/models/dpt_vit-b16.py similarity index 68% rename from 
configs/_base_/models/dpt_vit-l16.py rename to configs/_base_/models/dpt_vit-b16.py index 5e12c1cedd..d77e164b7f 100644 --- a/configs/_base_/models/dpt_vit-l16.py +++ b/configs/_base_/models/dpt_vit-b16.py @@ -1,23 +1,24 @@ norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', - pretrained='pretrain/vit-l_timm.pth', # noqa + pretrained='pretrain/vit-b16_p16_224pth', # noqa backbone=dict( type='VisionTransformer', - img_size=384, - embed_dims=1024, - num_heads=16, - num_layers=24, - out_indices=(5, 11, 17, 23), + img_size=224, + embed_dims=768, + num_layers=12, + num_heads=12, + out_indices=(2, 5, 8, 11), final_norm=False, with_cls_token=True, + with_cp=True, output_cls_token=True), decode_head=dict( type='DPTHead', - in_channels=(1024, 1024, 1024, 1024), + in_channels=(768, 768, 768, 768), channels=256, - embed_dims=1024, - post_process_channels=[256, 512, 1024, 1024], + embed_dims=768, + post_process_channels=[96, 192, 384, 768], num_classes=150, readout_type='project', input_transform='multiple_select', diff --git a/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py index cc8c1cc711..e5a48ef23a 100644 --- a/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py +++ b/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py @@ -1,34 +1,8 @@ _base_ = [ - '../_base_/models/dpt_vit-l16.py', '../_base_/datasets/ade20k.py', + '../_base_/models/dpt_vit-b16.py', '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' ] -model = dict( - pretrained='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth', # noqa - backbone=dict( - type='VisionTransformer', - img_size=224, - embed_dims=768, - num_layers=12, - num_heads=12, - out_indices=(2, 5, 8, 11), - final_norm=False, - with_cls_token=True, - with_cp=True, - output_cls_token=True), - decode_head=dict( - type='DPTHead', - in_channels=(768, 768, 768, 768), - channels=256, - embed_dims=768, - post_process_channels=[96, 192, 384, 768], - num_classes=150), - auxiliary_head=None, - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) # yapf: disable - - # AdamW optimizer, no weight decay for position embedding & layer norm # in backbone optimizer = dict( @@ -55,4 +29,4 @@ by_epoch=False) # By default, models are trained on 8 GPUs with 2 images per GPU -data = dict(samples_per_gpu=2) +data = dict(workers_per_gpu=2) diff --git a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py index 6459ad353b..3a7c33cc7c 100644 --- a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py +++ b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py @@ -1,36 +1,24 @@ -_base_ = [ - '../_base_/models/dpt_vit-l16.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] +_base_ = './dpt_vit-b16_512x512_160k_ade20k.py' -model = dict(decode_head=dict(num_classes=150)) - -# AdamW optimizer, no weight decay for position embedding & layer norm -# in backbone -optimizer = dict( - _delete_=True, - type='AdamW', - lr=0.00006, - betas=(0.9, 0.999), - weight_decay=0.01, - paramwise_cfg=dict( - custom_keys={ - 'pos_embed': dict(decay_mult=0.), - 'cls_token': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.) 
- })) - -lr_config = dict( - _delete_=True, - policy='poly', - warmup='linear', - warmup_iters=1500, - warmup_ratio=1e-6, - power=1.0, - min_lr=0.0, - by_epoch=False) - -# By default, models are trained on 8 GPUs with 2 images per GPU -data = dict(samples_per_gpu=2) - -cudnn_benchmark = False +model = dict( + type='EncoderDecoder', + pretrained='pretrain/vit-l16_p16_384.pth', # noqa + backbone=dict( + type='VisionTransformer', + img_size=384, + embed_dims=1024, + num_heads=16, + num_layers=24, + out_indices=(5, 11, 17, 23), + final_norm=False, + with_cls_token=True, + output_cls_token=True), + decode_head=dict( + type='DPTHead', + in_channels=(1024, 1024, 1024, 1024), + channels=256, + embed_dims=1024, + post_process_channels=[256, 512, 1024, 1024]), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) # yapf: disable From 64e6f644d964268cedc5e26e52e2ad9271e7bc9b Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 19 Aug 2021 11:02:39 +0800 Subject: [PATCH 26/38] update readme --- configs/_base_/models/dpt_vit-b16.py | 2 +- configs/dpt/README.md | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/configs/_base_/models/dpt_vit-b16.py b/configs/_base_/models/dpt_vit-b16.py index d77e164b7f..a5be6bcd3a 100644 --- a/configs/_base_/models/dpt_vit-b16.py +++ b/configs/_base_/models/dpt_vit-b16.py @@ -1,7 +1,7 @@ norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', - pretrained='pretrain/vit-b16_p16_224pth', # noqa + pretrained='pretrain/vit-b16_p16_224.pth', # noqa backbone=dict( type='VisionTransformer', img_size=224, diff --git a/configs/dpt/README.md b/configs/dpt/README.md index 62d568c85d..0a63f3769c 100644 --- a/configs/dpt/README.md +++ b/configs/dpt/README.md @@ -20,6 +20,16 @@ } ``` +## How to use ViT pretrain weights + +We convert the backbone weights from the pytorch-image-models repo (https://github.com/rwightman/pytorch-image-models) with `tools/model_converters/vit_convert.py`. + +You may follow below steps to start segformer training preparation: + +1. Download segformer pretrain weights (Suggest put in `pretrain/`); +2. Run convert script to convert official pretrain weights: `python tools/model_converters/vit_convert.py pretrain/vit_timm.pth pretrain/vit-b16__p16_224.pth`; +3. 
Modify `pretrained` of VisionTransformer model config, for example, `pretrained` of `dpt_vit-b16.py` is set to `pretrain/vit-b16_p16_224.pth`; + ## Results and models ### ADE20K From 96ce1750b88cc6ea52fa65d05a0f81646558ff9f Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 19 Aug 2021 12:17:34 +0800 Subject: [PATCH 27/38] add yml --- configs/dpt/dpt.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/configs/dpt/dpt.yml b/configs/dpt/dpt.yml index dad2b5c8df..29ee7dcc44 100644 --- a/configs/dpt/dpt.yml +++ b/configs/dpt/dpt.yml @@ -1,50 +1,50 @@ Collections: -- Name: dpt - Metadata: +- Metadata: Training Data: - ADE20K + Name: dpt Models: -- Name: dpt_vit-b16_512x512_160k_ade20k +- Config: configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py In Collection: dpt Metadata: backbone: ViT-B crop size: (512,512) - lr schd: 160000 inference time (ms/im): - - value: 96.06 - hardware: V100 - backend: PyTorch + - backend: PyTorch batch size: 1 + hardware: V100 mode: FP32 resolution: (512,512) + value: 96.06 + lr schd: 160000 memory (GB): 8.09 + Name: dpt_vit-b16_512x512_160k_ade20k Results: - Task: Semantic Segmentation Dataset: ADE20K Metrics: mIoU: 46.97 mIoU(ms+flip): 48.34 - Config: configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py + Task: Semantic Segmentation Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth -- Name: dpt_vit-l16_512x512_160k_ade20k +- Config: configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py In Collection: dpt Metadata: backbone: ViT-L crop size: (512,512) - lr schd: 160000 inference time (ms/im): - - value: 229.36 - hardware: V100 - backend: PyTorch + - backend: PyTorch batch size: 1 + hardware: V100 mode: FP32 resolution: (512,512) + value: 229.36 + lr schd: 160000 memory (GB): 18.37 + Name: dpt_vit-l16_512x512_160k_ade20k Results: - Task: Semantic Segmentation Dataset: ADE20K Metrics: mIoU: 46.19 mIoU(ms+flip): 46.97 - Config: configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py + Task: Semantic Segmentation Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-7b753ca6.pth From fa6133906b7d538bbd1fd99e58bd861bef25546d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 19 Aug 2021 16:14:45 +0800 Subject: [PATCH 28/38] update doc --- configs/dpt/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/dpt/README.md b/configs/dpt/README.md index 0a63f3769c..182f3b38b6 100644 --- a/configs/dpt/README.md +++ b/configs/dpt/README.md @@ -20,15 +20,15 @@ } ``` -## How to use ViT pretrain weights +## How to use ViT pretrained weights -We convert the backbone weights from the pytorch-image-models repo (https://github.com/rwightman/pytorch-image-models) with `tools/model_converters/vit_convert.py`. +We convert the backbone weights from the pytorch-image-models repository (https://github.com/rwightman/pytorch-image-models) with `tools/model_converters/vit_convert.py`. -You may follow below steps to start segformer training preparation: +You may follow below steps to start DPT training preparation: -1. Download segformer pretrain weights (Suggest put in `pretrain/`); -2. Run convert script to convert official pretrain weights: `python tools/model_converters/vit_convert.py pretrain/vit_timm.pth pretrain/vit-b16__p16_224.pth`; -3. 
Modify `pretrained` of VisionTransformer model config, for example, `pretrained` of `dpt_vit-b16.py` is set to `pretrain/vit-b16_p16_224.pth`; +1. Download ViT pretrained weights (Suggest put in `pretrain/`); +2. Run convert script to convert official pretrained weights: `python tools/model_converters/vit_convert.py pretrain/vit-timm.pth pretrain/vit-mmseg.pth`; +3. Modify `pretrained` of VisionTransformer model config, for example, `pretrained` of `dpt_vit-b16.py` is set to `pretrain/vit-mmseg.pth`; ## Results and models From 55bcd748e86461288717d29cc601d640f2184dc9 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 19 Aug 2021 16:26:40 +0800 Subject: [PATCH 29/38] remove with_cp --- configs/_base_/models/dpt_vit-b16.py | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/_base_/models/dpt_vit-b16.py b/configs/_base_/models/dpt_vit-b16.py index a5be6bcd3a..43d3e0cf16 100644 --- a/configs/_base_/models/dpt_vit-b16.py +++ b/configs/_base_/models/dpt_vit-b16.py @@ -11,7 +11,6 @@ out_indices=(2, 5, 8, 11), final_norm=False, with_cls_token=True, - with_cp=True, output_cls_token=True), decode_head=dict( type='DPTHead', From 4b33f6f980eb6eaff06e3a4862d0e525533d03b9 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 19 Aug 2021 16:39:30 +0800 Subject: [PATCH 30/38] update config --- configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py | 2 +- configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py | 14 ++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py index e5a48ef23a..c751a68232 100644 --- a/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py +++ b/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py @@ -29,4 +29,4 @@ by_epoch=False) # By default, models are trained on 8 GPUs with 2 images per GPU -data = dict(workers_per_gpu=2) +data = dict(samples_per_gpu=2, workers_per_gpu=2) diff --git a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py index 3a7c33cc7c..47ba3f8f9a 100644 --- a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py +++ b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py @@ -1,24 +1,14 @@ _base_ = './dpt_vit-b16_512x512_160k_ade20k.py' model = dict( - type='EncoderDecoder', pretrained='pretrain/vit-l16_p16_384.pth', # noqa backbone=dict( - type='VisionTransformer', img_size=384, embed_dims=1024, num_heads=16, num_layers=24, - out_indices=(5, 11, 17, 23), - final_norm=False, - with_cls_token=True, - output_cls_token=True), + out_indices=(5, 11, 17, 23)), decode_head=dict( - type='DPTHead', in_channels=(1024, 1024, 1024, 1024), - channels=256, embed_dims=1024, - post_process_channels=[256, 512, 1024, 1024]), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) # yapf: disable + post_process_channels=[256, 512, 1024, 1024])) # yapf: disable From 76344cd17fc04847d7b8fd56983e0cc98de7c9d6 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Thu, 19 Aug 2021 16:54:31 +0800 Subject: [PATCH 31/38] update docstring --- mmseg/models/decode_heads/dpt_head.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index dbb1c27f90..a55ea61793 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -160,7 +160,7 @@ def forward(self, inputs): class FeatureFusionBlock(BaseModule): - """FeatureFusionBlock, merge feature map from different stage. 
+ """FeatureFusionBlock, merge feature map from different stages. Args: in_channels (int): Input channels. @@ -230,6 +230,7 @@ class DPTHead(BaseDecodeHead): Args: embed_dims (int): The embed dimension of the ViT backbone. + Default: 768. post_process_channels (List): Out channels of post process conv layers. Default: [96, 192, 384, 768]. readout_type (str): Type of readout operation. Default: 'ignore'. From 94fb8d44c843a2bffe678a47f8c70caf93ebda3d Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 25 Aug 2021 16:08:16 +0800 Subject: [PATCH 32/38] remove dpt-l --- configs/dpt/README.md | 1 - configs/dpt/dpt.yml | 22 ------------------- .../dpt/dpt_vit-l16_512x512_160k_ade20k.py | 14 ------------ 3 files changed, 37 deletions(-) delete mode 100644 configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py diff --git a/configs/dpt/README.md b/configs/dpt/README.md index 182f3b38b6..aa4a15de29 100644 --- a/configs/dpt/README.md +++ b/configs/dpt/README.md @@ -37,4 +37,3 @@ You may follow below steps to start DPT training preparation: | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | ------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | DPT | ViT-B | 512x512 | 160000 | 8.09 | 10.41 | 46.97 | 48.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json) | -| DPT | ViT-L | 512x512 | 160000 | 18.37 | 4.36 | 46.19 | 46.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-7b753ca6.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-20210809_172025.log.json) | diff --git a/configs/dpt/dpt.yml b/configs/dpt/dpt.yml index 29ee7dcc44..affb8d4f3f 100644 --- a/configs/dpt/dpt.yml +++ b/configs/dpt/dpt.yml @@ -26,25 +26,3 @@ Models: mIoU(ms+flip): 48.34 Task: Semantic Segmentation Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth -- Config: configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py - In Collection: dpt - Metadata: - backbone: ViT-L - crop size: (512,512) - inference time (ms/im): - - backend: PyTorch - batch size: 1 - hardware: V100 - mode: FP32 - resolution: (512,512) - value: 229.36 - lr schd: 160000 - memory (GB): 18.37 - Name: dpt_vit-l16_512x512_160k_ade20k - Results: - Dataset: ADE20K - Metrics: - mIoU: 46.19 - mIoU(ms+flip): 46.97 - Task: Semantic Segmentation - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-l16_512x512_160k_ade20k/dpt_vit-l16_512x512_160k_ade20k-7b753ca6.pth diff --git a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py deleted file mode 100644 index 47ba3f8f9a..0000000000 --- a/configs/dpt/dpt_vit-l16_512x512_160k_ade20k.py +++ /dev/null @@ -1,14 +0,0 @@ -_base_ = './dpt_vit-b16_512x512_160k_ade20k.py' - -model = dict( - pretrained='pretrain/vit-l16_p16_384.pth', # noqa - backbone=dict( - img_size=384, - embed_dims=1024, - num_heads=16, - num_layers=24, - out_indices=(5, 11, 17, 23)), - decode_head=dict( - in_channels=(1024, 1024, 1024, 1024), - embed_dims=1024, - post_process_channels=[256, 512, 1024, 1024])) # yapf: disable From 5e56d1b9ce91a6dff49093271dc80770010dd10a Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 25 Aug 2021 17:44:18 +0800 Subject: [PATCH 33/38] add init_cfg and modify readme.md --- README.md | 1 + configs/dpt/README.md | 20 ++++++++++++++------ mmseg/models/decode_heads/dpt_head.py | 12 ++++++++---- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 152955531b..2a8e9f3b71 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ Supported methods: - [x] [PointRend (CVPR'2020)](configs/point_rend) - [x] [CGNet (TIP'2020)](configs/cgnet) - [x] [SETR (CVPR'2021)](configs/setr) +- [x] [DPT (ArXiv' 2021)](configs/dpt) ## Installation diff --git a/configs/dpt/README.md b/configs/dpt/README.md index aa4a15de29..3dd994cc58 100644 --- a/configs/dpt/README.md +++ b/configs/dpt/README.md @@ -20,15 +20,23 @@ } ``` -## How to use ViT pretrained weights +## Usage -We convert the backbone weights from the pytorch-image-models repository (https://github.com/rwightman/pytorch-image-models) with `tools/model_converters/vit_convert.py`. +To use other repositories' pre-trained models, it is necessary to convert keys. -You may follow below steps to start DPT training preparation: +We provide a script [`vit2mmseg.py`](../../tools/model_converters/vit2mmseg.py) in the tools directory to convert the key of models from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py) to MMSegmentation style. -1. Download ViT pretrained weights (Suggest put in `pretrain/`); -2. Run convert script to convert official pretrained weights: `python tools/model_converters/vit_convert.py pretrain/vit-timm.pth pretrain/vit-mmseg.pth`; -3. Modify `pretrained` of VisionTransformer model config, for example, `pretrained` of `dpt_vit-b16.py` is set to `pretrain/vit-mmseg.pth`; +```shell +python tools/model_converters/vit2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH} +``` + +E.g. + +```shell +python tools/model_converters/vit2mmseg.py https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth pretrain/jx_vit_base_p16_224-80ecf9dd.pth +``` + +This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`. ## Results and models diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index a55ea61793..6642f562db 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -21,14 +21,16 @@ class ReassembleBlocks(BaseModule): Default: [96, 192, 384, 768]. readout_type (str): Type of readout operation. Default: 'ignore'. patch_size (int): The patch size. Default: 16. + init_cfg (dict, optional): Initialization config dict. Default: None. 
""" def __init__(self, in_channels=768, out_channels=[96, 192, 384, 768], readout_type='ignore', - patch_size=16): - super(ReassembleBlocks, self).__init__() + patch_size=16, + init_cfg=None): + super(ReassembleBlocks, self).__init__(init_cfg) assert readout_type in ['ignore', 'add', 'project'] self.readout_type = readout_type @@ -170,6 +172,7 @@ class FeatureFusionBlock(BaseModule): Default: False. align_corners (bool): align_corner setting for bilinear upsample. Default: True. + init_cfg (dict, optional): Initialization config dict. Default: None. """ def __init__(self, @@ -177,8 +180,9 @@ def __init__(self, act_cfg, norm_cfg, expand=False, - align_corners=True): - super(FeatureFusionBlock, self).__init__() + align_corners=True, + init_cfg=None): + super(FeatureFusionBlock, self).__init__(init_cfg) self.in_channels = in_channels self.expand = expand From f4ad2fada25cae8aca36436ec56fc0899b80bd68 Mon Sep 17 00:00:00 2001 From: Junjun2016 Date: Wed, 25 Aug 2021 18:17:26 +0800 Subject: [PATCH 34/38] Update dpt_vit-b16.py --- configs/_base_/models/dpt_vit-b16.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/_base_/models/dpt_vit-b16.py b/configs/_base_/models/dpt_vit-b16.py index 43d3e0cf16..dfd48a95f8 100644 --- a/configs/_base_/models/dpt_vit-b16.py +++ b/configs/_base_/models/dpt_vit-b16.py @@ -1,7 +1,7 @@ norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', - pretrained='pretrain/vit-b16_p16_224.pth', # noqa + pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa backbone=dict( type='VisionTransformer', img_size=224, From 161d4948864bebeb217580f7ed24a5bfbc2a7cea Mon Sep 17 00:00:00 2001 From: xiexinch Date: Wed, 25 Aug 2021 18:31:47 +0800 Subject: [PATCH 35/38] zh-n README --- README_zh-CN.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README_zh-CN.md b/README_zh-CN.md index 01536b86f1..39d041164c 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -91,6 +91,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O - [x] [PointRend (CVPR'2020)](configs/point_rend) - [x] [CGNet (TIP'2020)](configs/cgnet) - [x] [SETR (CVPR'2021)](configs/setr) +- [x] [DPT (ArXiv' 2021)](configs/dpt) ## 安装 From a41ce0575de27e52e7ecdc22ded2f9eacd4ea0f0 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 30 Aug 2021 12:20:43 +0800 Subject: [PATCH 36/38] use constructor instead of build function --- mmseg/models/decode_heads/dpt_head.py | 50 +++++++++------------------ 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 6642f562db..1a621a0bca 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -2,8 +2,7 @@ import torch import torch.nn as nn -from mmcv.cnn import (ConvModule, Linear, build_activation_layer, - build_conv_layer, build_norm_layer) +from mmcv.cnn import ConvModule, Linear, build_activation_layer from mmcv.runner import BaseModule from mmseg.ops import resize @@ -104,8 +103,6 @@ class PreActResidualConvUnit(BaseModule): in_channels (int): number of channels in the input feature map. act_cfg (dict): dictionary to construct and config activation layer. norm_cfg (dict): dictionary to construct and config norm layer. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None. stride (int): stride of the first block. Default: 1 dilation (int): dilation rate for convs layers. Default: 1. init_cfg (dict, optional): Initialization config dict. Default: None. 
@@ -115,49 +112,36 @@ def __init__(self, in_channels, act_cfg, norm_cfg, - conv_cfg=None, stride=1, dilation=1, init_cfg=None): super(PreActResidualConvUnit, self).__init__(init_cfg) - self.norm1_name, norm1 = build_norm_layer( - norm_cfg, in_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - norm_cfg, in_channels, postfix=2) - - self.conv1 = build_conv_layer( - conv_cfg, + self.conv1 = ConvModule( in_channels, in_channels, 3, stride=stride, padding=dilation, dilation=dilation, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - conv_cfg, in_channels, in_channels, 3, padding=1, bias=False) - self.add_module(self.norm2_name, norm2) - self.activate = build_activation_layer(act_cfg) - - @property - def norm1(self): - """nn.Module: normalization layer after the first convolution layer""" - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: normalization layer after the second convolution layer""" - return getattr(self, self.norm2_name) + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=False, + order=('act', 'conv', 'norm')) + + self.conv2 = ConvModule( + in_channels, + in_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=False, + order=('act', 'conv', 'norm')) def forward(self, inputs): - x = self.activate(inputs) - x = self.conv1(x) - x = self.norm1(x) - x = self.activate(x) + x = self.conv1(inputs) x = self.conv2(x) - x = self.norm2(x) return x + inputs From 78b56b1f349eda2beca04424cc159fcf7b65aac2 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 30 Aug 2021 15:28:49 +0800 Subject: [PATCH 37/38] prevent tensor being modified by ConvModule --- mmseg/models/decode_heads/dpt_head.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 1a621a0bca..3adc32d4d5 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -1,3 +1,4 @@ +import copy import math import torch @@ -140,9 +141,10 @@ def __init__(self, order=('act', 'conv', 'norm')) def forward(self, inputs): + inputs_ = copy.deepcopy(inputs) x = self.conv1(inputs) x = self.conv2(x) - return x + inputs + return x + inputs_ class FeatureFusionBlock(BaseModule): From 522cdffe6e8fb40e918583582718c74280103779 Mon Sep 17 00:00:00 2001 From: xiexinch Date: Mon, 30 Aug 2021 15:50:29 +0800 Subject: [PATCH 38/38] fix unittest --- mmseg/models/decode_heads/dpt_head.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 3adc32d4d5..7028f2a230 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -1,4 +1,3 @@ -import copy import math import torch @@ -141,7 +140,7 @@ def __init__(self, order=('act', 'conv', 'norm')) def forward(self, inputs): - inputs_ = copy.deepcopy(inputs) + inputs_ = inputs.clone() x = self.conv1(inputs) x = self.conv2(x) return x + inputs_
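After the last patch, `DPTHead` expects each selected backbone stage to arrive as a `[patch_feature, cls_token]` pair, since `readout_type='project'` folds the class token back into every patch token before reassembly. Below is a minimal smoke test of that contract, patterned on the shapes in `tests/test_models/test_heads/test_dpt_head.py` from patch 17; the constructor arguments used here (channel widths, `num_classes=19`) are illustrative values lifted from the configs and test above, and running it assumes an mmseg checkout that already includes this patch series.

```python
import torch

from mmseg.models.decode_heads import DPTHead

# Four ViT stages with embed dim 768; the (B, C, h, w) patch features and
# (B, C) cls tokens mimic a backbone built with with_cls_token=True and
# output_cls_token=True.
head = DPTHead(
    in_channels=(768, 768, 768, 768),
    channels=256,
    embed_dims=768,
    num_classes=19,
    post_process_channels=[96, 192, 384, 768],
    readout_type='project',
    in_index=[0, 1, 2, 3],
    input_transform='multiple_select')

inputs = [[torch.randn(4, 768, 2, 2),
           torch.randn(4, 768)] for _ in range(4)]

# ReassembleBlocks resizes the four stages to 4x, 2x, 1x and 0.5x of the
# 2x2 patch grid, and each of the four fusion blocks upsamples by 2, so
# the logits land at 8x the grid: (4, 19, 16, 16).
output = head(inputs)
assert output.shape == torch.Size((4, 19, 16, 16))
```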