diff --git a/mmseg/datasets/pipelines/formating.py b/mmseg/datasets/pipelines/formating.py
index 45824fc405..4e057c1b81 100644
--- a/mmseg/datasets/pipelines/formating.py
+++ b/mmseg/datasets/pipelines/formating.py
@@ -249,9 +249,9 @@ class Collect(object):
         keys (Sequence[str]): Keys of results to be collected in ``data``.
         meta_keys (Sequence[str], optional): Meta keys to be converted to
             ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
-            Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape',
-            'pad_shape', 'scale_factor', 'flip', 'flip_direction',
-            'img_norm_cfg')``
+            Default: (``filename``, ``ori_filename``, ``ori_shape``,
+            ``img_shape``, ``pad_shape``, ``scale_factor``, ``flip``,
+            ``flip_direction``, ``img_norm_cfg``)
     """
 
     def __init__(self,
diff --git a/mmseg/models/backbones/cgnet.py b/mmseg/models/backbones/cgnet.py
index 67c06717ba..168194c106 100644
--- a/mmseg/models/backbones/cgnet.py
+++ b/mmseg/models/backbones/cgnet.py
@@ -187,8 +187,8 @@ def forward(self, x):
 class CGNet(BaseModule):
     """CGNet backbone.
 
-    A Light-weight Context Guided Network for Semantic Segmentation
-    arXiv: https://arxiv.org/abs/1811.08201
+    This backbone is the implementation of `A Light-weight Context Guided
+    Network for Semantic Segmentation <https://arxiv.org/abs/1811.08201>`_.
 
     Args:
         in_channels (int): Number of input image channels. Normally 3.
diff --git a/mmseg/models/backbones/fast_scnn.py b/mmseg/models/backbones/fast_scnn.py
index 95a434413b..cbfbcaf4f3 100644
--- a/mmseg/models/backbones/fast_scnn.py
+++ b/mmseg/models/backbones/fast_scnn.py
@@ -272,6 +272,9 @@ def forward(self, higher_res_feature, lower_res_feature):
 class FastSCNN(BaseModule):
     """Fast-SCNN Backbone.
 
+    This backbone is the implementation of `Fast-SCNN: Fast Semantic
+    Segmentation Network <https://arxiv.org/abs/1902.04502>`_.
+
     Args:
         in_channels (int): Number of input image channels. Default: 3.
         downsample_dw_channels (tuple[int]): Number of output channels after
diff --git a/mmseg/models/backbones/hrnet.py b/mmseg/models/backbones/hrnet.py
index a0b1e47cde..90feadcf62 100644
--- a/mmseg/models/backbones/hrnet.py
+++ b/mmseg/models/backbones/hrnet.py
@@ -218,8 +218,8 @@ def forward(self, x):
 class HRNet(BaseModule):
     """HRNet backbone.
 
-    `High-Resolution Representations for Labeling Pixels and Regions
-    arXiv: <https://arxiv.org/abs/1904.04514>`_.
+    This backbone is the implementation of `High-Resolution Representations
+    for Labeling Pixels and Regions <https://arxiv.org/abs/1904.04514>`_.
 
     Args:
         extra (dict): Detailed configuration for each stage of HRNet.
diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py
index 90abfe539b..ee8bbfab45 100644
--- a/mmseg/models/backbones/mit.py
+++ b/mmseg/models/backbones/mit.py
@@ -246,9 +246,9 @@ def forward(self, x, hw_shape):
 class MixVisionTransformer(BaseModule):
     """The backbone of Segformer.
 
-    A PyTorch implement of : `SegFormer: Simple and Efficient Design for
-    Semantic Segmentation with Transformers` -
-        https://arxiv.org/pdf/2105.15203.pdf
+    This backbone is the implementation of `SegFormer: Simple and
+    Efficient Design for Semantic Segmentation with
+    Transformers <https://arxiv.org/abs/2105.15203>`_.
 
     Args:
         in_channels (int): Number of input channels. Default: 3.
diff --git a/mmseg/models/backbones/mobilenet_v2.py b/mmseg/models/backbones/mobilenet_v2.py
index 988e29cdea..cbb9c6cd01 100644
--- a/mmseg/models/backbones/mobilenet_v2.py
+++ b/mmseg/models/backbones/mobilenet_v2.py
@@ -14,6 +14,10 @@
 class MobileNetV2(BaseModule):
     """MobileNetV2 backbone.
 
+    This backbone is the implementation of
+    `MobileNetV2: Inverted Residuals and Linear Bottlenecks
+    <https://arxiv.org/abs/1801.04381>`_.
+
     Args:
         widen_factor (float): Width multiplier, multiply number of channels
             in each layer by this amount. Default: 1.0.
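The hunks above are docstring-only, so runtime behaviour should be unchanged. As a quick smoke test of one of the documented classes, here is a minimal sketch, assuming ``mmseg`` and ``torch`` are installed and that ``MobileNetV2`` is exported from ``mmseg.models.backbones`` as in this revision:

```python
# Sanity check (not part of the diff): the MobileNetV2 backbone documented
# above should still build and run after the docstring-only change.
import torch

from mmseg.models.backbones import MobileNetV2

model = MobileNetV2(widen_factor=1.0)  # widen_factor as documented above
model.eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))  # NCHW, 3-channel input
for feat in feats:
    print(feat.shape)  # one feature map per configured out_index
```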
diff --git a/mmseg/models/backbones/resnest.py b/mmseg/models/backbones/resnest.py
index f47adb5302..91952c2caf 100644
--- a/mmseg/models/backbones/resnest.py
+++ b/mmseg/models/backbones/resnest.py
@@ -271,6 +271,9 @@ def _inner_forward(x):
 class ResNeSt(ResNetV1d):
     """ResNeSt backbone.
 
+    This backbone is the implementation of `ResNeSt:
+    Split-Attention Networks <https://arxiv.org/abs/2004.08955>`_.
+
     Args:
         groups (int): Number of groups of Bottleneck. Default: 1
         base_width (int): Base width of Bottleneck. Default: 4
diff --git a/mmseg/models/backbones/resnet.py b/mmseg/models/backbones/resnet.py
index f9a1ceb4e0..e8b961d5fa 100644
--- a/mmseg/models/backbones/resnet.py
+++ b/mmseg/models/backbones/resnet.py
@@ -311,6 +311,9 @@ def _inner_forward(x):
 class ResNet(BaseModule):
     """ResNet backbone.
 
+    This backbone is the improved implementation of `Deep Residual Learning
+    for Image Recognition <https://arxiv.org/abs/1512.03385>`_.
+
     Args:
         depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
         in_channels (int): Number of input image channels. Default: 3.
@@ -686,11 +689,10 @@ def train(self, mode=True):
 class ResNetV1c(ResNet):
     """ResNetV1c variant described in [1]_.
 
-    Compared with default ResNet(ResNetV1b), ResNetV1c replaces the 7x7 conv
-    in the input stem with three 3x3 convs.
-
-    References:
-        .. [1] https://arxiv.org/pdf/1812.01187.pdf
+    Compared with default ResNet(ResNetV1b), ResNetV1c replaces the 7x7 conv in
+    the input stem with three 3x3 convs. For more details please refer to `Bag
+    of Tricks for Image Classification with Convolutional Neural Networks
+    <https://arxiv.org/abs/1812.01187>`_.
     """
 
     def __init__(self, **kwargs):
diff --git a/mmseg/models/backbones/resnext.py b/mmseg/models/backbones/resnext.py
index 450b77bb76..805c27bf33 100644
--- a/mmseg/models/backbones/resnext.py
+++ b/mmseg/models/backbones/resnext.py
@@ -88,6 +88,10 @@ def __init__(self,
 class ResNeXt(ResNet):
     """ResNeXt backbone.
 
+    This backbone is the implementation of `Aggregated
+    Residual Transformations for Deep Neural
+    Networks <https://arxiv.org/abs/1611.05431>`_.
+
     Args:
         depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
         in_channels (int): Number of input image channels. Normally 3.
diff --git a/mmseg/models/backbones/swin.py b/mmseg/models/backbones/swin.py
index e3e835a032..424c456cb3 100644
--- a/mmseg/models/backbones/swin.py
+++ b/mmseg/models/backbones/swin.py
@@ -522,13 +522,12 @@ def forward(self, x, hw_shape):
 
 @BACKBONES.register_module()
 class SwinTransformer(BaseModule):
-    """ Swin Transformer
-    A PyTorch implement of : `Swin Transformer:
-    Hierarchical Vision Transformer using Shifted Windows` -
-        https://arxiv.org/abs/2103.14030
+    """Swin Transformer backbone.
 
-    Inspiration from
-    https://github.com/microsoft/Swin-Transformer
+    This backbone is the implementation of `Swin Transformer:
+    Hierarchical Vision Transformer using Shifted
+    Windows <https://arxiv.org/abs/2103.14030>`_.
+    Inspiration from https://github.com/microsoft/Swin-Transformer.
 
     Args:
         pretrain_img_size (int | tuple[int]): The size of input image when
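The reworded ResNetV1c docstring may read clearer next to a concrete comparison. A sketch, again assuming ``mmseg`` and ``torch`` are installed, contrasting the default ResNet stem with the three-3x3-conv stem the docstring describes:

```python
# Illustration (not part of the diff): ResNetV1c vs. plain ResNet stems.
import torch

from mmseg.models.backbones import ResNet, ResNetV1c

v1b = ResNet(depth=50)     # default stem: a single 7x7 conv
v1c = ResNetV1c(depth=50)  # deep stem: three stacked 3x3 convs

x = torch.randn(1, 3, 224, 224)
for model in (v1b, v1c):
    model.eval()
    with torch.no_grad():
        outs = model(x)
    print([o.shape for o in outs])  # same output strides for both variants
```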
diff --git a/mmseg/models/backbones/unet.py b/mmseg/models/backbones/unet.py
index 680c79e320..c2d33667f8 100644
--- a/mmseg/models/backbones/unet.py
+++ b/mmseg/models/backbones/unet.py
@@ -224,8 +224,9 @@ def forward(self, x):
 @BACKBONES.register_module()
 class UNet(BaseModule):
     """UNet backbone.
-    U-Net: Convolutional Networks for Biomedical Image Segmentation.
-    https://arxiv.org/pdf/1505.04597.pdf
+
+    This backbone is the implementation of `U-Net: Convolutional Networks
+    for Biomedical Image Segmentation <https://arxiv.org/abs/1505.04597>`_.
 
     Args:
         in_channels (int): Number of input image channels. Default: 3.
@@ -277,7 +278,6 @@ class UNet(BaseModule):
         The input image size should be divisible by the whole downsample rate
         of the encoder. More detail of the whole downsample rate can be found
         in UNet._check_input_divisible.
-
     """
 
     def __init__(self,
diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py
index 003fa537e6..668d278992 100644
--- a/mmseg/models/backbones/vit.py
+++ b/mmseg/models/backbones/vit.py
@@ -98,9 +98,9 @@ def forward(self, x):
 class VisionTransformer(BaseModule):
     """Vision Transformer.
 
-    A PyTorch implement of : `An Image is Worth 16x16 Words:
-    Transformers for Image Recognition at Scale` -
-        https://arxiv.org/abs/2010.11929
+    This backbone is the implementation of `An Image is Worth 16x16 Words:
+    Transformers for Image Recognition at
+    Scale <https://arxiv.org/abs/2010.11929>`_.
 
     Args:
         img_size (int | tuple): Input image size. Default: 224.
diff --git a/mmseg/models/decode_heads/point_head.py b/mmseg/models/decode_heads/point_head.py
index 4bc388cbc0..4470571144 100644
--- a/mmseg/models/decode_heads/point_head.py
+++ b/mmseg/models/decode_heads/point_head.py
@@ -36,6 +36,8 @@ def calculate_uncertainty(seg_logits):
 class PointHead(BaseCascadeDecodeHead):
     """A mask point head use in PointRend.
 
+    This head is the implementation of `PointRend: Image Segmentation as
+    Rendering <https://arxiv.org/abs/1912.08193>`_.
     ``PointHead`` use shared multi-layer perceptron (equivalent to
     nn.Conv1d) to predict the logit of input points. The fine-grained feature
     and coarse feature will be concatenate together for predication.
diff --git a/mmseg/models/decode_heads/sep_fcn_head.py b/mmseg/models/decode_heads/sep_fcn_head.py
index 5e22a66f7c..7f9658e08f 100644
--- a/mmseg/models/decode_heads/sep_fcn_head.py
+++ b/mmseg/models/decode_heads/sep_fcn_head.py
@@ -10,7 +10,9 @@ class DepthwiseSeparableFCNHead(FCNHead):
     """Depthwise-Separable Fully Convolutional Network for Semantic
     Segmentation.
 
-    This head is implemented according to Fast-SCNN paper.
+    This head is implemented according to `Fast-SCNN: Fast Semantic
+    Segmentation Network <https://arxiv.org/abs/1902.04502>`_.
+
     Args:
         in_channels(int): Number of output channels of FFM.
         channels(int): Number of middle-stage channels in the decode head.
diff --git a/mmseg/models/necks/fpn.py b/mmseg/models/necks/fpn.py
index 8461a75e49..bc237428e9 100644
--- a/mmseg/models/necks/fpn.py
+++ b/mmseg/models/necks/fpn.py
@@ -12,8 +12,8 @@
 class FPN(BaseModule):
     """Feature Pyramid Network.
 
-    This is an implementation of - Feature Pyramid Networks for Object
-    Detection (https://arxiv.org/abs/1612.03144)
+    This neck is the implementation of `Feature Pyramid Networks for Object
+    Detection <https://arxiv.org/abs/1612.03144>`_.
 
     Args:
         in_channels (List[int]): Number of input channels per scale.
diff --git a/mmseg/models/necks/mla_neck.py b/mmseg/models/necks/mla_neck.py
index 5fc3b98b0b..1513e296da 100644
--- a/mmseg/models/necks/mla_neck.py
+++ b/mmseg/models/necks/mla_neck.py
@@ -63,8 +63,8 @@ def forward(self, inputs):
 
 class MLANeck(nn.Module):
     """Multi-level Feature Aggregation.
 
-    The Multi-level Feature Aggregation construction of SETR:
-    https://arxiv.org/pdf/2012.15840.pdf
+    This neck is the `Multi-level Feature Aggregation construction of
+    SETR <https://arxiv.org/abs/2012.15840>`_.
 
     Args:
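The UNet hunk keeps the note that the input size must be divisible by the encoder's whole downsample rate. A short check of that rule, as a sketch assuming ``mmseg``/``torch`` and the library's default five-stage configuration:

```python
# Checking the divisibility rule from the UNet docstring (not part of the
# diff). With the assumed defaults of five stages and four downsampling
# stages, the whole downsample rate is 16, so H and W must be multiples of 16.
import torch

from mmseg.models.backbones import UNet

model = UNet()  # library defaults assumed: num_stages=5, four downsamples
model.eval()
with torch.no_grad():
    outs = model(torch.randn(1, 3, 64, 64))  # 64 % 16 == 0, so this is valid
print([o.shape for o in outs])
# A 65x65 input would be rejected by UNet._check_input_divisible.
```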
diff --git a/mmseg/models/necks/multilevel_neck.py b/mmseg/models/necks/multilevel_neck.py
index cbf4b01176..5151f8762d 100644
--- a/mmseg/models/necks/multilevel_neck.py
+++ b/mmseg/models/necks/multilevel_neck.py
@@ -11,6 +11,7 @@ class MultiLevelNeck(nn.Module):
     """MultiLevelNeck.
 
     A neck structure connect vit backbone and decoder_heads.
+
     Args:
         in_channels (List[int]): Number of input channels per scale.
         out_channels (int): Number of output channels (used at each scale).
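Since the MultiLevelNeck docstring only gains a blank line here, a usage sketch may still help readers. The channel count and scales below are illustrative assumptions, not values taken from this diff:

```python
# Hypothetical wiring of MultiLevelNeck between a ViT-style backbone and a
# decode head (not part of the diff); 768 channels and these scales are
# assumptions for illustration only.
import torch

from mmseg.models.necks import MultiLevelNeck

neck = MultiLevelNeck(
    in_channels=[768, 768, 768, 768],  # ViT emits same-size feature maps
    out_channels=768,
    scales=[4, 2, 1, 0.5])  # resample the single scale into a pyramid
feats = [torch.randn(1, 768, 32, 32) for _ in range(4)]
outs = neck(feats)
print([o.shape for o in outs])  # spatial sizes 128, 64, 32, 16
```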