style: Updated flake8 config (#174)
* style: Updated flake8 config

* fix: Fixed segmentation training

* style: Fixed docstrings

* style: Reformatted docstrings to match flake8
frgfm authored Nov 14, 2021
1 parent f7c50eb commit 1f000bd
Showing 9 changed files with 81 additions and 80 deletions.
5 changes: 3 additions & 2 deletions .flake8
@@ -1,4 +1,5 @@
[flake8]
max-line-length = 120
- ignore = F401, E402, E265, F403, W503, W504, F821, W605, E731
- exclude = .circleci, .git, venv*, docs, build
+ ignore = E402, E265, F403, W503, W504, E731
+ exclude = .circleci, .git, venv*, docs, build
+ per-file-ignores = **/__init__.py:F401
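The practical effect of this change is that F401 ("imported but unused") is no longer silenced everywhere, only in package `__init__.py` files, where imports are deliberate re-exports. A minimal sketch of the pattern this exception targets (module and symbol names below are hypothetical, not taken from the repository):

```python
# holocron/<subpackage>/__init__.py  (hypothetical example)
# These imports exist purely to re-export symbols at the package level,
# so flake8 would normally flag them as F401 ("imported but unused").
# The new per-file-ignores entry silences F401 here, while the rule
# still applies to every other module.
from .some_module import SomeLayer
from .another_module import build_something

__all__ = ["SomeLayer", "build_something"]
```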
46 changes: 23 additions & 23 deletions holocron/models/detection/yolo.py
@@ -357,7 +357,7 @@ def _yolo(arch: str, pretrained: bool, progress: bool, pretrained_backbone: bool


def yolov1(pretrained: bool = False, progress: bool = True, pretrained_backbone: bool = True, **kwargs: Any) -> YOLOv1:
"""YOLO model from
r"""YOLO model from
`"You Only Look Once: Unified, Real-Time Object Detection" <https://pjreddie.com/media/files/papers/yolo_1.pdf>`_.
YOLO's particularity is to make predictions in a grid (same size as last feature map). For each grid cell,
@@ -369,49 +369,49 @@ def yolov1(pretrained: bool = False, progress: bool = True, pretrained_backbone:
For training, YOLO uses a multi-part loss whose components are computed by:
.. math::
- \\mathcal{L}_{coords} = \\sum\\limits_{i=0}^{S^2} \\sum\\limits_{j=0}^{B}
- \\mathbb{1}_{ij}^{obj} \\Big[
- (x_{ij} - \\hat{x}_{ij})² + (y_{ij} - \\hat{y}_{ij})² +
- (\\sqrt{w_{ij}} - \\sqrt{\\hat{w}_{ij}})² + (\\sqrt{h_{ij}} - \\sqrt{\\hat{h}_{ij}})²
- \\Big]
+ \mathcal{L}_{coords} = \sum\limits_{i=0}^{S^2} \sum\limits_{j=0}^{B}
+ \mathbb{1}_{ij}^{obj} \Big[
+ (x_{ij} - \hat{x}_{ij})² + (y_{ij} - \hat{y}_{ij})² +
+ (\sqrt{w_{ij}} - \sqrt{\hat{w}_{ij}})² + (\sqrt{h_{ij}} - \sqrt{\hat{h}_{ij}})²
+ \Big]
where :math:`S` is size of the output feature map (7 for an input size :math:`(448, 448)`),
:math:`B` is the number of anchor boxes per grid cell (default: 2),
- :math:`\\mathbb{1}_{ij}^{obj}` equals to 1 if a GT center falls inside the i-th grid cell and among the
+ :math:`\mathbb{1}_{ij}^{obj}` equals to 1 if a GT center falls inside the i-th grid cell and among the
anchor boxes of that cell, has the highest IoU with the j-th box else 0,
:math:`(x_{ij}, y_{ij}, w_{ij}, h_{ij})` are the coordinates of the ground truth assigned to
the j-th anchor box of the i-th grid cell,
- and :math:`(\\hat{x}_{ij}, \\hat{y}_{ij}, \\hat{w}_{ij}, \\hat{h}_{ij})` are the coordinate predictions
+ and :math:`(\hat{x}_{ij}, \hat{y}_{ij}, \hat{w}_{ij}, \hat{h}_{ij})` are the coordinate predictions
for the j-th anchor box of the i-th grid cell.
.. math::
- \\mathcal{L}_{objectness} = \\sum\\limits_{i=0}^{S^2} \\sum\\limits_{j=0}^{B}
- \\Big[ \\mathbb{1}_{ij}^{obj} \\Big(C_{ij} - \\hat{C}_{ij} \\Big)^2
- + \\lambda_{noobj} \\mathbb{1}_{ij}^{noobj} \\Big(C_{ij} - \\hat{C}_{ij} \\Big)^2
- \\Big]
+ \mathcal{L}_{objectness} = \sum\limits_{i=0}^{S^2} \sum\limits_{j=0}^{B}
+ \Big[ \mathbb{1}_{ij}^{obj} \Big(C_{ij} - \hat{C}_{ij} \Big)^2
+ + \lambda_{noobj} \mathbb{1}_{ij}^{noobj} \Big(C_{ij} - \hat{C}_{ij} \Big)^2
+ \Big]
- where :math:`\\lambda_{noobj}` is a positive coefficient (default: 0.5),
- :math:`\\mathbb{1}_{ij}^{noobj} = 1 - \\mathbb{1}_{ij}^{obj}`,
+ where :math:`\lambda_{noobj}` is a positive coefficient (default: 0.5),
+ :math:`\mathbb{1}_{ij}^{noobj} = 1 - \mathbb{1}_{ij}^{obj}`,
:math:`C_{ij}` equals the Intersection Over Union between the j-th anchor box in the i-th grid cell and its
matched ground truth box if that box is matched with a ground truth else 0,
- and :math:`\\hat{C}_{ij}` is the objectness score of the j-th anchor box in the i-th grid cell..
+ and :math:`\hat{C}_{ij}` is the objectness score of the j-th anchor box in the i-th grid cell..
.. math::
- \\mathcal{L}_{classification} = \\sum\\limits_{i=0}^{S^2}
- \\mathbb{1}_{i}^{obj} \\sum\\limits_{c \\in classes}
- (p_i(c) - \\hat{p}_i(c))^2
+ \mathcal{L}_{classification} = \sum\limits_{i=0}^{S^2}
+ \mathbb{1}_{i}^{obj} \sum\limits_{c \in classes}
+ (p_i(c) - \hat{p}_i(c))^2
- where :math:`\\mathbb{1}_{i}^{obj}` equals to 1 if a GT center falls inside the i-th grid cell else 0,
+ where :math:`\mathbb{1}_{i}^{obj}` equals to 1 if a GT center falls inside the i-th grid cell else 0,
:math:`p_i(c)` equals 1 if the assigned ground truth to the i-th cell is classified as class :math:`c`,
- and :math:`\\hat{p}_i(c)` is the predicted probability of class :math:`c` in the i-th cell.
+ and :math:`\hat{p}_i(c)` is the predicted probability of class :math:`c` in the i-th cell.
And the full loss is given by:
.. math::
- \\mathcal{L}_{YOLOv1} = \\lambda_{coords} \\cdot \\mathcal{L}_{coords} +
- \\mathcal{L}_{objectness} + \\mathcal{L}_{classification}
+ \mathcal{L}_{YOLOv1} = \lambda_{coords} \cdot \mathcal{L}_{coords} +
+ \mathcal{L}_{objectness} + \mathcal{L}_{classification}
- where :math:`\\lambda_{coords}` is a positive coefficient (default: 5).
+ where :math:`\lambda_{coords}` is a positive coefficient (default: 5).
Args:
pretrained (bool, optional): If True, returns a model pre-trained on ImageNet
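To make the three loss terms above easier to follow, here is a rough tensor-level sketch of the YOLOv1-style objective. It is written directly from the equations in the docstring and is not the library's training code; tensor shapes, the ground-truth assignment step and all variable names are assumptions.

```python
import torch
from torch import Tensor

def yolov1_loss_sketch(
    coords: Tensor,      # (N, S*S, B, 4) predicted (x, y, w, h)
    coords_gt: Tensor,   # (N, S*S, B, 4) assigned ground-truth boxes
    obj_score: Tensor,   # (N, S*S, B) predicted objectness
    iou_gt: Tensor,      # (N, S*S, B) IoU with the assigned ground truth
    probs: Tensor,       # (N, S*S, C) predicted class probabilities
    probs_gt: Tensor,    # (N, S*S, C) one-hot ground-truth classes
    obj_mask: Tensor,    # (N, S*S, B) float, 1 if the box is matched with a GT
    cell_mask: Tensor,   # (N, S*S) float, 1 if a GT center falls in the cell
    lambda_coords: float = 5.0,
    lambda_noobj: float = 0.5,
) -> Tensor:
    x, y, w, h = coords.unbind(-1)
    xg, yg, wg, hg = coords_gt.unbind(-1)
    # Coordinate loss: squared error on centers and on the square roots of sizes,
    # only for the anchor boxes matched with a ground truth.
    coord_loss = (obj_mask * (
        (x - xg) ** 2 + (y - yg) ** 2
        + (w.sqrt() - wg.sqrt()) ** 2 + (h.sqrt() - hg.sqrt()) ** 2
    )).sum()
    # Objectness loss: matched boxes regress their IoU with the assigned GT,
    # unmatched boxes are pushed towards 0 and down-weighted by lambda_noobj.
    noobj_mask = 1 - obj_mask
    obj_loss = (obj_mask * (iou_gt - obj_score) ** 2
                + lambda_noobj * noobj_mask * obj_score ** 2).sum()
    # Classification loss: squared error on class probabilities, per grid cell,
    # only for cells that contain a ground-truth center.
    clf_loss = (cell_mask.unsqueeze(-1) * (probs - probs_gt) ** 2).sum()
    return lambda_coords * coord_loss + obj_loss + clf_loss
```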
12 changes: 6 additions & 6 deletions holocron/models/detection/yolov2.py
@@ -206,7 +206,7 @@ def _yolo(arch: str, pretrained: bool, progress: bool, pretrained_backbone: bool


def yolov2(pretrained: bool = False, progress: bool = True, pretrained_backbone: bool = True, **kwargs: Any) -> YOLOv2:
"""YOLOv2 model from
r"""YOLOv2 model from
`"YOLO9000: Better, Faster, Stronger" <https://pjreddie.com/media/files/papers/YOLO9000.pdf>`_.
YOLOv2 improves upon YOLO by raising the number of boxes predicted by grid cell (default: 5), introducing
@@ -215,17 +215,17 @@ def yolov2(pretrained: bool = False, progress: bool = True, pretrained_backbone:
For training, YOLOv2 uses the same multi-part loss as YOLO apart from its classification loss:
.. math::
- \\mathcal{L}_{classification} = \\sum\\limits_{i=0}^{S^2} \\sum\\limits_{j=0}^{B}
- \\mathbb{1}_{ij}^{obj} \\sum\\limits_{c \\in classes}
- (p_{ij}(c) - \\hat{p}_{ij}(c))^2
+ \mathcal{L}_{classification} = \sum\limits_{i=0}^{S^2} \sum\limits_{j=0}^{B}
+ \mathbb{1}_{ij}^{obj} \sum\limits_{c \in classes}
+ (p_{ij}(c) - \hat{p}_{ij}(c))^2
where :math:`S` is size of the output feature map (13 for an input size :math:`(416, 416)`),
:math:`B` is the number of anchor boxes per grid cell (default: 5),
- :math:`\\mathbb{1}_{ij}^{obj}` equals to 1 if a GT center falls inside the i-th grid cell and among the
+ :math:`\mathbb{1}_{ij}^{obj}` equals to 1 if a GT center falls inside the i-th grid cell and among the
anchor boxes of that cell, has the highest IoU with the j-th box else 0,
:math:`p_{ij}(c)` equals 1 if the assigned ground truth to the j-th anchor box of the i-th cell is classified
as class :math:`c`,
- and :math:`\\hat{p}_{ij}(c)` is the predicted probability of class :math:`c` for the j-th anchor box
+ and :math:`\hat{p}_{ij}(c)` is the predicted probability of class :math:`c` for the j-th anchor box
in the i-th cell.
Args:
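Relative to the YOLOv1 sketch above, the only change described here is that class probabilities are predicted and penalised per anchor box rather than per grid cell. Again a minimal sketch, with the same caveats on shapes and names:

```python
from torch import Tensor

def yolov2_classification_loss_sketch(
    probs: Tensor,     # (N, S*S, B, C) per-anchor class probabilities
    probs_gt: Tensor,  # (N, S*S, B, C) one-hot targets for the assigned GT
    obj_mask: Tensor,  # (N, S*S, B) float, 1 if the anchor box is matched with a GT
) -> Tensor:
    # Squared error on class probabilities, restricted to matched anchor boxes.
    return (obj_mask.unsqueeze(-1) * (probs - probs_gt) ** 2).sum()
```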
8 changes: 4 additions & 4 deletions holocron/models/detection/yolov4.py
@@ -514,7 +514,7 @@ def _yolo(arch: str, pretrained: bool, progress: bool, pretrained_backbone: bool


def yolov4(pretrained: bool = False, progress: bool = True, pretrained_backbone: bool = True, **kwargs: Any) -> YOLOv4:
"""YOLOv4 model from
r"""YOLOv4 model from
`"YOLOv4: Optimal Speed and Accuracy of Object Detection" <https://arxiv.org/pdf/2004.10934.pdf>`_.
The architecture improves upon YOLOv3 by including: the usage of `DropBlock
@@ -526,14 +526,14 @@ def yolov4(pretrained: bool = False, progress: bool = True, pretrained_backbone:
For training, YOLOv4 uses the same multi-part loss as YOLOv3 apart from its box coordinate loss:
.. math::
- \\mathcal{L}_{coords} = \\sum\\limits_{i=0}^{S^2} \\sum\\limits_{j=0}^{B}
- \\min\\limits_{k \\in [1, M]} C_{IoU}(\\hat{loc}_{ij}, loc^{GT}_k)
+ \mathcal{L}_{coords} = \sum\limits_{i=0}^{S^2} \sum\limits_{j=0}^{B}
+ \min\limits_{k \in [1, M]} C_{IoU}(\hat{loc}_{ij}, loc^{GT}_k)
where :math:`S` is size of the output feature map (13 for an input size :math:`(416, 416)`),
:math:`B` is the number of anchor boxes per grid cell (default: 3),
:math:`M` is the number of ground truth boxes,
:math:`C_{IoU}` is the complete IoU loss,
- :math:`\\hat{loc}_{ij}` is the predicted bounding box for grid cell :math:`i` at anchor :math:`j`,
+ :math:`\hat{loc}_{ij}` is the predicted bounding box for grid cell :math:`i` at anchor :math:`j`,
and :math:`loc^{GT}_k` is the k-th ground truth bounding box.
Args:
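The coordinate term above swaps the squared-error regression for a complete-IoU (CIoU) criterion, with each prediction scored against its closest ground-truth box. The sketch below assumes a pairwise `ciou_loss(pred, target)` helper returning a (P, M) matrix; that helper, and the flattening of the grid and anchor dimensions into P predictions, are assumptions rather than the library's actual API.

```python
from torch import Tensor

def yolov4_coord_loss_sketch(pred_boxes: Tensor, gt_boxes: Tensor, ciou_loss) -> Tensor:
    """pred_boxes: (P, 4) over all cells and anchors; gt_boxes: (M, 4)."""
    pairwise = ciou_loss(pred_boxes, gt_boxes)  # (P, M) pairwise CIoU losses
    # Each predicted box is penalised against the ground truth it fits best.
    return pairwise.min(dim=1).values.sum()
```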
14 changes: 7 additions & 7 deletions holocron/nn/modules/activation.py
@@ -28,25 +28,25 @@ def extra_repr(self) -> str:


class HardMish(_Activation):
"""Implements the Had Mish activation module from `"H-Mish" <https://github.com/digantamisra98/H-Mish>`_
r"""Implements the Had Mish activation module from `"H-Mish" <https://github.com/digantamisra98/H-Mish>`_
This activation is computed as follows:
.. math::
- f(x) = \\frac{x}{2} \\cdot \\min(2, \\max(0, x + 2))
+ f(x) = \frac{x}{2} \cdot \min(2, \max(0, x + 2))
"""
def forward(self, x: Tensor) -> Tensor:
return F.hard_mish(x, inplace=self.inplace)


class NLReLU(_Activation):
"""Implements the Natural-Logarithm ReLU activation module from `"Natural-Logarithm-Rectified Activation
r"""Implements the Natural-Logarithm ReLU activation module from `"Natural-Logarithm-Rectified Activation
Function in Convolutional Neural Networks" <https://arxiv.org/pdf/1908.03682.pdf>`_
This activation is computed as follows:
.. math::
- f(x) = ln(1 + \\beta \\cdot max(0, x))
+ f(x) = ln(1 + \beta \cdot max(0, x))
Args:
inplace (bool): should the operation be performed inplace
@@ -56,15 +56,15 @@ def forward(self, x: Tensor) -> Tensor:


class FReLU(nn.Module):
"""Implements the Funnel activation module from `"Funnel Activation for Visual Recognition"
r"""Implements the Funnel activation module from `"Funnel Activation for Visual Recognition"
<https://arxiv.org/pdf/2007.11824.pdf>`_
This activation is computed as follows:
.. math::
- f(x) = max(\\mathbb{T}(x), x)
+ f(x) = max(\mathbb{T}(x), x)
- where the :math:`\\mathbb{T}` is the spatial contextual feature extraction. It is a convolution filter of size
+ where the :math:`\mathbb{T}` is the spatial contextual feature extraction. It is a convolution filter of size
`kernel_size`, same padding and groups equal to the number of input channels, followed by a batch normalization.
Args:
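The three activations documented above follow directly from their formulas. The snippets below are minimal functional sketches written from those equations, not the library's own implementations in `holocron.nn.functional`:

```python
import torch
from torch import Tensor, nn
import torch.nn.functional as F

def hard_mish_sketch(x: Tensor) -> Tensor:
    # f(x) = x/2 * min(2, max(0, x + 2))
    return 0.5 * x * torch.clamp(x + 2, min=0, max=2)

def nl_relu_sketch(x: Tensor, beta: float = 1.0) -> Tensor:
    # f(x) = ln(1 + beta * max(0, x))
    return torch.log1p(beta * F.relu(x))

class FReLUSketch(nn.Module):
    """f(x) = max(T(x), x), where T is a depthwise conv followed by batch norm."""

    def __init__(self, channels: int, kernel_size: int = 3) -> None:
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size,
                              padding=kernel_size // 2, groups=channels)
        self.bn = nn.BatchNorm2d(channels)

    def forward(self, x: Tensor) -> Tensor:
        # Element-wise max between the spatial context T(x) and the input.
        return torch.max(self.bn(self.conv(x)), x)
```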
32 changes: 16 additions & 16 deletions holocron/nn/modules/conv.py
@@ -43,7 +43,7 @@ def __init__(


class NormConv2d(_NormConvNd):
"""Implements the normalized convolution module from `"Normalized Convolutional Neural Network"
r"""Implements the normalized convolution module from `"Normalized Convolutional Neural Network"
<https://arxiv.org/pdf/2005.05274v2.pdf>`_.
In the simplest case, the output value of the layer with input size
@@ -53,7 +53,7 @@ class NormConv2d(_NormConvNd):
.. math::
out(N_i, C_{out_j}) = bias(C_{out_j}) +
\sum_{k = 0}^{C_{in} - 1} weight(C_{out_j}, k) \star
- \\frac{input(N_i, k) - \mu(N_i, k)}{\sqrt{\sigma^2(N_i, k) + \epsilon}}
+ \frac{input(N_i, k) - \mu(N_i, k)}{\sqrt{\sigma^2(N_i, k) + \epsilon}}
where :math:`\star` is the valid 2D cross-correlation operator,
:math:`\mu(N_i, k)` and :math:`\sigma²(N_i, k)` are the mean and variance of :math:`input(N_i, k)` over all slices,
@@ -111,15 +111,15 @@ def forward(self, x: Tensor) -> Tensor:


class Add2d(_NormConvNd):
"""Implements the adder module from `"AdderNet: Do We Really Need Multiplications in Deep Learning?"
r"""Implements the adder module from `"AdderNet: Do We Really Need Multiplications in Deep Learning?"
<https://arxiv.org/pdf/1912.13200.pdf>`_.
In the simplest case, the output value of the layer at position :math:`(m, n)` in channel :math:`c`
with filter F of spatial size :math:`(d, d)`, intput size :math:`(C_{in}, H, W)` and output :math:`(C_{out}, H, W)`
can be precisely described as:
.. math::
- out(m, n, c) = - \\sum\\limits_{i=0}^d \\sum\\limits_{j=0}^d \\sum\\limits_{k=0}^{C_{in}}
+ out(m, n, c) = - \sum\limits_{i=0}^d \sum\limits_{j=0}^d \sum\limits_{k=0}^{C_{in}}
|X(m + i, n + j, k) - F(i, j, k, c)|
where :math:`C` denotes a number of channels,
@@ -183,24 +183,24 @@ def forward(self, x: Tensor) -> Tensor:


class SlimConv2d(nn.Module):
"""Implements the convolution module from `"SlimConv: Reducing Channel Redundancy in Convolutional Neural Networks
r"""Implements the convolution module from `"SlimConv: Reducing Channel Redundancy in Convolutional Neural Networks
by Weights Flipping" <https://arxiv.org/pdf/2003.07469.pdf>`_.
First, we compute channel-wise weights as follows:
.. math::
- z(c) = \\frac{1}{H \\cdot W} \\sum\\limits_{i=1}^H \\sum\\limits_{j=1}^W X_{c,i,j}
+ z(c) = \frac{1}{H \cdot W} \sum\limits_{i=1}^H \sum\limits_{j=1}^W X_{c,i,j}
- where :math:`X \\in \\mathbb{R}^{C \\times H \\times W}` is the input tensor,
+ where :math:`X \in \mathbb{R}^{C \times H \times W}` is the input tensor,
:math:`H` is height in pixels, and :math:`W` is
width in pixels.
.. math::
- w = \\sigma(F_{fc2}(\\delta(F_{fc1}(z))))
+ w = \sigma(F_{fc2}(\delta(F_{fc1}(z))))
- where :math:`z \\in \\mathbb{R}^{C}` contains channel-wise statistics,
- :math:`\\sigma` refers to the sigmoid function,
- :math:`\\delta` refers to the ReLU function,
+ where :math:`z \in \mathbb{R}^{C}` contains channel-wise statistics,
+ :math:`\sigma` refers to the sigmoid function,
+ :math:`\delta` refers to the ReLU function,
:math:`F_{fc1}` is a convolution operation with kernel of size :math:`(1, 1)`
with :math:`max(C/r, L)` output channels followed by batch normalization,
and :math:`F_{fc2}` is a plain convolution operation with kernel of size :math:`(1, 1)`
@@ -209,12 +209,12 @@ class SlimConv2d(nn.Module):
We then proceed with reconstructing and transforming both pathways:
.. math::
- X_{top} = X \\odot w
+ X_{top} = X \odot w
.. math::
- X_{bot} = X \\odot \\check{w}
+ X_{bot} = X \odot \check{w}
- where :math:`\\odot` refers to the element-wise multiplication and :math:`\\check{w}` is
+ where :math:`\odot` refers to the element-wise multiplication and :math:`\check{w}` is
the channel-wise reverse-flip of :math:`w`.
.. math::
@@ -231,9 +231,9 @@ class SlimConv2d(nn.Module):
Finally we fuse both pathways to yield the output:
.. math::
- Y = T_{top} \\oplus T_{bot}
+ Y = T_{top} \oplus T_{bot}
- where :math:`\\oplus` is the channel-wise concatenation.
+ where :math:`\oplus` is the channel-wise concatenation.
.. image:: https://github.com/frgfm/Holocron/releases/download/v0.1.3/slimconv2d.png
:align: center
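As a reading aid for the SlimConv2d description, the channel-weighting step (the :math:`z`, :math:`w` and :math:`\check{w}` quantities) can be sketched as below. The reduction ratio, the minimum width and the omission of the subsequent reconstruction and transform convolutions are simplifications, not the module's actual implementation.

```python
import torch
from torch import nn, Tensor
from typing import Tuple

class SlimChannelWeightsSketch(nn.Module):
    """Channel statistics -> sigmoid gating weights w, plus their channel-wise flip."""

    def __init__(self, channels: int, r: int = 32, min_channels: int = 8) -> None:
        super().__init__()
        hidden = max(channels // r, min_channels)
        # F_fc1: 1x1 convolution followed by batch norm; F_fc2: plain 1x1 convolution
        self.fc1 = nn.Sequential(nn.Conv2d(channels, hidden, 1), nn.BatchNorm2d(hidden))
        self.fc2 = nn.Conv2d(hidden, channels, 1)

    def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]:
        # z(c): spatial average of every channel
        z = x.mean(dim=(2, 3), keepdim=True)
        # w = sigmoid(F_fc2(relu(F_fc1(z))))
        w = torch.sigmoid(self.fc2(torch.relu(self.fc1(z))))
        # X_top = X * w ; X_bot = X * check(w), the channel-wise reverse of w
        return x * w, x * w.flip(dims=(1,))
```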
18 changes: 9 additions & 9 deletions holocron/nn/modules/loss.py
@@ -42,23 +42,23 @@ def __init__(


class FocalLoss(_Loss):
"""Implementation of Focal Loss as described in
r"""Implementation of Focal Loss as described in
`"Focal Loss for Dense Object Detection" <https://arxiv.org/pdf/1708.02002.pdf>`_.
While the weighted cross-entropy is described by:
.. math::
- CE(p_t) = -\\alpha_t log(p_t)
+ CE(p_t) = -\alpha_t log(p_t)
- where :math:`\\alpha_t` is the loss weight of class :math:`t`,
+ where :math:`\alpha_t` is the loss weight of class :math:`t`,
and :math:`p_t` is the predicted probability of class :math:`t`.
the focal loss introduces a modulating factor
.. math::
- FL(p_t) = -\\alpha_t (1 - p_t)^\\gamma log(p_t)
+ FL(p_t) = -\alpha_t (1 - p_t)^\gamma log(p_t)
- where :math:`\\gamma` is a positive focusing parameter.
+ where :math:`\gamma` is a positive focusing parameter.
Args:
gamma (float, optional): exponent parameter of the focal loss
@@ -120,16 +120,16 @@ def __repr__(self) -> str:


class ClassBalancedWrapper(nn.Module):
"""Implementation of the class-balanced loss as described in `"Class-Balanced Loss Based on Effective Number
r"""Implementation of the class-balanced loss as described in `"Class-Balanced Loss Based on Effective Number
of Samples" <https://arxiv.org/pdf/1901.05555.pdf>`_.
- Given a loss function :math:`\\mathcal{L}`, the class-balanced loss is described by:
+ Given a loss function :math:`\mathcal{L}`, the class-balanced loss is described by:
.. math::
- CB(p, y) = \\frac{1 - \\beta}{1 - \\beta^{n_y}} \\mathcal{L}(p, y)
+ CB(p, y) = \frac{1 - \beta}{1 - \beta^{n_y}} \mathcal{L}(p, y)
where :math:`p` is the predicted probability for class :math:`y`, :math:`n_y` is the number of training
- samples for class :math:`y`, and :math:`\\beta` is exponential factor.
+ samples for class :math:`y`, and :math:`\beta` is exponential factor.
Args:
criterion (torch.nn.Module): loss module
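To tie the two formulas above together, here is a compact sketch of a focal term built on top of cross-entropy, and of the class-balanced rescaling factor. Reduction handling, `ignore_index` and the modules' real options are omitted; the default values are assumptions.

```python
from typing import Optional
import torch
import torch.nn.functional as F
from torch import Tensor

def focal_loss_sketch(logits: Tensor, target: Tensor, gamma: float = 2.0,
                      alpha: Optional[Tensor] = None) -> Tensor:
    # FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t), with p_t the probability
    # assigned to the true class.
    logpt = F.log_softmax(logits, dim=1).gather(1, target.unsqueeze(1)).squeeze(1)
    pt = logpt.exp()
    loss = -((1 - pt) ** gamma) * logpt
    if alpha is not None:
        loss = alpha[target] * loss  # per-class weighting alpha_t
    return loss.mean()

def class_balanced_factor_sketch(num_samples: Tensor, beta: float = 0.99) -> Tensor:
    # CB rescaling: (1 - beta) / (1 - beta ** n_y), one factor per class,
    # with num_samples the per-class training sample counts.
    return (1 - beta) / (1 - beta ** num_samples.float())
```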