diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index fbe524376e592..04fb45cd3ae22 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -286,18 +286,18 @@ class ClipGradByNorm(ClipGradBase): .. math:: Out = - \\left \{ - \\begin{aligned} - & X & & if (norm(X) \\leq clip\_norm) \\\\ - & \\frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\\\ - \\end{aligned} - \\right. + \left\{ + \begin{array}{ccl} + X & & if (norm(X) \leq clip\_norm) \\ + \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ + \end{array} + \right. where :math:`norm(X)` represents the L2 norm of :math:`X`. .. math:: - norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}} + norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} Note: ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. @@ -389,7 +389,7 @@ class ClipGradByGlobalNorm(ClipGradBase): .. math:: - t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)} + t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} where: diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index de722e6e16c89..608e85acec3f2 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1151,9 +1151,6 @@ def forward(self, input): class BatchNorm(layers.Layer): r""" - :alias_main: paddle.nn.BatchNorm - :alias: paddle.nn.BatchNorm,paddle.nn.layer.BatchNorm,paddle.nn.layer.norm.BatchNorm - :old_api: paddle.fluid.dygraph.BatchNorm This interface is used to construct a callable object of the ``BatchNorm`` class. For more details, refer to code examples. @@ -1164,16 +1161,16 @@ class BatchNorm(layers.Layer): Internal Covariate Shift `_ for more details. - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & + //\ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & + //\ mini-batch\ variance \\ - :math:`x` : mini-batch data - :math:`m` : the size of the mini-batch data @@ -1191,13 +1188,14 @@ class BatchNorm(layers.Layer): .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift + - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_channels(int): Indicate the number of channels of the input ``Tensor``. @@ -3011,9 +3009,9 @@ class SpectralNorm(layers.Layer): .. 
math:: - \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} + \mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} - \mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} + \mathbf{u} := \frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} Step 3: Calculate :math:`\sigma(\mathbf{W})` and normalize weight values. @@ -3022,7 +3020,7 @@ class SpectralNorm(layers.Layer): \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v} - \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})} + \mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})} Refer to `Spectral Normalization `_ . diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index f0847c85237b2..88a52268776fc 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -33,7 +33,7 @@ class KaimingNormal(MSRAInitializer): .. math:: - \sqrt{\\frac{2.0}{fan\_in}} + \sqrt{\frac{2.0}{fan\_in}} Args: fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\ @@ -75,7 +75,7 @@ class KaimingUniform(MSRAInitializer): .. math:: - x = \sqrt{\\frac{6.0}{fan\_in}} + x = \sqrt{\frac{6.0}{fan\_in}} Args: fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index f2d5593032f64..aff3a2c15aeec 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -28,7 +28,7 @@ class XavierNormal(XavierInitializer): .. math:: - \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + \sqrt{\frac{2.0}{fan\_in + fan\_out}} Args: @@ -83,7 +83,7 @@ class XavierUniform(XavierInitializer): .. math:: - x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + x = \sqrt{\frac{6.0}{fan\_in + fan\_out}} Args: fan_in (float, optional): fan_in for Xavier initialization, it is diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 695e387bda84f..abfeff0641a47 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -31,7 +31,7 @@ class ELU(Layer): .. math:: - ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) + ELU(x) = max(0, x) + min(0, \alpha * (e^{x}-1)) Parameters: alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. @@ -75,13 +75,13 @@ class GELU(Layer): .. math:: - GELU(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) + GELU(x) = 0.5 * x * (1 + tanh(\sqrt{\frac{2}{\pi}} * (x + 0.044715x^{3}))) else .. math:: - GELU(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) + GELU(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}})) Parameters: approximate (bool, optional): Wether to enable approximation. Default is False. @@ -127,13 +127,13 @@ class Hardshrink(Layer): .. math:: hardshrink(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x > threshold \\\\ - &x, & & if \\ x < -threshold \\\\ - &0, & & if \\ others - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + x, & & if \ x > threshold \\ + x, & & if \ x < -threshold \\ + 0, & & if \ others + \end{array} + \right. Parameters: threshold (float, optional): The value of threshold for hardthrink. Default is 0.5 @@ -179,13 +179,14 @@ class Hardswish(Layer): .. math:: Hardswish(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &x, & & \\text{if } x \\geq 3 \\\\ - &\\frac{x(x+3)}{6}, & & \\text{otherwise} - \\end{aligned} - \\right. 
+ \left\{ + \begin{array}{cll} + 0 &, & \text{if } x \leq -3 \\ + x &, & \text{if } x \geq 3 \\ + \frac{x(x+3)}{6} &, & \text{otherwise} + \end{array} + \right. + Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -223,7 +224,7 @@ class Tanh(Layer): Tanh Activation. .. math:: - Tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} + Tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -265,11 +266,15 @@ class Hardtanh(Layer): .. math:: - Hardtanh(x)= \\begin{cases} - max, \\text{if } x > max \\\\ - min, \\text{if } x < min \\\\ - x, \\text{otherwise} - \\end{cases} + Hardtanh(x)= + \left\{ + \begin{array}{cll} + max,& & \text{if } x > max \\ + min,& & \text{if } x < min \\ + x,& & \text{otherwise} + \end{array} + \right. + Parameters: min (float, optional): The value of min for Hardtanh. Default is -1. @@ -461,10 +466,12 @@ class SELU(Layer): .. math:: SELU(x)= scale * - \\begin{cases} - x, \\text{if } x > 0 \\\\ - alpha * e^{x} - alpha, \\text{if } x <= 0 - \\end{cases} + \left\{ + \begin{array}{lcl} + x,& &\text{if } \ x > 0 \\ + alpha * e^{x} - alpha,& &\text{if } \ x <= 0 + \end{array} + \right. Parameters: scale (float, optional): The value of scale(must be greater than 1.0) for SELU. Default is 1.0507009873554804934193349852946 @@ -512,12 +519,13 @@ class LeakyReLU(Layer): .. math:: LeakyReLU(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x >= 0 \\\\ - &negative\_slope * x, & & otherwise \\\\ - \\end{aligned} - \\right. \\\\ + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + negative\_slope * x, & & otherwise \\ + \end{array} + \right. + Parameters: negative_slope (float, optional): Slope of the activation function at @@ -604,13 +612,14 @@ class Hardsigmoid(Layer): .. math:: Hardsigmoid(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &1, & & \\text{if } x \\geq 3 \\\\ - &x/6 + 1/2, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + 0, & & \text{if } \ x \leq -3 \\ + 1, & & \text{if } \ x \geq 3 \\ + x/6 + 1/2, & & \text{otherwise} + \end{array} + \right. + Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -650,8 +659,8 @@ class Softplus(Layer): .. math:: - Softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ - \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + Softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ + \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} Parameters: beta (float, optional): The value of beta for Softplus. Default is 1 @@ -695,11 +704,15 @@ class Softshrink(Layer): .. math:: - Softshrink(x)= \\begin{cases} - x - threshold, \\text{if } x > threshold \\\\ - x + threshold, \\text{if } x < -threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + Softshrink(x)= + \left\{ + \begin{array}{rcl} + x - threshold,& & \text{if } x > threshold \\ + x + threshold,& & \text{if } x < -threshold \\ + 0,& & \text{otherwise} + \end{array} + \right. + Parameters: threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5 @@ -740,7 +753,7 @@ class Softsign(Layer): .. 
math:: - Softsign(x) = \\frac{x}{1 + |x|} + Softsign(x) = \frac{x}{1 + |x|} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -779,7 +792,7 @@ class Swish(Layer): .. math:: - Swish(x) = \\frac{x}{1 + e^{-x}} + Swish(x) = \frac{x}{1 + e^{-x}} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -857,10 +870,14 @@ class ThresholdedReLU(Layer): .. math:: - ThresholdedReLU(x) = \\begin{cases} - x, \\text{if } x > threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + ThresholdedReLU(x) = + \left\{ + \begin{array}{rl} + x,& \text{if } \ x > threshold \\ + 0,& \text{otherwise} + \end{array} + \right. + Parameters: threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0 @@ -939,7 +956,7 @@ class LogSigmoid(Layer): .. math:: - LogSigmoid(x) = log \\frac{1}{1 + e^{-x}} + LogSigmoid(x) = log \frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, or float64. @@ -1001,7 +1018,7 @@ class Softmax(Layer): .. math:: - Softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} + Softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])} Example: @@ -1105,10 +1122,10 @@ class LogSoftmax(Layer): .. math:: - \\begin{aligned} - Out[i, j] &= log(softmax(x)) \\\\ - &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) - \\end{aligned} + \begin{array} {rcl} + Out[i, j] &= &log(softmax(x)) \\ + &= &log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])}) + \end{array} Parameters: axis (int, optional): The axis along which to perform log_softmax @@ -1167,12 +1184,14 @@ class Maxout(Layer): .. math:: - &out_{si+j} = \max_{k} x_{gsi + sk + j} \\\\ - &g = groups \\\\ - &s = \\frac{input.size}{num\\_channels} \\\\ - &0 \\le i < \\frac{num\\_channels}{groups} \\\\ - &0 \\le j < s \\\\ - &0 \\le k < groups + \begin{array}{l} + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\ + &g = groups \\ + &s = \frac{input.size}{num\_channels} \\ + &0 \le i < \frac{num\_channels}{groups} \\ + &0 \le j < s \\ + &0 \le k < groups + \end{array} Parameters: groups (int, optional): The groups number of maxout. `groups` specifies the diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 31b552bed162c..3ac0d675fb72c 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -40,18 +40,18 @@ class BCEWithLogitsLoss(Layer): First this operator calculate loss function as follows: .. math:: - Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get: + We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get: .. math:: - Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit}) + Out = Logit - Logit * Labels + \log(1 + e^{-Logit}) - For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0, + For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: .. math:: - Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|}) + Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the weight tensor on the loss `Out`. 
The ``weight`` tensor will attach different @@ -779,8 +779,6 @@ def forward(self, input, label): class NLLLoss(Layer): r""" - :alias_main: paddle.nn.NLLLoss - :alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss This class accepts input and target label and returns negative log likelihood cross error. It is useful to train a classification problem with C classes. @@ -800,20 +798,25 @@ class NLLLoss(Layer): The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: .. math:: - \ell(x, y) = L = \{l_1,\dots,l_N\}^\\top, \quad + + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = - w_{y_n} x_{n,y_n}, \quad - w_{c} = \\text{weight}[c] \cdot \mathbb{1}\{c \\not= \\text{ignore\\_index}\}, + w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\}, where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` (default ``'mean'``), then .. math:: - \ell(x, y) = \\begin{cases} - \\sum_{n=1}^N \\frac{1}{\\sum_{n=1}^N w_{y_n}} l_n, & - \\text{if reduction} = \\text{'mean';}\\\\ - \\sum_{n=1}^N l_n, & - \\text{if reduction} = \\text{'sum'.} - \\end{cases} + + \ell(x, y) = + \left\{ + \begin{array}{lcl} + \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & + \text{if reduction} = \text{'mean';}\\ + \sum_{n=1}^N l_n, & + \text{if reduction} = \text{'sum'.} + \end{array} + \right. Parameters: weight (Tensor, optional): Weight tensor, a manual rescaling weight given @@ -1136,16 +1139,16 @@ class SmoothL1Loss(Layer): .. math:: - loss(x,y) = \\frac{1}{n}\\sum_{i}z_i + loss(x,y) = \frac{1}{n}\sum_{i}z_i where z_i is given by: .. math:: - \\mathop{z_i} = \\left\\{\\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + \mathop{z_i} = \left\{\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \\end{array} \\right. + \end{array} \right. Parameters: reduction (str, optional): Indicate how to average the loss by batch_size, diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 9abbc49425894..41599809810ee 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -115,13 +115,13 @@ class InstanceNorm1D(_InstanceNormBase): .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -187,13 +187,13 @@ class InstanceNorm2D(_InstanceNormBase): .. 
math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -257,13 +257,13 @@ class InstanceNorm3D(_InstanceNormBase): .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -450,15 +450,15 @@ class LayerNorm(Layer): .. math:: - \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i + \mu & = \frac{1}{H}\sum_{i=1}^{H} x_i - \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon} + \sigma & = \sqrt{\frac{1}{H}\sum_{i=1}^{H}{(x_i - \mu)^2} + \epsilon} - y & = f(\\frac{g}{\\sigma}(x - \\mu) + b) + y & = f(\frac{g}{\sigma}(x - \mu) + b) - :math:`x`: the vector representation of the summed inputs to the neurons in that layer. - :math:`H`: the number of hidden units in a layers - - :math:`\\epsilon`: the small value added to the variance to prevent division by zero. + - :math:`\epsilon`: the small value added to the variance to prevent division by zero. - :math:`g`: the trainable scale parameter. - :math:`b`: the trainable bias parameter. @@ -666,37 +666,36 @@ class BatchNorm1D(_BatchNormBase): r""" Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. 
math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + When use_global_stats = True, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch. They are global or running statistics (moving_mean and moving_variance). It usually got from the pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -770,37 +769,36 @@ class BatchNorm2D(_BatchNormBase): r""" Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &// + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + When use_global_stats = True, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch. They are global or running statistics (moving_mean and moving_variance). 
It usually got from the pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -859,16 +857,16 @@ class BatchNorm3D(_BatchNormBase): r""" Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ When use_global_stats = True, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. @@ -876,20 +874,19 @@ class BatchNorm3D(_BatchNormBase): pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. 
math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -976,33 +973,33 @@ class SyncBatchNorm(_BatchNormBase): .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - :math:`x` : whole mini-batch data in all gpus - :math:`m` : the size of the whole mini-batch data When model in evaluation mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, + and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, which usually got from the pre-trained model). Global statistics calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The formula of normalization is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\eps` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable scale parameter vector - - :math:`\\beta` : trainable shift parameter vector + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable scale parameter vector + - :math:`\beta` : trainable shift parameter vector Note: If you want to use container to pack your model and has ``SyncBatchNorm`` in the
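
The BatchNorm*, InstanceNorm* and SyncBatchNorm docstrings above all share the same normalize / scale-and-shift math. As a quick sanity check of those formulas (not Paddle's implementation), here is a minimal NumPy sketch assuming a 2D input of shape (m, num_features); the function name `batch_norm_train` is chosen here for illustration only, and moving_mean / moving_variance bookkeeping is omitted.

import numpy as np

def batch_norm_train(x, gamma, beta, eps=1e-5):
    mu = x.mean(axis=0)                    # mini-batch mean
    var = x.var(axis=0)                    # biased (1/m) mini-batch variance
    x_hat = (x - mu) / np.sqrt(var + eps)  # normalize
    return gamma * x_hat + beta            # scale and shift

x = np.random.randn(8, 4).astype("float32")
y = batch_norm_train(x,
                     gamma=np.ones(4, dtype="float32"),
                     beta=np.zeros(4, dtype="float32"))
print(y.mean(axis=0), y.std(axis=0))       # roughly 0 and 1 per feature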
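The clip.py hunks reformat the ClipGradByNorm and ClipGradByGlobalNorm formulas; the sketch below restates them in plain NumPy so the two clipping rules can be compared directly. The helper names `clip_by_norm` and `clip_by_global_norm` are mine, not Paddle API.

import numpy as np

def clip_by_norm(x, clip_norm):
    # Out = X                       if norm(X) <= clip_norm
    # Out = clip_norm * X / norm(X) if norm(X) >  clip_norm
    norm = np.sqrt(np.sum(np.square(x)))
    if norm <= clip_norm:
        return x
    return clip_norm * x / norm

def clip_by_global_norm(t_list, clip_norm):
    # global_norm is the L2 norm over all tensors taken together;
    # each tensor is scaled by clip_norm / max(global_norm, clip_norm).
    global_norm = np.sqrt(sum(np.sum(np.square(t)) for t in t_list))
    scale = clip_norm / max(global_norm, clip_norm)
    return [t * scale for t in t_list]

grads = [np.ones((3, 3)), np.full((2,), 10.0)]
print(clip_by_norm(grads[1], clip_norm=1.0))
print([np.linalg.norm(g) for g in clip_by_global_norm(grads, clip_norm=1.0)])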
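Similarly, a few of the activation formulas touched in activation.py (ELU, the tanh-approximate GELU, Hardswish) can be written out as reference NumPy functions. These are sketches of the documented math only, not the Paddle kernels, and they skip numerical-stability handling (e.g. exp overflow in ELU for large inputs).

import numpy as np

def elu(x, alpha=1.0):
    # ELU(x) = max(0, x) + min(0, alpha * (e^x - 1))
    return np.maximum(0.0, x) + np.minimum(0.0, alpha * (np.exp(x) - 1.0))

def gelu_tanh(x):
    # approximate GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

def hardswish(x):
    # 0 if x <= -3, x if x >= 3, x * (x + 3) / 6 otherwise
    return np.where(x <= -3.0, 0.0, np.where(x >= 3.0, x, x * (x + 3.0) / 6.0))

x = np.linspace(-4.0, 4.0, 9)
print(elu(x), gelu_tanh(x), hardswish(x), sep="\n")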
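Finally, the BCEWithLogitsLoss docstring quotes the numerically stable form Out = max(Logit, 0) - Logit * Labels + log(1 + e^{-|Logit|}); a one-line NumPy version (unreduced, per-element, and again not Paddle's operator) shows why it stays finite for large-magnitude logits.

import numpy as np

def bce_with_logits(logit, label):
    # stable rewrite of -label*log(sigmoid(logit)) - (1-label)*log(1-sigmoid(logit))
    return np.maximum(logit, 0.0) - logit * label + np.log1p(np.exp(-np.abs(logit)))

logit = np.array([-50.0, -1.0, 0.0, 2.0, 50.0])
label = np.array([0.0, 0.0, 1.0, 1.0, 1.0])
print(bce_with_logits(logit, label))   # finite even for |logit| = 50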