
Fix l2_normalize & add nn.functional.normalize #6940

Merged
merged 16 commits on Dec 25, 2021
Changes from 13 commits
2 changes: 2 additions & 0 deletions docs/source/functional.rst
@@ -13,6 +13,7 @@ Functional operations for neural networks
.. autofunction:: hardsigmoid
.. autofunction:: hardswish
.. autofunction:: hardtanh
.. autofunction:: normalize
.. autofunction:: l2_normalize
.. autofunction:: leaky_relu
.. autofunction:: elu
@@ -22,6 +23,7 @@ Functional operations for neural networks
.. autofunction:: pad
.. autofunction:: prelu
.. autofunction:: logsigmoid
.. autofunction:: log_softmax
.. autofunction:: gelu
.. autofunction:: glu
.. autofunction:: softsign
6 changes: 5 additions & 1 deletion oneflow/core/functional/functional_api.yaml
@@ -1716,8 +1716,12 @@
signature: "TensorTuple (Tensor x, TensorTuple like, Int64 axis) => SplitLike"
bind_python: True

- name: "normalize"
signature: "Tensor (Tensor input, Float p=2.0, Int32 dim=1, Float eps=1e-12) => Normalize"
bind_python: True

- name: "l2_normalize"
signature: "TensorTuple (Tensor input, Int32 axis, Float epsilon) => L2Normalize"
signature: "Tensor (Tensor input, Int32 axis=0, Float epsilon=1e-12) => L2Normalize"
bind_python: True

- name: "l2_normalize_grad"
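For reference, a minimal sketch of how the two bindings declared above are called from Python once the flow.nn.functional exports added later in this PR are in place (values are illustrative):

import oneflow as flow

x = flow.tensor([[1.0, 2.0], [3.0, 4.0]])

# Normalize: Lp norm along `dim`, clamped from below by `eps`.
y_norm = flow.nn.functional.normalize(x, 2.0, 1, 1e-12)

# L2Normalize: L2 norm along `axis`; `epsilon` guards against division by zero.
y_l2 = flow.nn.functional.l2_normalize(x, 1, 1e-12)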
35 changes: 31 additions & 4 deletions oneflow/core/functional/impl/nn_functor.cpp
@@ -1660,18 +1660,44 @@ class L2NormalizeFunctor {
op_ = CHECK_JUST(
one::OpBuilder("l2_normalize").Input("x").Output("y").Output("square_x_sum").Build());
}
Maybe<TensorTuple> operator()(const std::shared_ptr<one::Tensor>& input, const int32_t& axis,
const float& epsilon) const {
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, const int32_t& axis,
const float& epsilon) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<int32_t>("axis", axis));
JUST(attrs.SetAttr<int32_t>("axis", 0));
JUST(attrs.SetAttr<float>("epsilon", epsilon));
return OpInterpUtil::Dispatch<TensorTuple>(*op_, {input}, attrs);

if (axis != 0) {
std::vector<int> input_perm(input->shape()->dim_vec().size(), 0);
for (size_t i = 0; i < input_perm.size(); ++i) { input_perm[i] = static_cast<int>(i); }
std::swap(input_perm[0], input_perm[static_cast<size_t>(axis)]);

const auto result = JUST(OpInterpUtil::Dispatch<TensorTuple>(
*op_, {JUST(functional::Transpose(input, input_perm))}, attrs));
return functional::Transpose(result->at(0), input_perm);
}

return OpInterpUtil::Dispatch<Tensor>(*op_, {input}, attrs);
}

private:
std::shared_ptr<OpExpr> op_;
};

class NormalizeFunctor {
public:
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, const float& p,
const int32_t& dim, const float& eps) const {
return SequenceFunction<Maybe<Tensor>(const std::shared_ptr<Tensor>&, const float&,
const int32_t&)>(
[](const auto& x, const float& p, const int32_t& dim) -> Maybe<Tensor> {
return functional::ScalarNorm(x, p, dim, true, NullOpt);
})
.then([&](const auto& x) { return functional::Clamp(x, eps, NullOpt); })
.then([&](const auto& x) { return functional::Div(input, x); })
.call(input, p, dim);
}
};

class FusedSelfAttentionFunctor {
public:
FusedSelfAttentionFunctor() {
@@ -2155,6 +2181,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
m.add_functor<impl::OneHotFunctor>("OneHot");
m.add_functor<impl::FusedSelfAttentionFunctor>("FusedSelfAttention");
m.add_functor<impl::FusedSelfAttentionGradFunctor>("FusedSelfAttentionGrad");
m.add_functor<impl::NormalizeFunctor>("Normalize");
m.add_functor<impl::L2NormalizeFunctor>("L2Normalize");
m.add_functor<impl::L2NormalizeGradFunctor>("L2NormalizeGrad");
m.add_functor<impl::FusedBiasAddGeluFunctor>("FusedBiasAddGelu");
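To make the two functors above easier to follow, here is a rough NumPy sketch of the computation they perform. The helper names are invented for illustration, and the epsilon handling follows the docstrings added below (maximum of the norm and epsilon):

import numpy as np

def normalize_ref(x, p=2.0, dim=1, eps=1e-12):
    # NormalizeFunctor pipeline: ScalarNorm (keepdim) -> Clamp(eps, None) -> Div.
    norm = np.linalg.norm(x, ord=p, axis=dim, keepdims=True)
    return x / np.clip(norm, eps, None)

def l2_normalize_ref(x, axis=0, epsilon=1e-12):
    # L2NormalizeFunctor: the kernel reduces over axis 0 only, so the functor
    # swaps `axis` to the front, dispatches the op, and swaps the result back.
    perm = list(range(x.ndim))
    perm[0], perm[axis] = perm[axis], perm[0]
    xt = np.transpose(x, perm)
    square_x_sum = np.sum(xt * xt, axis=0, keepdims=True)
    yt = xt / np.maximum(np.sqrt(square_x_sum), epsilon)
    return np.transpose(yt, perm)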
78 changes: 78 additions & 0 deletions python/oneflow/framework/docstr/norm.py
@@ -263,3 +263,81 @@

""",
)

add_docstr(
oneflow._C.normalize,
Contributor (review comment):
Suggested change:
-oneflow._C.normalize,
+oneflow.normalize,

Contributor Author:
So does this function need to be exported under the oneflow namespace?

Contributor:
Sure, it can be exported.

Contributor Author:
I checked the PyTorch API: neither l2_normalize nor normalize is exported under torch; both live only under torch.nn.functional. Do we still need to export them?

"""nn.functional.normalize(input: Tensor, p: float=2.0, dim: int=0, epsilon: float=1e-12) -> Tensor

Performs :math:`L_p` normalization of the input over the specified dimension.

For a tensor :attr:`input` of sizes :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
:math:`n_{dim}` -element vector :math:`v` along dimension :attr:`dim` is transformed as:

.. math::
v = \\frac{v}{\\max(\\lVert v \\rVert_p, \\epsilon)}.

With the default arguments it uses the Euclidean norm over vectors along dimension :math:`1` for normalization.

Note, however, that when `input.shape[dim] == 1`, the gradient of the input tensor may differ across frameworks.

Args:
input (oneflow.Tensor): input tensor of any shape
p (float): the exponent value in the norm formulation. Default: 2
dim (int): the dimension to reduce. Default: 1
eps (float): small value to avoid division by zero. Default: 1e-12

For example:

.. code-block:: python

>>> import oneflow as flow
>>> x = flow.tensor([[1, 2], [3, 4]], dtype=flow.float32)
>>> out = flow.nn.functional.normalize(x, 2, 0)
>>> out
tensor([[0.3162, 0.4472],
[0.9487, 0.8944]], dtype=oneflow.float32)
>>> out = flow.nn.functional.normalize(x, 2, 1)
>>> out
tensor([[0.4472, 0.8944],
[0.6000, 0.8000]], dtype=oneflow.float32)

""",
)

add_docstr(
oneflow._C.l2_normalize,
"""nn.functional.l2_normalize(input: Tensor, dim: int=0, epsilon: float=1e-12) -> Tensor

Uses the L2 norm to normalize the input along dimension `dim`.

The equation is:

.. math::
out = \\frac{x}{\\max(\\sqrt{\\sum{x^2}}, \\epsilon)}

Args:
input (oneflow.Tensor): Input Tensor
dim (int): The axis on which to apply L2 normalization. Defaults to 0.
epsilon (float): Small value used to avoid division by zero. Defaults to 1e-12.

Returns:
oneflow.Tensor: The normalized Tensor

For example:

.. code-block:: python

>>> import oneflow as flow
>>> x = flow.tensor([[1, 2], [3, 4]], dtype=flow.float32)
>>> out = flow.nn.functional.l2_normalize(x, 0)
>>> out
tensor([[0.3162, 0.4472],
[0.9487, 0.8944]], dtype=oneflow.float32)
>>> out = flow.nn.functional.l2_normalize(x, 1)
>>> out
tensor([[0.4472, 0.8944],
[0.6000, 0.8000]], dtype=oneflow.float32)

""",
)
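Since normalize with p=2 and l2_normalize implement the same formula, they can be cross-checked; a quick sketch, assuming the defaults above and inputs whose norm is well above epsilon:

import numpy as np
import oneflow as flow

x = flow.tensor(np.random.randn(3, 4), dtype=flow.float32)
a = flow.nn.functional.normalize(x, 2.0, 1)
b = flow.nn.functional.l2_normalize(x, 1)
print(np.allclose(a.numpy(), b.numpy(), atol=1e-6))  # expected: True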
4 changes: 3 additions & 1 deletion python/oneflow/nn/functional/__init__.py
@@ -14,7 +14,6 @@
limitations under the License.
"""
from oneflow.nn.modules.interpolate import interpolate
from oneflow.nn.modules.norm import l2_normalize
from oneflow.nn.modules.affine_grid import affine_grid
from oneflow.nn.modules.grid_sample import grid_sample
from oneflow.nn.modules.sparse_softmax_cross_entropy import sparse_softmax_cross_entropy
@@ -43,6 +42,7 @@
from oneflow._C import gelu
from oneflow._C import glu
from oneflow._C import logsigmoid
from oneflow._C import log_softmax
from oneflow._C import softsign
from oneflow._C import softmax
from oneflow._C import softplus
@@ -57,6 +57,8 @@
from oneflow._C import triplet_margin_loss
from oneflow._C import ctc_greedy_decoder
from oneflow._C import one_hot
from oneflow._C import l2_normalize
from oneflow._C import normalize
from oneflow.nn.modules.sparse import embedding
from oneflow.nn.modules.linear import linear
from oneflow.nn.modules.activation import relu6
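With these exports in place, both functions are importable directly from oneflow.nn.functional, for example:

from oneflow.nn.functional import l2_normalize, normalize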
60 changes: 0 additions & 60 deletions python/oneflow/nn/modules/norm.py

This file was deleted.

@@ -16,10 +16,9 @@

import unittest
from collections import OrderedDict

import numpy as np
from test_util import GenArgList

from oneflow.test_utils.automated_test_util import *
import numpy as np
import oneflow as flow
import oneflow.unittest

@@ -32,12 +31,18 @@ def _count(shape, begin_axis, end_axis):


def _l2_norm_numpy(x, dim, epsilon=1e-12):
axes = [k for k in range(len(list(x.shape)))]
axes[0], axes[dim] = axes[dim], axes[0]
axes_tuple = tuple(axes)

x = np.transpose(x, axes_tuple)

square_x_sum_shape = list(x.shape)
square_x_sum_shape[dim] = 1
square_x_sum_shape[0] = 1

c = x.shape[dim]
c = x.shape[0]
n = int(x.size / c)
d = _count(x.shape, dim + 1, len(x.shape))
d = _count(x.shape, 1, len(x.shape))

square_x_sum = np.zeros(square_x_sum_shape)

@@ -58,13 +63,21 @@ def _l2_norm_numpy(x, dim, epsilon=1e-12):

square_x_sum = square_x_sum_flatten.reshape(square_x_sum.shape)
out = out.reshape(x.shape)
return out, square_x_sum
return np.transpose(out, axes_tuple), np.transpose(square_x_sum, axes_tuple)


def _l2_norm_backward_np(dy, y, square_x_sum, dim, epsilon=1e-12):
c = dy.shape[dim]
axes = [k for k in range(len(list(y.shape)))]
axes[0], axes[dim] = axes[dim], axes[0]
axes_tuple = tuple(axes)

dy = np.transpose(dy, axes_tuple)
y = np.transpose(y, axes_tuple)
square_x_sum = np.transpose(square_x_sum, axes_tuple)

c = dy.shape[0]
n = int(dy.size / c)
d = _count(dy.shape, dim + 1, len(y.shape))
d = _count(dy.shape, 1, len(y.shape))

dx = np.zeros(dy.shape).reshape(-1)
dy_flatten = dy.reshape(-1)
@@ -89,7 +102,7 @@ def _l2_norm_backward_np(dy, y, square_x_sum, dim, epsilon=1e-12):
index = offset + j * d
dx[index] = (1 / norm) * dy_flatten[index]

return dx.reshape(y.shape)
return np.transpose(dx.reshape(y.shape), axes_tuple)


def _test_l2_normalize(test_case, device, dim, shape):
@@ -124,5 +137,24 @@ def test_l2_normalize(test_case):
arg[0](test_case, *arg[1:])


@flow.unittest.skip_unless_1n1d()
class TestFunctionalNormalize(flow.unittest.TestCase):
@autotest(check_graph=False)
def test_functional_normalize(test_case):
device = random_device()
ndim = random(low=2)

shape = list(random_tensor(ndim).value().shape)
dim = random(low=0, high=ndim).to(int).value()
shape[dim] = random(low=2, high=8).to(int).value()
shape = tuple(shape)

x = random_pytorch_tensor(len(shape), *shape).to(device)
m = torch.nn.functional.normalize
y = m(x, oneof(2, 3, 4), dim, 1e-12)
Contributor (review comment):
Suggested change:
-y = m(x, oneof(2, 3, 4), dim, 1e-12)
+y = torch.nn.functional.normalize(x, oneof(2, 3, 4), dim, 1e-12)

Contributor Author:
OK.

return y


if __name__ == "__main__":
unittest.main()
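The body of _test_l2_normalize is collapsed in this view; a minimal sketch of the kind of check it performs against the NumPy reference above (the helper name, tensor setup, and tolerances are assumptions, not the PR's actual code):

def _check_l2_normalize_against_numpy(test_case, device="cpu", dim=1, shape=(4, 5)):
    x = np.random.randn(*shape).astype(np.float32)
    ref_out, _ = _l2_norm_numpy(x, dim)
    y = flow.nn.functional.l2_normalize(flow.tensor(x, device=device), dim)
    test_case.assertTrue(np.allclose(y.numpy(), ref_out, rtol=1e-4, atol=1e-4))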