diff --git a/docs/source/functional.rst b/docs/source/functional.rst
index 23b21728d8b..e187893a174 100644
--- a/docs/source/functional.rst
+++ b/docs/source/functional.rst
@@ -14,7 +14,6 @@ Functional operations for neural networks
 .. autofunction:: hardswish
 .. autofunction:: hardtanh
 .. autofunction:: normalize
-.. autofunction:: l2_normalize
 .. autofunction:: leaky_relu
 .. autofunction:: elu
 .. autofunction:: celu
diff --git a/oneflow/core/functional/functional_api.yaml b/oneflow/core/functional/functional_api.yaml
index 382373be87e..cba2728c763 100755
--- a/oneflow/core/functional/functional_api.yaml
+++ b/oneflow/core/functional/functional_api.yaml
@@ -1762,12 +1762,12 @@
   bind_python: True

 - name: "normalize"
-  signature: "Tensor (Tensor input, Float p=2.0, Int32 dim=1, Float eps=1e-12) => Normalize"
+  signature: "Tensor (Tensor input, Float p=2.0, Int32 dim=1, Float eps=1e-12, Bool use_l2_norm_kernel=True) => Normalize"
   bind_python: True

 - name: "l2_normalize"
   signature: "Tensor (Tensor input, Int32 axis=0, Float epsilon=1e-12) => L2Normalize"
-  bind_python: True
+  bind_python: False

 - name: "l2_normalize_grad"
   signature: "Tensor (Tensor dy, Tensor y, Tensor square_x_sum, Int32 axis, Float epsilon) => L2NormalizeGrad"
diff --git a/oneflow/core/functional/impl/nn_functor.cpp b/oneflow/core/functional/impl/nn_functor.cpp
index 33a2b3eaaa0..2a1f355bc11 100644
--- a/oneflow/core/functional/impl/nn_functor.cpp
+++ b/oneflow/core/functional/impl/nn_functor.cpp
@@ -1662,21 +1662,27 @@
   }
   Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, const int32_t& axis,
                            const float& epsilon) const {
+    const auto ndims = input->shape()->NumAxes();
+    const auto final_dim = ndims - 1;
+
+    auto axis_ = axis >= 0 ? axis : axis + ndims;
+    CHECK_GE_OR_RETURN(axis_, 0) << "Axis should >=0 but axis is " << axis_ << " now.";
+    CHECK_LE_OR_RETURN(axis_, final_dim)
+        << "Axis should <" << ndims << " but axis is " << axis_ << " now.";
+
     MutableAttrMap attrs;
-    JUST(attrs.SetAttr<int32_t>("axis", 0));
     JUST(attrs.SetAttr<float>("epsilon", epsilon));
+    JUST(attrs.SetAttr<int32_t>("axis", final_dim));

-    if (axis != 0) {
-      std::vector<int32_t> input_perm(input->shape()->dim_vec().size(), 0);
-      for (size_t i = 0; i < input_perm.size(); ++i) { input_perm[i] = static_cast<int32_t>(i); }
-      std::swap(input_perm[0], input_perm[static_cast<size_t>(axis)]);
+    if (axis_ == final_dim) { return OpInterpUtil::Dispatch<Tensor>(*op_, {input}, attrs); }

-      const auto result = JUST(OpInterpUtil::Dispatch<TensorTuple>(
-          *op_, {JUST(functional::Transpose(input, input_perm))}, attrs));
-      return functional::Transpose(result->at(0), input_perm);
-    }
+    std::vector<int32_t> input_perm(input->shape()->dim_vec().size(), 0);
+    for (size_t i = 0; i < input_perm.size(); ++i) { input_perm[i] = static_cast<int32_t>(i); }
+    std::swap(input_perm[final_dim], input_perm[static_cast<size_t>(axis_)]);

-    return OpInterpUtil::Dispatch<Tensor>(*op_, {input}, attrs);
+    const auto result = JUST(OpInterpUtil::Dispatch<TensorTuple>(
+        *op_, {JUST(functional::Transpose(input, input_perm))}, attrs));
+    return functional::Transpose(result->at(0), input_perm);
   }

  private:
@@ -1686,7 +1692,11 @@
 class NormalizeFunctor {
  public:
   Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, const float& p,
-                           const int32_t& dim, const float& eps) const {
+                           const int32_t& dim, const float& eps,
+                           const bool& use_l2_norm_kernel) const {
+    if (use_l2_norm_kernel && (std::fabs(p - 2.0f) < std::numeric_limits<float>::min())) {
+      return functional::L2Normalize(input, dim, eps);
+    }
     return SequenceFunction<Maybe<Tensor>(const std::shared_ptr<one::Tensor>&, const float&,
                                           const int32_t&)>(
                [](const auto& x, const float& p, const int32_t& dim) -> Maybe<Tensor> {
diff --git a/python/oneflow/framework/docstr/norm.py b/python/oneflow/framework/docstr/norm.py
index a3a0b0048af..f7b7f64a2d4 100644
--- a/python/oneflow/framework/docstr/norm.py
+++ b/python/oneflow/framework/docstr/norm.py
@@ -304,40 +304,3 @@

     """,
 )
-
-add_docstr(
-    oneflow._C.l2_normalize,
-    """nn.functional.l2_normalize(input: Tensor, dim: int=0, epsilon: float=1e-12) -> Tensor
-
-    Use L2 norm to normalizes along dimension `dim`
-
-    The equation is:
-
-    .. math::
-        out = \\frac{x}{max(\\sqrt{\\Sigma{x^2}}, \\epsilon)}
-
-    Args:
-        input (oneflow.Tensor): Input Tensor
-        dim (int): The axis on which to apply L2 normalization. Defaults to 0.
-        epsilon (float): The epsilon value is used to avoid division by zero. Defaults to 1e-12.
-
-    Returns:
-        oneflow.Tensor: The normalized Tensor
-
-    For example:
-
-    .. code-block:: python
-
-        >>> import oneflow as flow
-        >>> x = flow.tensor([[1, 2], [3, 4]], dtype=flow.float32)
-        >>> out = flow.nn.functional.l2_normalize(x, 0)
-        >>> out
-        tensor([[0.3162, 0.4472],
-                [0.9487, 0.8944]], dtype=oneflow.float32)
-        >>> out = flow.nn.functional.l2_normalize(x, 1)
-        >>> out
-        tensor([[0.4472, 0.8944],
-                [0.6000, 0.8000]], dtype=oneflow.float32)
-
-    """,
-)
diff --git a/python/oneflow/nn/functional/__init__.py b/python/oneflow/nn/functional/__init__.py
index 4a1bd002699..472c4267cbb 100644
--- a/python/oneflow/nn/functional/__init__.py
+++ b/python/oneflow/nn/functional/__init__.py
@@ -57,7 +57,6 @@
 from oneflow._C import triplet_margin_loss
 from oneflow._C import ctc_greedy_decoder
 from oneflow._C import one_hot
-from oneflow._C import l2_normalize
 from oneflow._C import normalize
 from oneflow.nn.modules.sparse import embedding
 from oneflow.nn.modules.linear import linear
diff --git a/python/oneflow/test/modules/test_normalize.py b/python/oneflow/test/modules/test_normalize.py
index b981baf8044..f604858a117 100644
--- a/python/oneflow/test/modules/test_normalize.py
+++ b/python/oneflow/test/modules/test_normalize.py
@@ -15,131 +15,14 @@
 """
 import unittest
-from collections import OrderedDict
-from test_util import GenArgList
 from oneflow.test_utils.automated_test_util import *
-import numpy as np

 import oneflow as flow
 import oneflow.unittest


-def _count(shape, begin_axis, end_axis):
-    cnt = 1
-    for i in range(begin_axis, end_axis):
-        cnt *= shape[i]
-    return cnt
-
-
-def _l2_norm_numpy(x, dim, epsilon=1e-12):
-    axes = [k for k in range(len(list(x.shape)))]
-    axes[0], axes[dim] = axes[dim], axes[0]
-    axes_tuple = tuple(axes)
-
-    x = np.transpose(x, axes_tuple)
-
-    square_x_sum_shape = list(x.shape)
-    square_x_sum_shape[0] = 1
-
-    c = x.shape[0]
-    n = int(x.size / c)
-    d = _count(x.shape, 1, len(x.shape))
-
-    square_x_sum = np.zeros(square_x_sum_shape)
-
-    square_x_sum_flatten = square_x_sum.reshape(-1)
-    in_flatten = x.reshape(-1)
-    out = np.zeros(x.size)
-
-    for i in range(0, n):
-        offset = int(int((i / d)) * d * c + (i % d))
-        for j in range(0, c):
-            item = in_flatten[offset + j * d]
-            square_x_sum_flatten[i] = square_x_sum_flatten[i] + item * item
-
-        norm = np.sqrt(np.maximum(square_x_sum_flatten[i], epsilon))
-        for j in range(0, c):
-            index = offset + j * d
-            out[index] = in_flatten[index] / norm
-
-    square_x_sum = square_x_sum_flatten.reshape(square_x_sum.shape)
-    out = out.reshape(x.shape)
-    return np.transpose(out, axes_tuple), np.transpose(square_x_sum, axes_tuple)
-
-
-def _l2_norm_backward_np(dy, y, square_x_sum, dim, epsilon=1e-12):
-    axes = [k for k in range(len(list(y.shape)))]
-    axes[0], axes[dim] = axes[dim], axes[0]
-    axes_tuple = tuple(axes)
-
-    dy = np.transpose(dy, axes_tuple)
-    y = np.transpose(y, axes_tuple)
-    square_x_sum = np.transpose(square_x_sum, axes_tuple)
-
-    c = dy.shape[0]
-    n = int(dy.size / c)
-    d = _count(dy.shape, 1, len(y.shape))
-
-    dx = np.zeros(dy.shape).reshape(-1)
-    dy_flatten = dy.reshape(-1)
-    y_flatten = y.reshape(-1)
-    square_x_sum_flatten = square_x_sum.reshape(-1)
-
-    for i in range(0, n):
-        norm = np.sqrt(np.maximum(square_x_sum_flatten[i], epsilon))
-        offset = int(int(int((i / d)) * d * c) + (i % d))
-        if square_x_sum_flatten[i] >= epsilon:
-            y_dy_inner_prod = 0
-            for j in range(0, c):
-                index = offset + j * d
-                y_dy_inner_prod = y_dy_inner_prod + dy_flatten[index] * y_flatten[index]
-            for j in range(0, c):
-                index = offset + j * d
-                dx[index] = (1 / norm) * (
-                    dy_flatten[index] - y_dy_inner_prod * y_flatten[index]
-                )
-        else:
-            for j in range(0, c):
-                index = offset + j * d
-                dx[index] = (1 / norm) * dy_flatten[index]
-
-    return np.transpose(dx.reshape(y.shape), axes_tuple)
-
-
-def _test_l2_normalize(test_case, device, dim, shape):
-    input = np.random.randn(*shape)
-    np_out, square_x_sum = _l2_norm_numpy(input, dim)
-    of_input = flow.tensor(
-        input, dtype=flow.float32, requires_grad=True, device=flow.device(device)
-    )
-    of_out = flow.nn.functional.l2_normalize(of_input, dim)
-
-    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
-
-    z = of_out.sum()
-    z.backward()
-    dx = _l2_norm_backward_np(np.ones(np_out.shape), np_out, square_x_sum, dim)
-    test_case.assertTrue(np.allclose(of_input.grad.numpy(), dx, 1e-4, 1e-4))
-
-
-@flow.unittest.skip_unless_1n1d()
-class TestL2Normalize(flow.unittest.TestCase):
-    def test_l2_normalize(test_case):
-        arg_dict = OrderedDict()
-        arg_dict["test_fun"] = [
-            _test_l2_normalize,
-        ]
-        arg_dict["device"] = ["cpu", "cuda"]
-        arg_dict["dim"] = [0, 1, 2, 3]
-        arg_dict["shape"] = [
-            (10, 10, 20, 30),
-        ]
-        for arg in GenArgList(arg_dict):
-            arg[0](test_case, *arg[1:])
-
-
 @flow.unittest.skip_unless_1n1d()
 class TestFunctionalNormalize(flow.unittest.TestCase):
-    @autotest(check_graph=False)
+    @autotest()
     def test_functional_normalize(test_case):
         device = random_device()
         ndim = random(low=2)
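
A minimal usage sketch of the path this patch changes, assuming only the public `flow.nn.functional.normalize` binding declared in the YAML signature above (illustrative values; the new `use_l2_norm_kernel` flag defaults to True and is not passed explicitly):

    import oneflow as flow

    x = flow.tensor([[1.0, 2.0], [3.0, 4.0]])

    # p=2.0 (the default) is now routed to the fused l2_normalize kernel,
    # with non-last dims handled inside L2NormalizeFunctor via the transpose fallback.
    y_l2 = flow.nn.functional.normalize(x, p=2.0, dim=1)

    # Any other p still takes the generic SequenceFunction (vector norm / divide) path.
    y_p3 = flow.nn.functional.normalize(x, p=3.0, dim=1)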