From 4ced11ad3bd5f844583da37f2dfcff34b3057fca Mon Sep 17 00:00:00 2001 From: l1aoxingyu Date: Wed, 26 Jan 2022 10:56:19 +0800 Subject: [PATCH 1/8] finish lamb interface --- docs/source/optim.rst | 1 + python/oneflow/nn/optimizer/lamb.py | 235 ++++++++++++++++++ python/oneflow/optim/__init__.py | 2 +- .../test/graph/test_graph_optim_lamb.py | 183 ++++++++++++++ .../oneflow/test/modules/test_optim_lamb.py | 172 +++++++++++++ 5 files changed, 592 insertions(+), 1 deletion(-) create mode 100644 python/oneflow/nn/optimizer/lamb.py create mode 100644 python/oneflow/test/graph/test_graph_optim_lamb.py create mode 100644 python/oneflow/test/modules/test_optim_lamb.py diff --git a/docs/source/optim.rst b/docs/source/optim.rst index f9fcbcb3dd5..3ebb75161b6 100644 --- a/docs/source/optim.rst +++ b/docs/source/optim.rst @@ -10,6 +10,7 @@ Optimizers Optimizer, RMSprop, SGD, + LAMB, lr_scheduler .. automodule:: oneflow.optim.lr_scheduler diff --git a/python/oneflow/nn/optimizer/lamb.py b/python/oneflow/nn/optimizer/lamb.py new file mode 100644 index 00000000000..537d0c44104 --- /dev/null +++ b/python/oneflow/nn/optimizer/lamb.py @@ -0,0 +1,235 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +from typing import Callable, Dict, Iterator, List, Union, Tuple + +import math +import oneflow as flow +from oneflow.nn.optimizer.optimizer import Optimizer +from oneflow.nn.parameter import Parameter + + +class LAMB(Optimizer): + """Implements LAMB algorithm. + + LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. + https://arxiv.org/abs/1904.00962 + + Args: + parameters (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + adam_w_mode (bool, optional): apply L2 regularization or weight decay True for + decoupled weight decay (also known as AdamW) (default: True) + do_bias_correction (bool, optional): whether to do bias correction (default: True) + amsgrad (bool, optional): whether to use the AMSGrad variant of this algorithm. NOT SUPPORTED now! (default: False) + + For example: + + Example 1: + + .. code-block:: python + + # Assume net is a custom model. + lamb = flow.optim.LAMB(net.parameters(), lr=1e-3) + + for epoch in range(epochs): + # Read data, Compute the loss and so on. + # ... + loss.backward() + lamb.step() + lamb.zero_grad() + + Example 2: + + .. code-block:: python + + # Assume net is a custom model. 
+ lamb = flow.optim.LAMB( + [ + { + "params": net.parameters(), + "lr": learning_rate, + "clip_grad_max_norm": 0.5, + "clip_grad_norm_type": 2.0, + } + ], + ) + + for epoch in range(epochs): + # Read data, Compute the loss and so on. + # ... + loss.backward() + lamb.clip_grad() + lamb.step() + lamb.zero_grad() + + If you want to use clip_grad, you can refer this example. + + For more details of `clip_grad_max_norm` and `clip_grad_norm_type`, you can refer to :func:`oneflow.nn.utils.clip_grad_norm_`. + + """ + + def __init__( + self, + parameters: Union[Iterator[Parameter], List[Dict]], + lr: float = 0.001, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-08, + weight_decay: float = 0, + adam_w_mode: bool = True, + do_bias_correction: bool = True, + amsgrad: bool = False, + ): + if amsgrad: + raise RuntimeError("LAMB does not support AMSGrad variant.") + assert lr >= 0.0, f"Invalid learning rate: {lr}" + assert eps >= 0.0, f"Invalid epsilon value: {eps}" + assert ( + betas[0] >= 0.0 and betas[0] < 1.0 + ), f"Invalid beta parameter at index 0: {betas[0]}" + assert ( + betas[1] >= 0.0 and betas[1] < 1.0 + ), f"Invalid beta parameter at index 1: {betas[1]}" + assert weight_decay >= 0.0, f"Invalid weight_decay value: {weight_decay}" + + options = dict() + options["lr"] = lr + options["eps"] = eps + options["betas"] = betas + options["weight_decay"] = weight_decay + options["amsgrad"] = amsgrad + options["adam_w_mode"] = adam_w_mode + options["bias_correction1"] = 1.0 + options["bias_correction2"] = 1.0 + options["do_bias_correction"] = do_bias_correction + + super().__init__(parameters, options) + + for param_group in self.param_groups: + for param in param_group.parameters: + assert param.is_leaf, "parameters must be leaf tensor" + self._state[param] = dict() + + self._lamb_op = ( + flow.builtin_op("lamb_update") + .Input("model") + .Input("model_diff") + .Input("m") + .Input("v") + .Attr("l1", 0.0) + .Build() + ) + + def step(self, closure: Callable = None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + with flow.no_grad(): + loss = None + if closure is not None: + loss = closure() + for param_group in self.param_groups: + if param_group["do_bias_correction"]: + param_group["bias_correction1"] = 1.0 - math.pow( + param_group["betas"][0], self._state["step"] + 1 + ) + param_group["bias_correction2"] = 1.0 - math.pow( + param_group["betas"][1], self._state["step"] + 1 + ) + + kwargs = { + "learning_rate_val": param_group["lr"], + "bias_correction1_val": param_group["bias_correction1"], + "bias_correction2_val": param_group["bias_correction2"], + "beta1": param_group["betas"][0], + "beta2": param_group["betas"][1], + "epsilon": param_group["eps"], + "do_bias_correction": param_group["do_bias_correction"], + } + if param_group["adam_w_mode"]: + kwargs["weight_decay"] = param_group["weight_decay"] + kwargs["l2"] = 0.0 + else: + kwargs["l2"] = param_group["weight_decay"] + kwargs["weight_decay"] = 0.0 + for param in param_group.parameters: + if param.grad is None: + continue + if "exp_avg" not in self._state[param]: + self._state[param]["exp_avg"] = flow.zeros_like(param) + if "exp_avg_sq" not in self._state[param]: + self._state[param]["exp_avg_sq"] = flow.zeros_like(param) + m_tensor = self._state[param]["exp_avg"] + v_tensor = self._state[param]["exp_avg_sq"] + + self._lamb_op(param, param.grad, m_tensor, v_tensor, **kwargs) + + self._state["step"] += 1 + + return loss + + def _generate_conf_for_graph(self, train_conf, vars_conf): + new_opt_confs = [] + for param_group in self.param_groups: + optimizer_conf = train_conf.mutable_optimizer_conf().Add() + + lr = ( + param_group["initial_lr"] + if "initial_lr" in param_group + else param_group["lr"] + ) + adam_w_mode = param_group["adam_w_mode"] + weight_decay = param_group["weight_decay"] + beta1 = param_group["betas"][0] + beta2 = param_group["betas"][1] + do_bias_correction = param_group["do_bias_correction"] + epsilon = param_group["eps"] + + optimizer_conf.set_base_learning_rate(lr) + + optimizer_conf.mutable_lamb_conf().set_beta1(beta1) + optimizer_conf.mutable_lamb_conf().set_beta2(beta2) + optimizer_conf.mutable_lamb_conf().set_epsilon(epsilon) + optimizer_conf.mutable_lamb_conf().set_do_bias_correction( + do_bias_correction + ) + + self._generate_grad_clip_conf_for_optim_conf(param_group, optimizer_conf) + + if adam_w_mode: + optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate( + weight_decay + ) + else: + optimizer_conf.mutable_weight_decay_conf().set_weight_decay_rate(0.0) + + for param in param_group.parameters: + if not adam_w_mode: + # Set l2 penalty as weight decay if **NOT** using adam_w_mode + vars_conf[param].l2 = weight_decay + if param.requires_grad: + optimizer_conf.add_variable_op_names(vars_conf[param].name) + + new_opt_confs.append(optimizer_conf) + return new_opt_confs diff --git a/python/oneflow/optim/__init__.py b/python/oneflow/optim/__init__.py index baf0061095c..a4f995d3c5d 100644 --- a/python/oneflow/optim/__init__.py +++ b/python/oneflow/optim/__init__.py @@ -19,6 +19,6 @@ from oneflow.nn.optimizer.rmsprop import RMSprop from oneflow.nn.optimizer.sgd import SGD from oneflow.nn.optimizer.adagrad import Adagrad +from oneflow.nn.optimizer.lamb import LAMB from . import lr_scheduler -from . import utils diff --git a/python/oneflow/test/graph/test_graph_optim_lamb.py b/python/oneflow/test/graph/test_graph_optim_lamb.py new file mode 100644 index 00000000000..9935d46545b --- /dev/null +++ b/python/oneflow/test/graph/test_graph_optim_lamb.py @@ -0,0 +1,183 @@ +""" +Copyright 2020 The OneFlow Authors. 
All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +import unittest +from collections import OrderedDict +import numpy as np + +from test_util import GenArgList +from optimizer_test_util import clip_grad_norm_np + +import oneflow as flow + + +def compare_with_numpy_lamb( + test_case, + device, + x_shape, + learning_rate, + train_iters, + betas, + weight_decay, + eps, + do_bias_correction, + adam_w_mode, + clip_grad_max_norm, + clip_grad_norm_type, +): + + np.random.seed(1000) + + random_grad_seq = [] + for _ in range(train_iters): + random_grad_seq.append(np.random.uniform(size=x_shape).astype(np.float32)) + init_value = np.random.uniform(size=x_shape).astype(np.float32) + + class CustomModule(flow.nn.Module): + def __init__(self): + super().__init__() + self.param = flow.nn.Parameter( + flow.Tensor(init_value, device=flow.device(device)) + ) + + def forward(self, mask): + return self.param * mask + + simp_module = CustomModule() + simp_module.to(device) + simp_module.train() + + optim_kwargs = { + "params": simp_module.parameters(), + "lr": learning_rate, + "betas": betas, + "eps": eps, + "weight_decay": weight_decay, + "adam_w_mode": adam_w_mode, + "do_bias_correction": do_bias_correction, + } + + if clip_grad_max_norm != -1: + optim_kwargs["clip_grad_max_norm"] = clip_grad_max_norm + optim_kwargs["clip_grad_norm_type"] = clip_grad_norm_type + + lamb_optim = flow.optim.LAMB([optim_kwargs]) + + class CustomLambGraph(flow.nn.Graph): + def __init__(self): + super().__init__() + self.m = simp_module + self.add_optimizer(lamb_optim) + + def build(self, mask_tensor): + loss = flow.sum(self.m(mask_tensor)) + loss.backward() + return loss + + of_res_list = [] + lamb_graph = CustomLambGraph() + + for i in range(train_iters): + mask_tensor = flow.tensor( + random_grad_seq[i], + dtype=flow.float32, + requires_grad=False, + device=flow.device(device), + ) + _ = lamb_graph(mask_tensor) + + of_res_list.append(simp_module.param.numpy()) + + np_res_list = [] + + def train_by_numpy(): + x = init_value + mt = np.zeros_like(x) + vt = np.zeros_like(x) + beta1 = betas[0] + beta2 = betas[1] + if adam_w_mode: + l2 = 0 + wd = weight_decay + else: + l2 = weight_decay + wd = 0 + + def np_train_one_iter(step, grad): + if clip_grad_max_norm != -1: + _, grad = clip_grad_norm_np( + grad, clip_grad_max_norm, clip_grad_norm_type + ) + + grad = grad + l2 * x + + bias_correction1 = 1.0 + bias_correction2 = 1.0 + + if do_bias_correction: + bias_correction1 = 1.0 - np.power(beta1, step + 1) + bias_correction2 = 1.0 - np.power(beta2, step + 1) + + m = beta1 * mt + (1 - beta1) * grad + v = beta2 * vt + (1 - beta2) * grad * grad + + denom = np.sqrt(v) / np.sqrt(bias_correction2) + eps + + adam_diff = m / bias_correction1 / denom + + w_norm = np.linalg.norm(x, ord=2) + g_norm = np.linalg.norm(adam_diff, ord=2) + if w_norm > 0 and g_norm > 0: + trust_ratio = w_norm / g_norm + else: + trust_ratio = 1.0 + + param = x - learning_rate * trust_ratio * (adam_diff + wd * x) + return (param, m, v) + + for i in range(train_iters): + (x, 
mt, vt) = np_train_one_iter(i, random_grad_seq[i]) + np_res_list.append(x) + return x + + train_by_numpy() + + test_case.assertTrue(np.allclose(of_res_list, np_res_list, rtol=1e-3, atol=1e-3)) + + +@flow.unittest.skip_unless_1n1d() +class TestLamb(flow.unittest.TestCase): + def test_lamb(test_case): + arg_dict = OrderedDict() + arg_dict["device"] = ["cpu", "cuda"] + arg_dict["x_shape"] = [(10,)] + arg_dict["learning_rate"] = [0.1, 1e-3] + arg_dict["train_iters"] = [10] + arg_dict["betas"] = [(0.99, 0.9)] + arg_dict["weight_decay"] = [0.001, 0.1] + arg_dict["eps"] = [1e-8, 1e-6] + arg_dict["do_bias_correction"] = [True, False] + arg_dict["adam_w_mode"] = [True, False] + # NOTE(Lxy): max_norm = -1 means no clip grad + # nn.Graph only support `clip_grad_max_norm == 1.0` and `clip_grad_norm_type == 2.0` + arg_dict["clip_grad_max_norm"] = [-1, 1.0] + arg_dict["clip_grad_norm_type"] = [2.0] + + for arg in GenArgList(arg_dict): + compare_with_numpy_lamb(test_case, *arg) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/oneflow/test/modules/test_optim_lamb.py b/python/oneflow/test/modules/test_optim_lamb.py new file mode 100644 index 00000000000..85be61fe974 --- /dev/null +++ b/python/oneflow/test/modules/test_optim_lamb.py @@ -0,0 +1,172 @@ +""" +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import unittest +from collections import OrderedDict +import numpy as np + +from test_util import GenArgList +from optimizer_test_util import clip_grad_norm_np + +import oneflow as flow + + +def compare_with_numpy_lamb( + test_case, + device, + x_shape, + learning_rate, + train_iters, + betas, + weight_decay, + eps, + do_bias_correction, + adam_w_mode, + clip_grad_max_norm, + clip_grad_norm_type, +): + + np.random.seed(1000) + + random_grad_seq = [] + for _ in range(train_iters): + random_grad_seq.append(np.random.uniform(size=x_shape).astype(np.float32)) + init_value = np.random.uniform(size=x_shape).astype(np.float32) + + of_res_list = [] + + def train_by_oneflow(): + x = flow.nn.Parameter(flow.Tensor(init_value, device=flow.device(device))) + + optim_kwargs = { + "params": [x], + "lr": learning_rate, + "betas": betas, + "eps": eps, + "weight_decay": weight_decay, + "adam_w_mode": adam_w_mode, + "do_bias_correction": do_bias_correction, + } + + if clip_grad_max_norm != -1: + optim_kwargs["clip_grad_max_norm"] = clip_grad_max_norm + optim_kwargs["clip_grad_norm_type"] = clip_grad_norm_type + + lamb = flow.optim.LAMB([optim_kwargs]) + + def train_one_iter(grad): + grad_tensor = flow.tensor( + grad, + dtype=flow.float32, + requires_grad=False, + device=flow.device(device), + ) + + loss = flow.sum(x * grad_tensor) + loss.backward() + if clip_grad_max_norm != -1: + lamb.clip_grad() + lamb.step() + lamb.zero_grad() + return x + + for i in range(train_iters): + param = train_one_iter(random_grad_seq[i]) + + of_res_list.append(param.numpy()) + + train_by_oneflow() + + np_res_list = [] + + def train_by_numpy(): + x = init_value + mt = np.zeros_like(x) + vt = np.zeros_like(x) + beta1 = betas[0] + beta2 = betas[1] + if adam_w_mode: + l2 = 0 + wd = weight_decay + else: + l2 = weight_decay + wd = 0 + + def np_train_one_iter(step, grad): + if clip_grad_max_norm != -1: + _, grad = clip_grad_norm_np( + grad, clip_grad_max_norm, clip_grad_norm_type + ) + + grad = grad + l2 * x + + bias_correction1 = 1.0 + bias_correction2 = 1.0 + + if do_bias_correction: + bias_correction1 = 1.0 - np.power(beta1, step + 1) + bias_correction2 = 1.0 - np.power(beta2, step + 1) + + m = beta1 * mt + (1 - beta1) * grad + v = beta2 * vt + (1 - beta2) * grad * grad + + denom = np.sqrt(v) / np.sqrt(bias_correction2) + eps + + adam_diff = m / bias_correction1 / denom + + w_norm = np.linalg.norm(x, ord=2) + g_norm = np.linalg.norm(adam_diff, ord=2) + if w_norm > 0 and g_norm > 0: + trust_ratio = w_norm / g_norm + else: + trust_ratio = 1.0 + + param = x - learning_rate * trust_ratio * (adam_diff + wd * x) + return (param, m, v) + + for i in range(train_iters): + (x, mt, vt) = np_train_one_iter(i, random_grad_seq[i]) + np_res_list.append(x) + return x + + train_by_numpy() + + test_case.assertTrue(np.allclose(of_res_list, np_res_list, rtol=1e-3, atol=1e-3)) + + +@flow.unittest.skip_unless_1n1d() +class TestLamb(flow.unittest.TestCase): + def test_lamb(test_case): + arg_dict = OrderedDict() + arg_dict["device"] = ["cpu", "cuda"] + arg_dict["x_shape"] = [(10,)] + arg_dict["learning_rate"] = [0.1, 1e-3] + arg_dict["train_iters"] = [10] + arg_dict["betas"] = [(0.99, 0.9)] + arg_dict["weight_decay"] = [0.001, 0.1] + arg_dict["eps"] = [1e-8, 1e-6] + arg_dict["do_bias_correction"] = [True, False] + arg_dict["adam_w_mode"] = [True, False] + # NOTE(Lxy): max_norm = -1 means no clip grad + arg_dict["clip_grad_max_norm"] = [-1, 0.0, 0.5, 1.0] + arg_dict["clip_grad_norm_type"] = ["inf", "-inf", 0.0, 1.0, 2.0, 3.5] + # TODO(Lxy): add save 
and load test + + for arg in GenArgList(arg_dict): + compare_with_numpy_lamb(test_case, *arg) + + +if __name__ == "__main__": + unittest.main() From c7cd1097ff6d2a5a53e723afc2be1cc5facf7e73 Mon Sep 17 00:00:00 2001 From: l1aoxingyu Date: Thu, 27 Jan 2022 00:25:06 +0800 Subject: [PATCH 2/8] finish lamb optim --- .../functional/dispatch_stateful_ops.cpp | 19 +++ .../functional/dispatch_stateful_ops.yaml | 4 + oneflow/core/job/job_conf.proto | 7 +- oneflow/core/job_rewriter/adam_optm.cpp | 44 +++--- oneflow/core/job_rewriter/lamb_optm.cpp | 129 +++++++++++++++--- oneflow/ir/include/OneFlow/OneFlowUserOps.td | 24 ++-- .../user/kernels/model_update_kernel_util.cpp | 28 ++-- .../user/kernels/model_update_kernel_util.cu | 83 ++++++----- .../user/kernels/model_update_kernel_util.h | 30 ++-- oneflow/user/kernels/model_update_kernels.cpp | 40 +++++- oneflow/user/ops/model_update_ops.cpp | 12 +- python/oneflow/nn/optimizer/lamb.py | 18 ++- .../test/graph/test_graph_optim_lamb.py | 14 +- .../oneflow/test/modules/test_optim_lamb.py | 44 +++--- 14 files changed, 337 insertions(+), 159 deletions(-) diff --git a/oneflow/api/python/functional/dispatch_stateful_ops.cpp b/oneflow/api/python/functional/dispatch_stateful_ops.cpp index a97a6526d11..57b4863d650 100644 --- a/oneflow/api/python/functional/dispatch_stateful_ops.cpp +++ b/oneflow/api/python/functional/dispatch_stateful_ops.cpp @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include "oneflow/core/common/scalar.h" #include "oneflow/core/framework/attr_map.h" #include "oneflow/core/framework/nd_sbp.h" @@ -480,6 +481,24 @@ ONEFLOW_FUNCTION_LIBRARY(m) { JUST(OpInterpUtil::Dispatch(*op, inputs, attrs)); return Maybe::Ok(); }); + m.add_functor( + "DispatchLambUpdate", + [](const std::shared_ptr& op, const TensorTuple& inputs, float learning_rate, float bias_correction1, float bias_correction2, double scale, float l1, float l2, float beta1, float beta2, float epsilon, float weight_decay, bool do_bias_correction) -> Maybe { + MutableAttrMap attrs; + JUST(attrs.SetAttr("learning_rate_val", learning_rate)); + JUST(attrs.SetAttr("bias_correction1_val", bias_correction1)); + JUST(attrs.SetAttr("bias_correction2_val", bias_correction2)); + JUST(attrs.SetAttr("scale", scale)); + JUST(attrs.SetAttr("l1", l1)); + JUST(attrs.SetAttr("l2", l2)); + JUST(attrs.SetAttr("beta1", beta1)); + JUST(attrs.SetAttr("beta2", beta2)); + JUST(attrs.SetAttr("epsilon", epsilon)); + JUST(attrs.SetAttr("weight_decay", weight_decay)); + JUST(attrs.SetAttr("do_bias_correction", do_bias_correction)); + JUST(OpInterpUtil::Dispatch(*op, inputs, attrs)); + return Maybe::Ok(); + }); m.add_functor("DispatchEagerNcclAllReduce", [](const std::shared_ptr& op, const std::shared_ptr& input, const std::string& parallel_conf, bool async_launch) -> Maybe { diff --git a/oneflow/api/python/functional/dispatch_stateful_ops.yaml b/oneflow/api/python/functional/dispatch_stateful_ops.yaml index 87fa7647059..713bdc08d38 100644 --- a/oneflow/api/python/functional/dispatch_stateful_ops.yaml +++ b/oneflow/api/python/functional/dispatch_stateful_ops.yaml @@ -138,6 +138,10 @@ - name: "dispatch_sgd_update" signature: "Void (OpExpr op, TensorTuple inputs, Float learning_rate=0, Double scale=1.0, Float l1=0, Float l2=0, Float weight_decay=0) => DispatchSgdUpdate" bind_python: True + +- name: "dispatch_lamb_update" + signature: "Void (OpExpr op, TensorTuple inputs, Float learning_rate=0, Float bias_correction1=1.0, Float 
bias_correction2=1.0, Double scale=1.0, Float l1=0, Float l2=0, Float beta1=0.9, Float beta2=0.999, Float epsilon=1e-8, Float weight_decay=0, Bool do_bias_correction=True) => DispatchLambUpdate" + bind_python: True - name: "dispatch_eager_nccl_all_reduce" signature: "Tensor (OpExpr op, Tensor input, String parallel_conf, Bool async_launch=False) => DispatchEagerNcclAllReduce" diff --git a/oneflow/core/job/job_conf.proto b/oneflow/core/job/job_conf.proto index 8b1d2153a9d..88432aa7504 100644 --- a/oneflow/core/job/job_conf.proto +++ b/oneflow/core/job/job_conf.proto @@ -47,9 +47,10 @@ message LazyAdamModelUpdateConf { } message LambModelUpdateConf { - required float beta1 = 1; - required float beta2 = 2; - required float epsilon = 3; + optional float beta1 = 1 [default = 0.9]; + optional float beta2 = 2 [default = 0.999]; + optional float epsilon = 3 [default = 1e-8]; + optional bool do_bias_correction = 4 [default = true]; } message AdagradModelUpdateConf { diff --git a/oneflow/core/job_rewriter/adam_optm.cpp b/oneflow/core/job_rewriter/adam_optm.cpp index b325a992317..76d6b02270f 100644 --- a/oneflow/core/job_rewriter/adam_optm.cpp +++ b/oneflow/core/job_rewriter/adam_optm.cpp @@ -27,6 +27,7 @@ bool operator==(const BiasCorrectionFactorCacheKey& lhs, const BiasCorrectionFac return (lhs.beta == rhs.beta) && (lhs.parallel_conf == rhs.parallel_conf); } + } // namespace oneflow namespace std { @@ -43,27 +44,8 @@ struct hash { } // namespace std namespace oneflow { - -namespace { - -std::string GenVariableOutputLbn(const OperatorConf& op_conf) { - CHECK(op_conf.has_variable_conf()); - return GenLogicalBlobName(op_conf.name(), op_conf.variable_conf().out()); -} - -OperatorConf GenerateAdamHelperVariableOpConf(const VariableOp& op, const std::string& name, - const float initial_value) { - OperatorConf helper_variable_op(op.op_conf()); - helper_variable_op.set_name(op.op_name() + "-" + name); - helper_variable_op.mutable_variable_conf()->set_out("out"); - InitializerConf constant_initializer; - constant_initializer.mutable_constant_conf()->set_value(initial_value); - *(helper_variable_op.mutable_variable_conf()->mutable_initializer()) = constant_initializer; - helper_variable_op.set_scope_symbol_id(op.op_conf().scope_symbol_id()); - return helper_variable_op; -} - -class BiasCorrectionFactorState final : public JobPassState { + + class BiasCorrectionFactorState final : public JobPassState { public: BiasCorrectionFactorState() {} ~BiasCorrectionFactorState() override = default; @@ -88,6 +70,26 @@ class BiasCorrectionFactorState final : public JobPassState { HashMap key2lbn_; }; +namespace { + +std::string GenVariableOutputLbn(const OperatorConf& op_conf) { + CHECK(op_conf.has_variable_conf()); + return GenLogicalBlobName(op_conf.name(), op_conf.variable_conf().out()); +} + +OperatorConf GenerateAdamHelperVariableOpConf(const VariableOp& op, const std::string& name, + const float initial_value) { + OperatorConf helper_variable_op(op.op_conf()); + helper_variable_op.set_name(op.op_name() + "-" + name); + helper_variable_op.mutable_variable_conf()->set_out("out"); + InitializerConf constant_initializer; + constant_initializer.mutable_constant_conf()->set_value(initial_value); + *(helper_variable_op.mutable_variable_conf()->mutable_initializer()) = constant_initializer; + helper_variable_op.set_scope_symbol_id(op.op_conf().scope_symbol_id()); + return helper_variable_op; +} + + void GenerateOptimizerOpConf(JobPassCtx* ctx, const OpNode& var_op_node, const std::string& model_diff_lbn, const 
OptimizerConf& optimizer_conf, JobBuilder* job_builder) { diff --git a/oneflow/core/job_rewriter/lamb_optm.cpp b/oneflow/core/job_rewriter/lamb_optm.cpp index d764637b17a..23734257d46 100644 --- a/oneflow/core/job_rewriter/lamb_optm.cpp +++ b/oneflow/core/job_rewriter/lamb_optm.cpp @@ -15,8 +15,47 @@ limitations under the License. */ #include "oneflow/core/job_rewriter/optimizer.h" #include "oneflow/core/framework/framework.h" + namespace oneflow { + +struct BiasCorrectionFactorCacheKey { + float beta = 1.0; + ParallelConf parallel_conf; +}; + +bool operator==(const BiasCorrectionFactorCacheKey& lhs, const BiasCorrectionFactorCacheKey& rhs); +} // namespace oneflow + +namespace std { +template<> +struct hash { + size_t operator()(const oneflow::BiasCorrectionFactorCacheKey& key) const { + const auto& float_hash = std::hash(); + const auto& parallel_conf_hash = std::hash(); + return float_hash(key.beta) ^ parallel_conf_hash(key.parallel_conf); + } +}; + +} // namespace std + +namespace oneflow { + +// Forward declaration for bias correction factor +class BiasCorrectionFactorState final : public JobPassState { + public: + BiasCorrectionFactorState() {} + ~BiasCorrectionFactorState() override = default; + + std::string GetLbn(float beta, std::string bias_correction_name, ParallelConf parallel_conf, + const std::function& + BiasCorrectionFactorStateOp); + + private: + HashMap key2lbn_; +}; + + namespace { std::string GenVariableOutputLbn(const OperatorConf& op_conf) { @@ -47,37 +86,95 @@ void SetScalarShapeAndNdSbpConf(const ParallelDesc& parallel_desc, OperatorConf* } void GenerateOptimizerOpConf(JobPassCtx* ctx, const OpNode& var_op_node, - const std::string& model_diff_lbn, const OptimizerConf optimizer_conf, + const std::string& model_diff_lbn, const OptimizerConf& optimizer_conf, JobBuilder* job_builder) { const VariableOp* var_op = dynamic_cast(&var_op_node.op()); CHECK_NOTNULL(var_op); + OperatorConf m_var = GenerateLAMBHelperVariableOpConf(*var_op, "m", 0.f); OperatorConf v_var = GenerateLAMBHelperVariableOpConf(*var_op, "v", 0.f); + job_builder->AddOps(var_op_node.parallel_desc().parallel_conf(), {m_var, v_var}); - OperatorConf beta1_t_var; - OperatorConf beta2_t_var; + user_op::UserOpConfWrapperBuilder lamb_update_op_builder(var_op->op_name() + "_optimizer"); + const LambModelUpdateConf& lamb_conf = optimizer_conf.lamb_conf(); - beta1_t_var = GenerateLAMBHelperVariableOpConf(*var_op, "beta1_t", lamb_conf.beta1()); - SetScalarShapeAndNdSbpConf(var_op_node.parallel_desc(), &beta1_t_var); - beta2_t_var = GenerateLAMBHelperVariableOpConf(*var_op, "beta2_t", lamb_conf.beta2()); - SetScalarShapeAndNdSbpConf(var_op_node.parallel_desc(), &beta2_t_var); - job_builder->AddOps(var_op_node.parallel_desc().parallel_conf(), {beta1_t_var, beta2_t_var}); + float beta1 = lamb_conf.beta1(); + float beta2 = lamb_conf.beta2(); + float epsilon = lamb_conf.epsilon(); + bool do_bias_correction = lamb_conf.do_bias_correction(); + + const std::string& train_step_lbn = job_builder->job().job_conf().train_conf().train_step_lbn(); + const std::string& learning_rate_lbn = optimizer_conf.learning_rate_lbn(); + + if (do_bias_correction) { + // Reuse adam bias_correction job pass + const std::string& job_pass_state_key = "adam_bias_correction_factor"; + const bool has_state = CHECK_JUST(ctx->HasState(job_pass_state_key)); + if (!has_state) { + CHECK_JUST( + ctx->ResetState(job_pass_state_key, std::make_unique())); + } + auto* state = CHECK_JUST(ctx->MutableState(job_pass_state_key)); + ParallelConf 
bias_correction_parallel_conf; + const auto& lr_parallel_conf = CHECK_JUST(job_builder->ParallelConf4Lbi(GenLogicalBlobId(learning_rate_lbn))); + const auto& train_step_parallel_conf = CHECK_JUST(job_builder->ParallelConf4Lbi(GenLogicalBlobId(train_step_lbn))); + if (lr_parallel_conf == train_step_parallel_conf) { + bias_correction_parallel_conf = lr_parallel_conf; + } else { + bias_correction_parallel_conf = var_op_node.parallel_desc().parallel_conf(); + } + auto AddLambBiasCorrectionFactorOp = [&](float beta_val, + const std::string& op_name) -> std::string { + user_op::UserOpConfWrapperBuilder op_builder(var_op->op_name() + op_name); + const auto lamb_bias_correction_factor_op = + op_builder.OpTypeName("adam_bias_correction_factor") + .Input("train_step", train_step_lbn) + .Attr("beta", beta_val) + .Output("out") + .ScopeSymbolId(var_op->op_conf().scope_symbol_id()) + .Build(); - user_op::UserOpConfWrapperBuilder lamb_update_op_builder(var_op->op_name() + "_optimizer"); - lamb_update_op_builder.OpTypeName("lamb_update") + job_builder->AddOps(bias_correction_parallel_conf, {lamb_bias_correction_factor_op.op_conf()}); + return lamb_bias_correction_factor_op.output("out", 0); + }; + + const std::string bias_correction1_lbn = + state->GetLbn(beta1, "lamb_bias_correction_factor1", bias_correction_parallel_conf, + AddLambBiasCorrectionFactorOp); + const std::string bias_correction2_lbn = + state->GetLbn(beta2, "lamb_bias_correction_factor2", bias_correction_parallel_conf, + AddLambBiasCorrectionFactorOp); + + lamb_update_op_builder.OpTypeName("lamb_update") + .Input("model", GenLogicalBlobName(var_op->BnInOp2Lbi("out"))) + .Input("model_diff", model_diff_lbn) .Input("m", GenVariableOutputLbn(m_var)) .Input("v", GenVariableOutputLbn(v_var)) - .Input("beta1_t", GenVariableOutputLbn(beta1_t_var)) - .Input("beta2_t", GenVariableOutputLbn(beta2_t_var)) + .Input("learning_rate", learning_rate_lbn) + .Input("bias_correction1", bias_correction1_lbn) + .Input("bias_correction2", bias_correction2_lbn) + .Attr("beta1", beta1) + .Attr("beta2", beta2) + .Attr("epsilon", epsilon) + .Attr("weight_decay", GetOptimizerWeightDecayRate(optimizer_conf, *var_op)) + .Attr("do_bias_correction", true) + .ScopeSymbolId(var_op->op_conf().scope_symbol_id()); + } else { + lamb_update_op_builder.OpTypeName("lamb_update") .Input("model", GenLogicalBlobName(var_op->BnInOp2Lbi("out"))) .Input("model_diff", model_diff_lbn) - .Input("learning_rate", optimizer_conf.learning_rate_lbn()) - .Attr("beta1", lamb_conf.beta1()) - .Attr("beta2", lamb_conf.beta2()) - .Attr("epsilon", lamb_conf.epsilon()) + .Input("m", GenVariableOutputLbn(m_var)) + .Input("v", GenVariableOutputLbn(v_var)) + .Input("learning_rate", learning_rate_lbn) + .Attr("beta1", beta1) + .Attr("beta2", beta2) + .Attr("epsilon", epsilon) .Attr("weight_decay", GetOptimizerWeightDecayRate(optimizer_conf, *var_op)) + .Attr("do_bias_correction", false) .ScopeSymbolId(var_op->op_conf().scope_symbol_id()); + } + SetDynamicLossScaleSkipIf(ctx, &lamb_update_op_builder); const auto lamb_update_op = lamb_update_op_builder.Build(); job_builder->AddOps(var_op_node.parallel_desc().parallel_conf(), {lamb_update_op.op_conf()}); diff --git a/oneflow/ir/include/OneFlow/OneFlowUserOps.td b/oneflow/ir/include/OneFlow/OneFlowUserOps.td index a5086360e74..ae7fca8ad85 100644 --- a/oneflow/ir/include/OneFlow/OneFlowUserOps.td +++ b/oneflow/ir/include/OneFlow/OneFlowUserOps.td @@ -5405,24 +5405,28 @@ def OneFlow_IndexedSlicesSgdUpdateOp : OneFlow_BaseOp<"indexed_slices_sgd_update def 
OneFlow_LambUpdateOp : OneFlow_BaseOp<"lamb_update", [NoGrad, AttrSizedOperandSegments, DeclareOpInterfaceMethods]> { let input = (ins - OneFlow_Tensor:$m, - OneFlow_Tensor:$v, - OneFlow_Tensor:$beta1_t, - OneFlow_Tensor:$beta2_t, OneFlow_Tensor:$model, OneFlow_Tensor:$model_diff, - OneFlow_Tensor:$learning_rate, + Optional:$learning_rate, Optional:$scale_by_tensor, - Optional:$skip_if + Optional:$skip_if, + Optional:$bias_correction1, + Optional:$bias_correction2, + OneFlow_Tensor:$m, + OneFlow_Tensor:$v ); let attrs = (ins - DefaultValuedAttr:$beta1, - DefaultValuedAttr:$beta2, - DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$learning_rate_val, + DefaultValuedAttr:$bias_correction1_val, + DefaultValuedAttr:$bias_correction2_val, DefaultValuedAttr:$scale, DefaultValuedAttr:$l1, DefaultValuedAttr:$l2, - DefaultValuedAttr:$weight_decay + DefaultValuedAttr:$beta1, + DefaultValuedAttr:$beta2, + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$weight_decay, + DefaultValuedAttr:$do_bias_correction ); let trait_attrs = (ins I32ElementsAttr:$operand_segment_sizes diff --git a/oneflow/user/kernels/model_update_kernel_util.cpp b/oneflow/user/kernels/model_update_kernel_util.cpp index 9a85aebfcf8..36e4ad9a136 100644 --- a/oneflow/user/kernels/model_update_kernel_util.cpp +++ b/oneflow/user/kernels/model_update_kernel_util.cpp @@ -272,30 +272,38 @@ template struct AdagradUpdateKernelUtil; template struct LambUpdateKernelUtil { static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, const float* learning_rate, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer, T* beta1_t, T* beta2_t); + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); }; template void LambUpdateKernelUtil::Update( ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, T* norm_buffer, - T* beta1_t, T* beta2_t) { + float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, + float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, + T* norm_buffer) { if (skip_if != nullptr && *skip_if != 0) { return; } - *beta1_t *= beta1; - *beta2_t *= beta2; + if (learning_rate_ptr != nullptr) { learning_rate_val = *learning_rate_ptr; } if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } + FOR_RANGE(int64_t, i, 0, n) { - LambGradFunctor()(beta1_t, beta2_t, model_diff + i, adam_diff + i, model + i, m + i, - v + i, scale, l1, l2, beta1, beta2, epsilon); + LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, v + i, scale, l1, l2, + beta1, beta2, epsilon, do_bias_correction, bias_correction1_val, + 
bias_correction2_val); } T* w_norm_2 = norm_buffer; T* g_norm_2 = norm_buffer + 1; Memset(stream, norm_buffer, 0, 2 * sizeof(T)); SumSquares2(n, model, w_norm_2, adam_diff, g_norm_2); - const float lr = LambLRFunctor()(*learning_rate, w_norm_2, g_norm_2); + const float lr = LambLRFunctor()(learning_rate_val, w_norm_2, g_norm_2); FOR_RANGE(int64_t, i, 0, n) { LambUpdateFunctor()(lr, weight_decay, adam_diff + i, model + i); } diff --git a/oneflow/user/kernels/model_update_kernel_util.cu b/oneflow/user/kernels/model_update_kernel_util.cu index d858fe444a4..d44b94c41f0 100644 --- a/oneflow/user/kernels/model_update_kernel_util.cu +++ b/oneflow/user/kernels/model_update_kernel_util.cu @@ -322,23 +322,27 @@ __global__ void IndexedSlicesAdamUpdateGpu( template __global__ void LambGradGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, - float epsilon, const T* beta1_t, const T* beta2_t, - const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* adam_diff, T* model, T* m, T* v) { + float epsilon, const T* scale_by_ptr, const int64_t* skip_if, + const G* model_diff, T* adam_diff, T* model, T* m, T* v, bool do_bias_correction, float bias_correction1_val, float bias_correction2_val, const float* bias_correction1_ptr, + const float* bias_correction2_ptr) { if (skip_if != nullptr && *skip_if != 0) { return; } if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } + if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } + if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } CUDA_1D_KERNEL_LOOP(i, n) { - LambGradFunctor()(beta1_t, beta2_t, model_diff + i, adam_diff + i, model + i, m + i, - v + i, scale, l1, l2, beta1, beta2, epsilon); + LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, + v + i, scale, l1, l2, beta1, beta2, epsilon, + do_bias_correction, bias_correction1_val, bias_correction2_val); } } template -__global__ void LambUpdateGpu(int64_t n, float weight_decay, const float* learning_rate, +__global__ void LambUpdateGpu(int64_t n, float weight_decay, float learning_rate_val, const float* learning_rate_ptr, const int64_t* skip_if, const T* w_norm_2, const T* g_norm_2, - const T* beta1_t, const T* beta2_t, const T* adam_diff, T* model) { + const T* adam_diff, T* model) { if (skip_if != nullptr && *skip_if != 0) { return; } - const float lr = LambLRFunctor()(*learning_rate, w_norm_2, g_norm_2); + if (learning_rate_ptr != nullptr) { learning_rate_val = *learning_rate_ptr; } + const float lr = LambLRFunctor()(learning_rate_val, w_norm_2, g_norm_2); CUDA_1D_KERNEL_LOOP(i, n) { LambUpdateFunctor()(lr, weight_decay, adam_diff + i, model + i); } } @@ -447,53 +451,62 @@ template struct AdagradUpdateKernelUtil; template struct LambUpdateKernelUtil { static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, const float* learning_rate, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer, T* beta1_t, T* beta2_t); + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); }; template void LambUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, float scale, float l1, float l2, 
float beta1, float beta2, - float epsilon, float weight_decay, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, T* norm_buffer, - T* beta1_t, T* beta2_t) { - AdamUpdateBetaTGpu<<<1, 1, 0, stream->As()->cuda_stream()>>>( - beta1, beta2, skip_if, beta1_t, beta2_t); - LambGradGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta1, beta2, epsilon, beta1_t, beta2_t, scale_by_ptr, skip_if, model_diff, - adam_diff, model, m, v); + ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, + T* adam_diff, T* model, T* m, T* v, T* norm_buffer +) { + LambGradGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta1, beta2, epsilon, scale_by_ptr, skip_if, model_diff, + adam_diff, model, m, v, do_bias_correction, bias_correction1_val, bias_correction2_val, + bias_correction1_ptr, bias_correction2_ptr); T* w_norm_2 = norm_buffer; T* g_norm_2 = norm_buffer + 1; Memset(stream, norm_buffer, 0, 2 * sizeof(T)); - SumSquares2 - <<As()->cuda_stream()>>>(n, model, w_norm_2, adam_diff, g_norm_2); - LambUpdateGpu<<As()->cuda_stream()>>>( - n, weight_decay, learning_rate, skip_if, w_norm_2, g_norm_2, beta1_t, beta2_t, adam_diff, + SumSquares2<<As()->cuda_stream()>>>( + n, model, w_norm_2, adam_diff, g_norm_2); + LambUpdateGpu<<As()->cuda_stream()>>>( + n, weight_decay, learning_rate_val, learning_rate_ptr, skip_if, w_norm_2, g_norm_2, adam_diff, model); } template struct LambUpdateKernelUtil { static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, const float* learning_rate, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer, T* beta1_t, T* beta2_t); + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); }; template void LambUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, - float epsilon, float weight_decay, const float* learning_rate, const T* scale_by_ptr, - const int64_t* skip_if, const float16* model_diff, T* adam_diff, T* model, T* m, T* v, - T* norm_buffer, T* beta1_t, T* beta2_t) { +ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, + const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, + T* adam_diff, T* model, T* m, T* v, T* norm_buffer +) { LambUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate, scale_by_ptr, - skip_if, reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer, - beta1_t, beta2_t); + stream, n, scale, 
l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate_val, do_bias_correction, bias_correction1_val, bias_correction2_val, learning_rate_ptr, bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, + skip_if, reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer); } template struct LambUpdateKernelUtil; diff --git a/oneflow/user/kernels/model_update_kernel_util.h b/oneflow/user/kernels/model_update_kernel_util.h index b81428209e1..d94d2d7be56 100644 --- a/oneflow/user/kernels/model_update_kernel_util.h +++ b/oneflow/user/kernels/model_update_kernel_util.h @@ -120,27 +120,36 @@ struct AdagradUpdateFunctor { template struct LambGradFunctor { OF_DEVICE_FUNC - void operator()(const T* beta1_t, const T* beta2_t, const G* model_diff, T* adam_diff, T* model, + void operator()(const G* model_diff, T* adam_diff, T* model, T* m, T* v, float scale, float l1, float l2, float beta1, float beta2, - float epsilon) const { + float epsilon, bool do_bias_correction, float bias_correction1, float bias_correction2) const { const T model_val = *model; T model_diff_t = CastScaleRegularizeGradientFunctor()(*model_diff, model_val, scale, l1, l2); const T next_m = beta1 * *m + (1 - beta1) * model_diff_t; const T next_v = beta2 * *v + (1 - beta2) * model_diff_t * model_diff_t; - *adam_diff = (next_m / (1 - *beta1_t)) / (std::sqrt(next_v / (1 - *beta2_t)) + epsilon); *m = next_m; *v = next_v; + T numerator = 0; + T denominator = 0; + if (do_bias_correction) { + numerator = next_m / bias_correction1; + denominator = (sqrt(next_v) / sqrt(bias_correction2)) + epsilon; + } else { + numerator = next_m; + denominator = sqrt(next_v) + epsilon; + } + *adam_diff = numerator / denominator; } }; template struct LambLRFunctor { OF_DEVICE_FUNC - float operator()(const float learning_rate, const T* w_norm_2, const T* g_norm_2) const { - float lr = learning_rate; - const T w_norm_val = std::sqrt(*w_norm_2); - const T g_norm_val = std::sqrt(*g_norm_2); + float operator()(const float learning_rate_val, const T* w_norm_2, const T* g_norm_2) const { + float lr = learning_rate_val; + const T w_norm_val = sqrt(*w_norm_2); + const T g_norm_val = sqrt(*g_norm_2); T trust_ratio = 1; if (w_norm_val > 0 && g_norm_val > 0) { trust_ratio = w_norm_val / g_norm_val; } lr *= trust_ratio; @@ -216,9 +225,12 @@ template struct LambUpdateKernelUtil { public: static void Update(ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, const float* learning_rate, + float beta2, float epsilon, float weight_decay, float learning_rate_val, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer, T* beta1_t, T* beta2_t); + T* adam_diff, T* model, T* m, T* v, T* norm_buffer); }; template diff --git a/oneflow/user/kernels/model_update_kernels.cpp b/oneflow/user/kernels/model_update_kernels.cpp index b233b7d42f5..5627526a44d 100644 --- a/oneflow/user/kernels/model_update_kernels.cpp +++ b/oneflow/user/kernels/model_update_kernels.cpp @@ -676,15 +676,13 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra private: void Compute(user_op::KernelComputeContext* ctx) const override { - const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); const user_op::Tensor* 
model_diff = ctx->Tensor4ArgNameAndIndex("model_diff", 0); user_op::Tensor* model = ctx->Tensor4ArgNameAndIndex("model", 0); user_op::Tensor* m = ctx->Tensor4ArgNameAndIndex("m", 0); user_op::Tensor* v = ctx->Tensor4ArgNameAndIndex("v", 0); - user_op::Tensor* beta1_t = ctx->Tensor4ArgNameAndIndex("beta1_t", 0); - user_op::Tensor* beta2_t = ctx->Tensor4ArgNameAndIndex("beta2_t", 0); user_op::Tensor* tmp_buffer = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0); LambTmpBufferManager tbm(tmp_buffer->mut_dptr(), model->shape().elem_cnt()); + const auto scale = ctx->Attr("scale"); const auto l1 = ctx->Attr("l1"); const auto l2 = ctx->Attr("l2"); @@ -692,6 +690,31 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra const auto beta2 = ctx->Attr("beta2"); const auto epsilon = ctx->Attr("epsilon"); const auto weight_decay = ctx->Attr("weight_decay"); + + const bool do_bias_correction = ctx->Attr("do_bias_correction"); + const float bias_correction1_val = ctx->Attr("bias_correction1_val"); + const float* bias_correction1_ptr = nullptr; + if (ctx->has_input("bias_correction1", 0)) { + const user_op::Tensor* bias_correction1= ctx->Tensor4ArgNameAndIndex("bias_correction1", 0); + // Just for Lazy optional input check. + CHECK_EQ(bias_correction1->shape().elem_cnt(), 1); + bias_correction1_ptr = bias_correction1->dptr(); + } + const float bias_correction2_val = ctx->Attr("bias_correction2_val"); + const float* bias_correction2_ptr = nullptr; + if (ctx->has_input("bias_correction2", 0)) { + const user_op::Tensor* bias_correction2 = ctx->Tensor4ArgNameAndIndex("bias_correction2", 0); + CHECK_EQ(bias_correction2->shape().elem_cnt(), 1); + bias_correction2_ptr = bias_correction2->dptr(); + } + + const float learning_rate_val = ctx->Attr("learning_rate_val"); + const float* learning_rate_ptr = nullptr; + if (ctx->has_input("learning_rate", 0)) { + const user_op::Tensor* learning_rate = ctx->Tensor4ArgNameAndIndex("learning_rate", 0); + learning_rate_ptr = learning_rate->dptr(); + } + const T* scale_by_ptr = nullptr; if (ctx->has_input("scale_by_tensor", 0)) { const user_op::Tensor* scale_by_tensor = ctx->Tensor4ArgNameAndIndex("scale_by_tensor", 0); @@ -699,17 +722,20 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra CHECK_EQ(scale_by_tensor->shape().elem_cnt(), 1); scale_by_ptr = scale_by_tensor->dptr(); } + const int64_t* skip_if_ptr = nullptr; if (ctx->has_input("skip_if", 0)) { const user_op::Tensor* skip_if = ctx->Tensor4ArgNameAndIndex("skip_if", 0); CHECK_EQ(skip_if->shape().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } + LambUpdateKernelUtil::Update( - ctx->stream(), m->shape().elem_cnt(), scale, l1, l2, beta1, beta2, epsilon, weight_decay, - learning_rate->dptr(), scale_by_ptr, skip_if_ptr, model_diff->dptr(), - tbm.AdamDiffPtr(), model->mut_dptr(), m->mut_dptr(), v->mut_dptr(), - tbm.NormBufferPtr(), beta1_t->mut_dptr(), beta2_t->mut_dptr()); + ctx->stream(), m->shape().elem_cnt(), scale, l1, l2, beta1, beta2, epsilon, + weight_decay, learning_rate_val, do_bias_correction, bias_correction1_val, + bias_correction2_val, learning_rate_ptr, bias_correction1_ptr, bias_correction2_ptr, + scale_by_ptr, skip_if_ptr, model_diff->dptr(), tbm.AdamDiffPtr(), model->mut_dptr(), + m->mut_dptr(), v->mut_dptr(), tbm.NormBufferPtr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; diff --git a/oneflow/user/ops/model_update_ops.cpp b/oneflow/user/ops/model_update_ops.cpp index d0da22056f9..7a18d87289c 100644 --- 
a/oneflow/user/ops/model_update_ops.cpp +++ b/oneflow/user/ops/model_update_ops.cpp @@ -253,10 +253,6 @@ Maybe InferLambUpdateTensorDesc(user_op::InferContext* ctx) { const user_op::TensorDesc& v = ctx->InputTensorDesc("v", 0); JUST(CheckShapeLike(&v, &model)); JUST(CheckLearningRateShape(ctx)); - const user_op::TensorDesc& beta1_t = ctx->InputTensorDesc("beta1_t", 0); - const user_op::TensorDesc& beta2_t = ctx->InputTensorDesc("beta2_t", 0); - JUST(CheckScalarShape(&beta1_t)); - JUST(CheckScalarShape(&beta2_t)); if (ctx->has_input("scale_by_tensor", 0)) { const auto& scale_by_tensor = ctx->InputTensorDesc("scale_by_tensor", 0); JUST(CheckScalarShape(&scale_by_tensor)); @@ -270,11 +266,6 @@ Maybe InferLambUpdateDataType(user_op::InferContext* ctx) { JUST(CheckDataTypeLike(&m, &model)); const user_op::TensorDesc& v = ctx->InputTensorDesc("v", 0); JUST(CheckDataTypeLike(&v, &model)); - const DataType data_type = model.data_type(); - const user_op::TensorDesc& beta1_t = ctx->InputTensorDesc("beta1_t", 0); - const user_op::TensorDesc& beta2_t = ctx->InputTensorDesc("beta2_t", 0); - JUST(CheckScalarDataType(&beta1_t, data_type)); - JUST(CheckScalarDataType(&beta2_t, data_type)); JUST(CheckLearningRateDataType(ctx)); if (ctx->has_input("scale_by_tensor", 0)) { const auto& scale_by_tensor = ctx->InputTensorDesc("scale_by_tensor", 0); @@ -282,6 +273,7 @@ Maybe InferLambUpdateDataType(user_op::InferContext* ctx) { } return Maybe::Ok(); } + Maybe SetInputArgModifierMutable(const user_op::GetInputArgModifier& GetInputArgModifierFn, const std::string& arg_name, int32_t arg_index) { user_op::InputArgModifier* arg_modifier = GetInputArgModifierFn(arg_name, arg_index); @@ -311,8 +303,6 @@ Maybe LambInputArgModifyFn(const user_op::GetInputArgModifier& GetInputArg JUST(SetInputArgModifierMutable(GetInputArgModifierFn, "model", 0)); JUST(SetInputArgModifierMutable(GetInputArgModifierFn, "m", 0)); JUST(SetInputArgModifierMutable(GetInputArgModifierFn, "v", 0)); - JUST(SetInputArgModifierMutable(GetInputArgModifierFn, "beta1_t", 0)); - JUST(SetInputArgModifierMutable(GetInputArgModifierFn, "beta2_t", 0)); return Maybe::Ok(); } diff --git a/python/oneflow/nn/optimizer/lamb.py b/python/oneflow/nn/optimizer/lamb.py index 537d0c44104..9843fd2aa05 100644 --- a/python/oneflow/nn/optimizer/lamb.py +++ b/python/oneflow/nn/optimizer/lamb.py @@ -128,13 +128,12 @@ def __init__( assert param.is_leaf, "parameters must be leaf tensor" self._state[param] = dict() - self._lamb_op = ( - flow.builtin_op("lamb_update") + self._op = ( + flow.stateful_op("lamb_update") .Input("model") .Input("model_diff") .Input("m") .Input("v") - .Attr("l1", 0.0) .Build() ) @@ -149,6 +148,7 @@ def step(self, closure: Callable = None): loss = None if closure is not None: loss = closure() + for param_group in self.param_groups: if param_group["do_bias_correction"]: param_group["bias_correction1"] = 1.0 - math.pow( @@ -159,9 +159,9 @@ def step(self, closure: Callable = None): ) kwargs = { - "learning_rate_val": param_group["lr"], - "bias_correction1_val": param_group["bias_correction1"], - "bias_correction2_val": param_group["bias_correction2"], + "learning_rate": param_group["lr"], + "bias_correction1": param_group["bias_correction1"], + "bias_correction2": param_group["bias_correction2"], "beta1": param_group["betas"][0], "beta2": param_group["betas"][1], "epsilon": param_group["eps"], @@ -183,7 +183,11 @@ def step(self, closure: Callable = None): m_tensor = self._state[param]["exp_avg"] v_tensor = self._state[param]["exp_avg_sq"] - 
self._lamb_op(param, param.grad, m_tensor, v_tensor, **kwargs) + flow._C.dispatch_lamb_update( + self._op, + (param, param.grad, m_tensor, v_tensor), + **kwargs + ) self._state["step"] += 1 diff --git a/python/oneflow/test/graph/test_graph_optim_lamb.py b/python/oneflow/test/graph/test_graph_optim_lamb.py index 9935d46545b..e5460e81c0c 100644 --- a/python/oneflow/test/graph/test_graph_optim_lamb.py +++ b/python/oneflow/test/graph/test_graph_optim_lamb.py @@ -86,7 +86,6 @@ def build(self, mask_tensor): loss.backward() return loss - of_res_list = [] lamb_graph = CustomLambGraph() for i in range(train_iters): @@ -96,11 +95,9 @@ def build(self, mask_tensor): requires_grad=False, device=flow.device(device), ) - _ = lamb_graph(mask_tensor) + lamb_graph(mask_tensor) - of_res_list.append(simp_module.param.numpy()) - - np_res_list = [] + of_res = simp_module.param.numpy() def train_by_numpy(): x = init_value @@ -149,12 +146,11 @@ def np_train_one_iter(step, grad): for i in range(train_iters): (x, mt, vt) = np_train_one_iter(i, random_grad_seq[i]) - np_res_list.append(x) return x - train_by_numpy() + np_res = train_by_numpy() - test_case.assertTrue(np.allclose(of_res_list, np_res_list, rtol=1e-3, atol=1e-3)) + test_case.assertTrue(np.allclose(of_res.flatten(), np_res.flatten(), rtol=1e-3, atol=1e-3)) @flow.unittest.skip_unless_1n1d() @@ -170,7 +166,7 @@ def test_lamb(test_case): arg_dict["eps"] = [1e-8, 1e-6] arg_dict["do_bias_correction"] = [True, False] arg_dict["adam_w_mode"] = [True, False] - # NOTE(Lxy): max_norm = -1 means no clip grad + # NOTE(l1aoxingyu): max_norm = -1 means no clip grad # nn.Graph only support `clip_grad_max_norm == 1.0` and `clip_grad_norm_type == 2.0` arg_dict["clip_grad_max_norm"] = [-1, 1.0] arg_dict["clip_grad_norm_type"] = [2.0] diff --git a/python/oneflow/test/modules/test_optim_lamb.py b/python/oneflow/test/modules/test_optim_lamb.py index 85be61fe974..17410707260 100644 --- a/python/oneflow/test/modules/test_optim_lamb.py +++ b/python/oneflow/test/modules/test_optim_lamb.py @@ -13,12 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +import tempfile import unittest from collections import OrderedDict -import numpy as np -from test_util import GenArgList +import numpy as np from optimizer_test_util import clip_grad_norm_np +from test_util import GenArgList import oneflow as flow @@ -36,6 +37,8 @@ def compare_with_numpy_lamb( adam_w_mode, clip_grad_max_norm, clip_grad_norm_type, + reload_state_step, + save_load_by_pickle, ): np.random.seed(1000) @@ -45,8 +48,6 @@ def compare_with_numpy_lamb( random_grad_seq.append(np.random.uniform(size=x_shape).astype(np.float32)) init_value = np.random.uniform(size=x_shape).astype(np.float32) - of_res_list = [] - def train_by_oneflow(): x = flow.nn.Parameter(flow.Tensor(init_value, device=flow.device(device))) @@ -80,16 +81,18 @@ def train_one_iter(grad): lamb.clip_grad() lamb.step() lamb.zero_grad() - return x for i in range(train_iters): - param = train_one_iter(random_grad_seq[i]) - - of_res_list.append(param.numpy()) - - train_by_oneflow() - - np_res_list = [] + train_one_iter(random_grad_seq[i]) + if i == reload_state_step: + state_dict = lamb.state_dict() + lamb = flow.optim.LAMB([optim_kwargs]) + if save_load_by_pickle: + with tempfile.TemporaryDirectory() as save_dir: + flow.save(state_dict, save_dir) + state_dict = flow.load(save_dir) + lamb.load_state_dict(state_dict) + return x def train_by_numpy(): x = init_value @@ -106,9 +109,7 @@ def train_by_numpy(): def np_train_one_iter(step, grad): if clip_grad_max_norm != -1: - _, grad = clip_grad_norm_np( - grad, clip_grad_max_norm, clip_grad_norm_type - ) + _, grad = clip_grad_norm_np(grad, clip_grad_max_norm, clip_grad_norm_type) grad = grad + l2 * x @@ -138,12 +139,12 @@ def np_train_one_iter(step, grad): for i in range(train_iters): (x, mt, vt) = np_train_one_iter(i, random_grad_seq[i]) - np_res_list.append(x) return x - train_by_numpy() + of_res = train_by_oneflow().numpy() + np_res = train_by_numpy() - test_case.assertTrue(np.allclose(of_res_list, np_res_list, rtol=1e-3, atol=1e-3)) + test_case.assertTrue(np.allclose(of_res.flatten(), np_res.flatten(), rtol=1e-3, atol=1e-3)) @flow.unittest.skip_unless_1n1d() @@ -151,7 +152,7 @@ class TestLamb(flow.unittest.TestCase): def test_lamb(test_case): arg_dict = OrderedDict() arg_dict["device"] = ["cpu", "cuda"] - arg_dict["x_shape"] = [(10,)] + arg_dict["x_shape"] = [(1,)] arg_dict["learning_rate"] = [0.1, 1e-3] arg_dict["train_iters"] = [10] arg_dict["betas"] = [(0.99, 0.9)] @@ -159,10 +160,11 @@ def test_lamb(test_case): arg_dict["eps"] = [1e-8, 1e-6] arg_dict["do_bias_correction"] = [True, False] arg_dict["adam_w_mode"] = [True, False] - # NOTE(Lxy): max_norm = -1 means no clip grad + # NOTE(l1aoxingyu): max_norm = -1 means no clip grad arg_dict["clip_grad_max_norm"] = [-1, 0.0, 0.5, 1.0] arg_dict["clip_grad_norm_type"] = ["inf", "-inf", 0.0, 1.0, 2.0, 3.5] - # TODO(Lxy): add save and load test + arg_dict["reload_state_step"] = [5] + arg_dict["save_load_by_pickle"] = [False, True] for arg in GenArgList(arg_dict): compare_with_numpy_lamb(test_case, *arg) From ab47f24de2df8cd7bffb20e6741f62d12fb460d7 Mon Sep 17 00:00:00 2001 From: l1aoxingyu Date: Thu, 27 Jan 2022 00:34:48 +0800 Subject: [PATCH 3/8] refine docs --- python/oneflow/nn/optimizer/lamb.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/oneflow/nn/optimizer/lamb.py b/python/oneflow/nn/optimizer/lamb.py index 9843fd2aa05..9a45840203c 100644 --- a/python/oneflow/nn/optimizer/lamb.py +++ b/python/oneflow/nn/optimizer/lamb.py @@ -24,8 +24,7 @@ class LAMB(Optimizer): """Implements LAMB 
algorithm. - LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. - https://arxiv.org/abs/1904.00962 + LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. Args: parameters (iterable): iterable of parameters to optimize or dicts defining @@ -39,7 +38,11 @@ class LAMB(Optimizer): adam_w_mode (bool, optional): apply L2 regularization or weight decay True for decoupled weight decay (also known as AdamW) (default: True) do_bias_correction (bool, optional): whether to do bias correction (default: True) - amsgrad (bool, optional): whether to use the AMSGrad variant of this algorithm. NOT SUPPORTED now! (default: False) + amsgrad (bool, optional): whether to use the AMSGrad variant of this algorithm. + NOT SUPPORTED now! (default: False) + + .. _Large Batch Optimization for Deep Learning\\: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 For example: @@ -84,7 +87,6 @@ class LAMB(Optimizer): If you want to use clip_grad, you can refer this example. For more details of `clip_grad_max_norm` and `clip_grad_norm_type`, you can refer to :func:`oneflow.nn.utils.clip_grad_norm_`. - """ def __init__( From 4f623089cb47dc21a40ace70306026c53283386d Mon Sep 17 00:00:00 2001 From: l1aoxingyu Date: Thu, 27 Jan 2022 11:04:47 +0800 Subject: [PATCH 4/8] add math equation --- python/oneflow/nn/optimizer/lamb.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/oneflow/nn/optimizer/lamb.py b/python/oneflow/nn/optimizer/lamb.py index 9a45840203c..eedc91c3fb8 100644 --- a/python/oneflow/nn/optimizer/lamb.py +++ b/python/oneflow/nn/optimizer/lamb.py @@ -26,6 +26,20 @@ class LAMB(Optimizer): LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. + The equation of parameters updating is: + + .. math:: + + & V_t = \\beta_1*V_{t-1} + (1-\\beta_1)*grad + + & S_t = \\beta_2*S_{t-1} + (1-\\beta_2)*{grad} \\odot {grad} + + & \\hat{u} = \\frac{{V_t}}{\\sqrt{{S_t}}+\\epsilon} + + & \\hat{r} = learning\\_rate * \\frac{||param_{old}||_2}{||\\hat{u}||_2} + + & param_{new} = param_{old} - \\hat{r} * \\hat{u} + Args: parameters (iterable): iterable of parameters to optimize or dicts defining parameter groups From 231d04204624912d2dd219864b2ccd61c57ac32d Mon Sep 17 00:00:00 2001 From: l1aoxingyu Date: Thu, 27 Jan 2022 14:57:58 +0800 Subject: [PATCH 5/8] refine --- oneflow/api/python/functional/dispatch_stateful_ops.cpp | 1 - python/oneflow/nn/optimizer/lamb.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/oneflow/api/python/functional/dispatch_stateful_ops.cpp b/oneflow/api/python/functional/dispatch_stateful_ops.cpp index 57b4863d650..59dd0cfd8b6 100644 --- a/oneflow/api/python/functional/dispatch_stateful_ops.cpp +++ b/oneflow/api/python/functional/dispatch_stateful_ops.cpp @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include #include "oneflow/core/common/scalar.h" #include "oneflow/core/framework/attr_map.h" #include "oneflow/core/framework/nd_sbp.h" diff --git a/python/oneflow/nn/optimizer/lamb.py b/python/oneflow/nn/optimizer/lamb.py index eedc91c3fb8..5cbdd58c60d 100644 --- a/python/oneflow/nn/optimizer/lamb.py +++ b/python/oneflow/nn/optimizer/lamb.py @@ -115,6 +115,7 @@ def __init__( amsgrad: bool = False, ): if amsgrad: + # TODO: supported amsgrad in Lamb raise RuntimeError("LAMB does not support AMSGrad variant.") assert lr >= 0.0, f"Invalid learning rate: {lr}" assert eps >= 0.0, f"Invalid epsilon value: {eps}" From fb8019a6df17103dc9a3b80e33c5b1e1060aa293 Mon Sep 17 00:00:00 2001 From: l1aoxingyu Date: Fri, 28 Jan 2022 09:05:52 +0800 Subject: [PATCH 6/8] format --- .../user/kernels/model_update_kernel_util.cu | 67 ++++++++++--------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/oneflow/user/kernels/model_update_kernel_util.cu b/oneflow/user/kernels/model_update_kernel_util.cu index d44b94c41f0..4dd53735a10 100644 --- a/oneflow/user/kernels/model_update_kernel_util.cu +++ b/oneflow/user/kernels/model_update_kernel_util.cu @@ -323,23 +323,25 @@ __global__ void IndexedSlicesAdamUpdateGpu( template __global__ void LambGradGpu(int64_t n, T scale, float l1, float l2, float beta1, float beta2, float epsilon, const T* scale_by_ptr, const int64_t* skip_if, - const G* model_diff, T* adam_diff, T* model, T* m, T* v, bool do_bias_correction, float bias_correction1_val, float bias_correction2_val, const float* bias_correction1_ptr, + const G* model_diff, T* adam_diff, T* model, T* m, T* v, + bool do_bias_correction, float bias_correction1_val, + float bias_correction2_val, const float* bias_correction1_ptr, const float* bias_correction2_ptr) { if (skip_if != nullptr && *skip_if != 0) { return; } if (scale_by_ptr != nullptr) { scale *= *scale_by_ptr; } if (bias_correction1_ptr != nullptr) { bias_correction1_val = *bias_correction1_ptr; } if (bias_correction2_ptr != nullptr) { bias_correction2_val = *bias_correction2_ptr; } CUDA_1D_KERNEL_LOOP(i, n) { - LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, - v + i, scale, l1, l2, beta1, beta2, epsilon, - do_bias_correction, bias_correction1_val, bias_correction2_val); + LambGradFunctor()(model_diff + i, adam_diff + i, model + i, m + i, v + i, scale, l1, l2, + beta1, beta2, epsilon, do_bias_correction, bias_correction1_val, + bias_correction2_val); } } template -__global__ void LambUpdateGpu(int64_t n, float weight_decay, float learning_rate_val, const float* learning_rate_ptr, - const int64_t* skip_if, const T* w_norm_2, const T* g_norm_2, - const T* adam_diff, T* model) { +__global__ void LambUpdateGpu(int64_t n, float weight_decay, float learning_rate_val, + const float* learning_rate_ptr, const int64_t* skip_if, + const T* w_norm_2, const T* g_norm_2, const T* adam_diff, T* model) { if (skip_if != nullptr && *skip_if != 0) { return; } if (learning_rate_ptr != nullptr) { learning_rate_val = *learning_rate_ptr; } const float lr = LambLRFunctor()(learning_rate_val, w_norm_2, g_norm_2); @@ -461,24 +463,25 @@ struct LambUpdateKernelUtil { template void LambUpdateKernelUtil::Update( - ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, float learning_rate_val, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, - 
const T* scale_by_ptr, const int64_t* skip_if, const G* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer -) { - LambGradGpu<<As()->cuda_stream()>>>( - n, scale, l1, l2, beta1, beta2, epsilon, scale_by_ptr, skip_if, model_diff, - adam_diff, model, m, v, do_bias_correction, bias_correction1_val, bias_correction2_val, - bias_correction1_ptr, bias_correction2_ptr); + ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, + float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const G* model_diff, T* adam_diff, T* model, T* m, T* v, + T* norm_buffer) { + LambGradGpu<<As()->cuda_stream()>>>( + n, scale, l1, l2, beta1, beta2, epsilon, scale_by_ptr, skip_if, model_diff, adam_diff, model, + m, v, do_bias_correction, bias_correction1_val, bias_correction2_val, bias_correction1_ptr, + bias_correction2_ptr); T* w_norm_2 = norm_buffer; T* g_norm_2 = norm_buffer + 1; Memset(stream, norm_buffer, 0, 2 * sizeof(T)); - SumSquares2<<As()->cuda_stream()>>>( - n, model, w_norm_2, adam_diff, g_norm_2); - LambUpdateGpu<<As()->cuda_stream()>>>( + SumSquares2 + <<As()->cuda_stream()>>>(n, model, w_norm_2, adam_diff, g_norm_2); + LambUpdateGpu<<As()->cuda_stream()>>>( n, weight_decay, learning_rate_val, learning_rate_ptr, skip_if, w_norm_2, g_norm_2, adam_diff, model); } @@ -496,17 +499,17 @@ struct LambUpdateKernelUtil { template void LambUpdateKernelUtil::Update( -ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, - float beta2, float epsilon, float weight_decay, float learning_rate_val, - bool do_bias_correction, float bias_correction1_val, - float bias_correction2_val, const float* learning_rate_ptr, - const float* bias_correction1_ptr, const float* bias_correction2_ptr, - const T* scale_by_ptr, const int64_t* skip_if, const float16* model_diff, - T* adam_diff, T* model, T* m, T* v, T* norm_buffer -) { + ep::Stream* stream, int64_t n, float scale, float l1, float l2, float beta1, float beta2, + float epsilon, float weight_decay, float learning_rate_val, bool do_bias_correction, + float bias_correction1_val, float bias_correction2_val, const float* learning_rate_ptr, + const float* bias_correction1_ptr, const float* bias_correction2_ptr, const T* scale_by_ptr, + const int64_t* skip_if, const float16* model_diff, T* adam_diff, T* model, T* m, T* v, + T* norm_buffer) { LambUpdateKernelUtil::Update( - stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate_val, do_bias_correction, bias_correction1_val, bias_correction2_val, learning_rate_ptr, bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, - skip_if, reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer); + stream, n, scale, l1, l2, beta1, beta2, epsilon, weight_decay, learning_rate_val, + do_bias_correction, bias_correction1_val, bias_correction2_val, learning_rate_ptr, + bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, skip_if, + reinterpret_cast(model_diff), adam_diff, model, m, v, norm_buffer); } template struct LambUpdateKernelUtil; From 7773d7bd59f7d516e14ba0618da8aaecaaa1d2d6 Mon Sep 17 00:00:00 2001 From: oneflow-ci-bot Date: Fri, 28 Jan 2022 01:12:08 +0000 Subject: [PATCH 7/8] auto format by CI --- .../functional/dispatch_stateful_ops.cpp | 38 +++++---- oneflow/core/job_rewriter/adam_optm.cpp 
| 6 +- oneflow/core/job_rewriter/lamb_optm.cpp | 84 ++++++++++--------- .../user/kernels/model_update_kernel_util.h | 6 +- oneflow/user/kernels/model_update_kernels.cpp | 14 ++-- python/oneflow/nn/optimizer/lamb.py | 4 +- .../test/graph/test_graph_optim_lamb.py | 4 +- .../oneflow/test/modules/test_optim_lamb.py | 8 +- 8 files changed, 85 insertions(+), 79 deletions(-) diff --git a/oneflow/api/python/functional/dispatch_stateful_ops.cpp b/oneflow/api/python/functional/dispatch_stateful_ops.cpp index 59dd0cfd8b6..240ebb0ed4f 100644 --- a/oneflow/api/python/functional/dispatch_stateful_ops.cpp +++ b/oneflow/api/python/functional/dispatch_stateful_ops.cpp @@ -480,24 +480,26 @@ ONEFLOW_FUNCTION_LIBRARY(m) { JUST(OpInterpUtil::Dispatch(*op, inputs, attrs)); return Maybe::Ok(); }); - m.add_functor( - "DispatchLambUpdate", - [](const std::shared_ptr& op, const TensorTuple& inputs, float learning_rate, float bias_correction1, float bias_correction2, double scale, float l1, float l2, float beta1, float beta2, float epsilon, float weight_decay, bool do_bias_correction) -> Maybe { - MutableAttrMap attrs; - JUST(attrs.SetAttr("learning_rate_val", learning_rate)); - JUST(attrs.SetAttr("bias_correction1_val", bias_correction1)); - JUST(attrs.SetAttr("bias_correction2_val", bias_correction2)); - JUST(attrs.SetAttr("scale", scale)); - JUST(attrs.SetAttr("l1", l1)); - JUST(attrs.SetAttr("l2", l2)); - JUST(attrs.SetAttr("beta1", beta1)); - JUST(attrs.SetAttr("beta2", beta2)); - JUST(attrs.SetAttr("epsilon", epsilon)); - JUST(attrs.SetAttr("weight_decay", weight_decay)); - JUST(attrs.SetAttr("do_bias_correction", do_bias_correction)); - JUST(OpInterpUtil::Dispatch(*op, inputs, attrs)); - return Maybe::Ok(); - }); + m.add_functor("DispatchLambUpdate", + [](const std::shared_ptr& op, const TensorTuple& inputs, + float learning_rate, float bias_correction1, float bias_correction2, + double scale, float l1, float l2, float beta1, float beta2, float epsilon, + float weight_decay, bool do_bias_correction) -> Maybe { + MutableAttrMap attrs; + JUST(attrs.SetAttr("learning_rate_val", learning_rate)); + JUST(attrs.SetAttr("bias_correction1_val", bias_correction1)); + JUST(attrs.SetAttr("bias_correction2_val", bias_correction2)); + JUST(attrs.SetAttr("scale", scale)); + JUST(attrs.SetAttr("l1", l1)); + JUST(attrs.SetAttr("l2", l2)); + JUST(attrs.SetAttr("beta1", beta1)); + JUST(attrs.SetAttr("beta2", beta2)); + JUST(attrs.SetAttr("epsilon", epsilon)); + JUST(attrs.SetAttr("weight_decay", weight_decay)); + JUST(attrs.SetAttr("do_bias_correction", do_bias_correction)); + JUST(OpInterpUtil::Dispatch(*op, inputs, attrs)); + return Maybe::Ok(); + }); m.add_functor("DispatchEagerNcclAllReduce", [](const std::shared_ptr& op, const std::shared_ptr& input, const std::string& parallel_conf, bool async_launch) -> Maybe { diff --git a/oneflow/core/job_rewriter/adam_optm.cpp b/oneflow/core/job_rewriter/adam_optm.cpp index 76d6b02270f..382307f2f2d 100644 --- a/oneflow/core/job_rewriter/adam_optm.cpp +++ b/oneflow/core/job_rewriter/adam_optm.cpp @@ -27,7 +27,6 @@ bool operator==(const BiasCorrectionFactorCacheKey& lhs, const BiasCorrectionFac return (lhs.beta == rhs.beta) && (lhs.parallel_conf == rhs.parallel_conf); } - } // namespace oneflow namespace std { @@ -44,8 +43,8 @@ struct hash { } // namespace std namespace oneflow { - - class BiasCorrectionFactorState final : public JobPassState { + +class BiasCorrectionFactorState final : public JobPassState { public: BiasCorrectionFactorState() {} ~BiasCorrectionFactorState() override = 
default; @@ -89,7 +88,6 @@ OperatorConf GenerateAdamHelperVariableOpConf(const VariableOp& op, const std::s return helper_variable_op; } - void GenerateOptimizerOpConf(JobPassCtx* ctx, const OpNode& var_op_node, const std::string& model_diff_lbn, const OptimizerConf& optimizer_conf, JobBuilder* job_builder) { diff --git a/oneflow/core/job_rewriter/lamb_optm.cpp b/oneflow/core/job_rewriter/lamb_optm.cpp index 23734257d46..999f6d34d10 100644 --- a/oneflow/core/job_rewriter/lamb_optm.cpp +++ b/oneflow/core/job_rewriter/lamb_optm.cpp @@ -17,7 +17,7 @@ limitations under the License. #include "oneflow/core/framework/framework.h" namespace oneflow { - + struct BiasCorrectionFactorCacheKey { float beta = 1.0; ParallelConf parallel_conf; @@ -25,7 +25,7 @@ struct BiasCorrectionFactorCacheKey { bool operator==(const BiasCorrectionFactorCacheKey& lhs, const BiasCorrectionFactorCacheKey& rhs); -} // namespace oneflow +} // namespace oneflow namespace std { template<> @@ -43,19 +43,18 @@ namespace oneflow { // Forward declaration for bias correction factor class BiasCorrectionFactorState final : public JobPassState { - public: - BiasCorrectionFactorState() {} - ~BiasCorrectionFactorState() override = default; + public: + BiasCorrectionFactorState() {} + ~BiasCorrectionFactorState() override = default; - std::string GetLbn(float beta, std::string bias_correction_name, ParallelConf parallel_conf, - const std::function& - BiasCorrectionFactorStateOp); + std::string GetLbn(float beta, std::string bias_correction_name, ParallelConf parallel_conf, + const std::function& + BiasCorrectionFactorStateOp); - private: - HashMap key2lbn_; + private: + HashMap key2lbn_; }; - namespace { std::string GenVariableOutputLbn(const OperatorConf& op_conf) { @@ -97,16 +96,16 @@ void GenerateOptimizerOpConf(JobPassCtx* ctx, const OpNode& var_op_node, job_builder->AddOps(var_op_node.parallel_desc().parallel_conf(), {m_var, v_var}); user_op::UserOpConfWrapperBuilder lamb_update_op_builder(var_op->op_name() + "_optimizer"); - + const LambModelUpdateConf& lamb_conf = optimizer_conf.lamb_conf(); float beta1 = lamb_conf.beta1(); float beta2 = lamb_conf.beta2(); float epsilon = lamb_conf.epsilon(); bool do_bias_correction = lamb_conf.do_bias_correction(); - + const std::string& train_step_lbn = job_builder->job().job_conf().train_conf().train_step_lbn(); const std::string& learning_rate_lbn = optimizer_conf.learning_rate_lbn(); - + if (do_bias_correction) { // Reuse adam bias_correction job pass const std::string& job_pass_state_key = "adam_bias_correction_factor"; @@ -117,8 +116,10 @@ void GenerateOptimizerOpConf(JobPassCtx* ctx, const OpNode& var_op_node, } auto* state = CHECK_JUST(ctx->MutableState(job_pass_state_key)); ParallelConf bias_correction_parallel_conf; - const auto& lr_parallel_conf = CHECK_JUST(job_builder->ParallelConf4Lbi(GenLogicalBlobId(learning_rate_lbn))); - const auto& train_step_parallel_conf = CHECK_JUST(job_builder->ParallelConf4Lbi(GenLogicalBlobId(train_step_lbn))); + const auto& lr_parallel_conf = + CHECK_JUST(job_builder->ParallelConf4Lbi(GenLogicalBlobId(learning_rate_lbn))); + const auto& train_step_parallel_conf = + CHECK_JUST(job_builder->ParallelConf4Lbi(GenLogicalBlobId(train_step_lbn))); if (lr_parallel_conf == train_step_parallel_conf) { bias_correction_parallel_conf = lr_parallel_conf; } else { @@ -135,7 +136,8 @@ void GenerateOptimizerOpConf(JobPassCtx* ctx, const OpNode& var_op_node, .ScopeSymbolId(var_op->op_conf().scope_symbol_id()) .Build(); - 
job_builder->AddOps(bias_correction_parallel_conf, {lamb_bias_correction_factor_op.op_conf()}); + job_builder->AddOps(bias_correction_parallel_conf, + {lamb_bias_correction_factor_op.op_conf()}); return lamb_bias_correction_factor_op.output("out", 0); }; @@ -147,32 +149,32 @@ void GenerateOptimizerOpConf(JobPassCtx* ctx, const OpNode& var_op_node, AddLambBiasCorrectionFactorOp); lamb_update_op_builder.OpTypeName("lamb_update") - .Input("model", GenLogicalBlobName(var_op->BnInOp2Lbi("out"))) - .Input("model_diff", model_diff_lbn) - .Input("m", GenVariableOutputLbn(m_var)) - .Input("v", GenVariableOutputLbn(v_var)) - .Input("learning_rate", learning_rate_lbn) - .Input("bias_correction1", bias_correction1_lbn) - .Input("bias_correction2", bias_correction2_lbn) - .Attr("beta1", beta1) - .Attr("beta2", beta2) - .Attr("epsilon", epsilon) - .Attr("weight_decay", GetOptimizerWeightDecayRate(optimizer_conf, *var_op)) - .Attr("do_bias_correction", true) - .ScopeSymbolId(var_op->op_conf().scope_symbol_id()); + .Input("model", GenLogicalBlobName(var_op->BnInOp2Lbi("out"))) + .Input("model_diff", model_diff_lbn) + .Input("m", GenVariableOutputLbn(m_var)) + .Input("v", GenVariableOutputLbn(v_var)) + .Input("learning_rate", learning_rate_lbn) + .Input("bias_correction1", bias_correction1_lbn) + .Input("bias_correction2", bias_correction2_lbn) + .Attr("beta1", beta1) + .Attr("beta2", beta2) + .Attr("epsilon", epsilon) + .Attr("weight_decay", GetOptimizerWeightDecayRate(optimizer_conf, *var_op)) + .Attr("do_bias_correction", true) + .ScopeSymbolId(var_op->op_conf().scope_symbol_id()); } else { lamb_update_op_builder.OpTypeName("lamb_update") - .Input("model", GenLogicalBlobName(var_op->BnInOp2Lbi("out"))) - .Input("model_diff", model_diff_lbn) - .Input("m", GenVariableOutputLbn(m_var)) - .Input("v", GenVariableOutputLbn(v_var)) - .Input("learning_rate", learning_rate_lbn) - .Attr("beta1", beta1) - .Attr("beta2", beta2) - .Attr("epsilon", epsilon) - .Attr("weight_decay", GetOptimizerWeightDecayRate(optimizer_conf, *var_op)) - .Attr("do_bias_correction", false) - .ScopeSymbolId(var_op->op_conf().scope_symbol_id()); + .Input("model", GenLogicalBlobName(var_op->BnInOp2Lbi("out"))) + .Input("model_diff", model_diff_lbn) + .Input("m", GenVariableOutputLbn(m_var)) + .Input("v", GenVariableOutputLbn(v_var)) + .Input("learning_rate", learning_rate_lbn) + .Attr("beta1", beta1) + .Attr("beta2", beta2) + .Attr("epsilon", epsilon) + .Attr("weight_decay", GetOptimizerWeightDecayRate(optimizer_conf, *var_op)) + .Attr("do_bias_correction", false) + .ScopeSymbolId(var_op->op_conf().scope_symbol_id()); } SetDynamicLossScaleSkipIf(ctx, &lamb_update_op_builder); diff --git a/oneflow/user/kernels/model_update_kernel_util.h b/oneflow/user/kernels/model_update_kernel_util.h index d94d2d7be56..03ae9b819c0 100644 --- a/oneflow/user/kernels/model_update_kernel_util.h +++ b/oneflow/user/kernels/model_update_kernel_util.h @@ -120,9 +120,9 @@ struct AdagradUpdateFunctor { template struct LambGradFunctor { OF_DEVICE_FUNC - void operator()(const G* model_diff, T* adam_diff, T* model, - T* m, T* v, float scale, float l1, float l2, float beta1, float beta2, - float epsilon, bool do_bias_correction, float bias_correction1, float bias_correction2) const { + void operator()(const G* model_diff, T* adam_diff, T* model, T* m, T* v, float scale, float l1, + float l2, float beta1, float beta2, float epsilon, bool do_bias_correction, + float bias_correction1, float bias_correction2) const { const T model_val = *model; T model_diff_t = 
CastScaleRegularizeGradientFunctor()(*model_diff, model_val, scale, l1, l2); diff --git a/oneflow/user/kernels/model_update_kernels.cpp b/oneflow/user/kernels/model_update_kernels.cpp index 5627526a44d..5a8b29326ce 100644 --- a/oneflow/user/kernels/model_update_kernels.cpp +++ b/oneflow/user/kernels/model_update_kernels.cpp @@ -695,7 +695,7 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra const float bias_correction1_val = ctx->Attr("bias_correction1_val"); const float* bias_correction1_ptr = nullptr; if (ctx->has_input("bias_correction1", 0)) { - const user_op::Tensor* bias_correction1= ctx->Tensor4ArgNameAndIndex("bias_correction1", 0); + const user_op::Tensor* bias_correction1 = ctx->Tensor4ArgNameAndIndex("bias_correction1", 0); // Just for Lazy optional input check. CHECK_EQ(bias_correction1->shape().elem_cnt(), 1); bias_correction1_ptr = bias_correction1->dptr(); @@ -729,13 +729,13 @@ class LambUpdateKernel final : public user_op::OpKernel, public user_op::CudaGra CHECK_EQ(skip_if->shape().elem_cnt(), 1); skip_if_ptr = skip_if->dptr(); } - + LambUpdateKernelUtil::Update( - ctx->stream(), m->shape().elem_cnt(), scale, l1, l2, beta1, beta2, epsilon, - weight_decay, learning_rate_val, do_bias_correction, bias_correction1_val, - bias_correction2_val, learning_rate_ptr, bias_correction1_ptr, bias_correction2_ptr, - scale_by_ptr, skip_if_ptr, model_diff->dptr(), tbm.AdamDiffPtr(), model->mut_dptr(), - m->mut_dptr(), v->mut_dptr(), tbm.NormBufferPtr()); + ctx->stream(), m->shape().elem_cnt(), scale, l1, l2, beta1, beta2, epsilon, weight_decay, + learning_rate_val, do_bias_correction, bias_correction1_val, bias_correction2_val, + learning_rate_ptr, bias_correction1_ptr, bias_correction2_ptr, scale_by_ptr, skip_if_ptr, + model_diff->dptr(), tbm.AdamDiffPtr(), model->mut_dptr(), m->mut_dptr(), + v->mut_dptr(), tbm.NormBufferPtr()); } bool AlwaysComputeWhenAllOutputsEmpty() const override { return true; } }; diff --git a/python/oneflow/nn/optimizer/lamb.py b/python/oneflow/nn/optimizer/lamb.py index 5cbdd58c60d..f4f7fdf5b99 100644 --- a/python/oneflow/nn/optimizer/lamb.py +++ b/python/oneflow/nn/optimizer/lamb.py @@ -201,9 +201,7 @@ def step(self, closure: Callable = None): v_tensor = self._state[param]["exp_avg_sq"] flow._C.dispatch_lamb_update( - self._op, - (param, param.grad, m_tensor, v_tensor), - **kwargs + self._op, (param, param.grad, m_tensor, v_tensor), **kwargs ) self._state["step"] += 1 diff --git a/python/oneflow/test/graph/test_graph_optim_lamb.py b/python/oneflow/test/graph/test_graph_optim_lamb.py index e5460e81c0c..791b4fd98af 100644 --- a/python/oneflow/test/graph/test_graph_optim_lamb.py +++ b/python/oneflow/test/graph/test_graph_optim_lamb.py @@ -150,7 +150,9 @@ def np_train_one_iter(step, grad): np_res = train_by_numpy() - test_case.assertTrue(np.allclose(of_res.flatten(), np_res.flatten(), rtol=1e-3, atol=1e-3)) + test_case.assertTrue( + np.allclose(of_res.flatten(), np_res.flatten(), rtol=1e-3, atol=1e-3) + ) @flow.unittest.skip_unless_1n1d() diff --git a/python/oneflow/test/modules/test_optim_lamb.py b/python/oneflow/test/modules/test_optim_lamb.py index 17410707260..3c7a38337c9 100644 --- a/python/oneflow/test/modules/test_optim_lamb.py +++ b/python/oneflow/test/modules/test_optim_lamb.py @@ -109,7 +109,9 @@ def train_by_numpy(): def np_train_one_iter(step, grad): if clip_grad_max_norm != -1: - _, grad = clip_grad_norm_np(grad, clip_grad_max_norm, clip_grad_norm_type) + _, grad = clip_grad_norm_np( + grad, clip_grad_max_norm, 
clip_grad_norm_type + ) grad = grad + l2 * x @@ -144,7 +146,9 @@ def np_train_one_iter(step, grad): of_res = train_by_oneflow().numpy() np_res = train_by_numpy() - test_case.assertTrue(np.allclose(of_res.flatten(), np_res.flatten(), rtol=1e-3, atol=1e-3)) + test_case.assertTrue( + np.allclose(of_res.flatten(), np_res.flatten(), rtol=1e-3, atol=1e-3) + ) @flow.unittest.skip_unless_1n1d() From 74f1dfa079ec32883890a8046cc9fb07d30a6016 Mon Sep 17 00:00:00 2001 From: l1aoxingyu Date: Fri, 28 Jan 2022 15:44:05 +0800 Subject: [PATCH 8/8] add utils --- python/oneflow/optim/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/oneflow/optim/__init__.py b/python/oneflow/optim/__init__.py index a4f995d3c5d..0484175bafa 100644 --- a/python/oneflow/optim/__init__.py +++ b/python/oneflow/optim/__init__.py @@ -22,3 +22,4 @@ from oneflow.nn.optimizer.lamb import LAMB from . import lr_scheduler +from . import utils
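
As a reading aid for the update equations documented in [PATCH 4/8] and for the NumPy reference used by the tests, here is a minimal NumPy sketch of one LAMB step. The function name is hypothetical, and the exact placement of weight decay and bias correction follows the common LAMB formulation; it is an illustrative assumption, not a transcription of the `lamb_update` kernel.

.. code-block:: python

    import numpy as np

    def lamb_step_np(param, grad, m, v, step, lr=1e-3, betas=(0.9, 0.999),
                     eps=1e-8, weight_decay=0.0, adam_w_mode=True,
                     do_bias_correction=True):
        # One LAMB update on NumPy arrays: moments -> normalized update
        # -> trust ratio -> parameter step, as in the docstring equations.
        beta1, beta2 = betas
        if not adam_w_mode:
            # Plain L2 mode: fold the penalty into the gradient before the moments.
            grad = grad + weight_decay * param
        m = beta1 * m + (1.0 - beta1) * grad
        v = beta2 * v + (1.0 - beta2) * grad * grad
        if do_bias_correction:
            bias_correction1 = 1.0 - beta1 ** (step + 1)
            bias_correction2 = 1.0 - beta2 ** (step + 1)
        else:
            bias_correction1 = bias_correction2 = 1.0
        update = (m / bias_correction1) / (np.sqrt(v / bias_correction2) + eps)
        if adam_w_mode:
            # Decoupled (AdamW-style) weight decay joins the update direction.
            update = update + weight_decay * param
        w_norm = np.linalg.norm(param)
        u_norm = np.linalg.norm(update)
        # Trust ratio r = ||param|| / ||update||, falling back to 1.0 on zero norms.
        trust_ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
        return param - lr * trust_ratio * update, m, v

    # Tiny usage: ten steps on a random problem.
    param = np.ones(4, dtype=np.float32)
    m, v = np.zeros_like(param), np.zeros_like(param)
    for step in range(10):
        grad = np.random.uniform(size=4).astype(np.float32)
        param, m, v = lamb_step_np(param, grad, m, v, step, weight_decay=1e-3)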
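
The test matrix sweeps `clip_grad_max_norm` and `clip_grad_norm_type` through the `clip_grad_norm_np` helper before the LAMB step. Below is a hedged sketch of that kind of global-norm clipping for a single gradient array, assuming the usual PyTorch-style convention (gradients are only ever scaled down, never up); the `p <= 0` entries of the test matrix are omitted, and the name and edge-case handling are assumptions rather than the helper's actual code.

.. code-block:: python

    import numpy as np

    def clip_grad_by_global_norm(grad, max_norm, norm_type=2.0):
        # Compute the requested norm of `grad`, then rescale the gradient
        # if that norm exceeds `max_norm`.
        if norm_type in (float("inf"), "inf"):
            total_norm = np.max(np.abs(grad))
        elif norm_type in (float("-inf"), "-inf"):
            total_norm = np.min(np.abs(grad))
        else:  # any p > 0, e.g. 1.0, 2.0, 3.5
            total_norm = np.sum(np.abs(grad) ** norm_type) ** (1.0 / norm_type)
        clip_coef = max_norm / (total_norm + 1e-6)
        if clip_coef < 1.0:
            grad = grad * clip_coef
        return total_norm, grad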
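
The test updates that add `reload_state_step` and `save_load_by_pickle` also cover reloading optimizer state mid-training. A minimal usage sketch of the same flow outside the test harness; `net` is only a placeholder module and the hyperparameters are arbitrary.

.. code-block:: python

    import tempfile

    import oneflow as flow

    net = flow.nn.Linear(4, 4)  # stands in for any flow.nn.Module
    lamb = flow.optim.LAMB(net.parameters(), lr=1e-3)

    # ... run some steps: loss.backward(); lamb.step(); lamb.zero_grad() ...

    with tempfile.TemporaryDirectory() as save_dir:
        flow.save(lamb.state_dict(), save_dir)         # directory-style save, as in the test
        restored = flow.optim.LAMB(net.parameters(), lr=1e-3)
        restored.load_state_dict(flow.load(save_dir))  # moments and step counter are restored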