From de60c1d10f86a9ee6a9af725007ad35148da75fe Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Tue, 27 Jun 2023 11:46:53 +0800 Subject: [PATCH] replace NaturalExpDecay, ExponentialDecay, InverseTimeDecay with 2.0 version (#54424) * remove the NaturalExpDecay in fluid * fix bug * remove the ExponentialDecay in fluid * remove the InverseTimeDecay in fluid * remove the InverseTimeDecay class * fix bug --- .../distributed/passes/ps_server_pass.py | 7 +- .../fluid/dygraph/learning_rate_scheduler.py | 252 ------------------ .../fluid/layers/learning_rate_scheduler.py | 12 +- .../test_basic_api_transformation.py | 75 +++++- test/legacy_test/dist_ctr.py | 6 +- test/legacy_test/test_dist_fleet_ps5.py | 6 +- test/legacy_test/test_dist_transpiler.py | 12 +- test/legacy_test/test_imperative_optimizer.py | 37 ++- .../test_learning_rate_scheduler.py | 18 +- 9 files changed, 111 insertions(+), 314 deletions(-) diff --git a/python/paddle/distributed/passes/ps_server_pass.py b/python/paddle/distributed/passes/ps_server_pass.py index 3cc1a14dbcd91..4e4377f328f3d 100755 --- a/python/paddle/distributed/passes/ps_server_pass.py +++ b/python/paddle/distributed/passes/ps_server_pass.py @@ -18,7 +18,6 @@ from paddle.fluid.layers.learning_rate_scheduler import ( exponential_decay, inverse_time_decay, - natural_exp_decay, noam_decay, ) from paddle.optimizer.lr import ( @@ -112,9 +111,9 @@ def _get_lr_scheduler_program(self, lr_scheduler, lr_decay_steps): with paddle.static.program_guard( decay_main_program, decay_startup_program ): - lr = natural_exp_decay( - 1.0, lr_decay_steps, lr_scheduler.gamma, True - ) + lr = paddle.optimizer.lr.NaturalExpDecay( + 1.0, lr_scheduler.gamma + ).get_lr() lr_name = lr.name logging.warn( "NaturalExpDecay is set, staircase = True, global learning rate decay step is [ %d ], Change decay steps as follow: \n" diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 2da10b2e52dce..079bf865df6f0 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -24,9 +24,6 @@ __all__ = [ 'NoamDecay', 'PiecewiseDecay', - 'NaturalExpDecay', - 'ExponentialDecay', - 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay', 'LinearLrWarmup', @@ -197,255 +194,6 @@ def step(self): return self.create_lr_var(self.vars[len(self.values) - 1]) -class NaturalExpDecay(LearningRateDecay): - r""" - :api_attr: imperative - - Applies natural exponential decay to the initial learning rate. - - The algorithm can be described as following. - - .. math:: - - decayed\_learning\_rate = learning\_rate * e^{y} - - If staircase is set to False, then: - - .. math:: - - y = - decay\_rate * \\frac{global\_step}{decay\_steps} - - If staircase is set to True, then: - - .. math:: - - y = - decay\_rate * math.floor(\\frac{global\_step}{decay\_steps}) - - Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be - float32 or float64. It also can be set to python int number. - decay_steps(int): The decay step size. It determines the decay cycle. - decay_rate(int): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The - default value is False. - begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. 
- step(int, optional): The step size used to calculate the new global_step in the description above. - The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as - 'float32', 'float64'. The default value is 'float32'. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - base_lr = 0.1 - with fluid.dygraph.guard(): - emb = paddle.nn.Embedding(10, 10) - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=emb.parameters()) - - """ - - def __init__( - self, - learning_rate, - decay_steps, - decay_rate, - staircase=False, - begin=0, - step=1, - dtype='float32', - ): - super().__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - - def step(self): - div_res = self.create_lr_var(self.step_num / self.decay_steps) - if self.staircase: - div_res = paddle.floor(div_res) - decayed_lr = self.learning_rate * paddle.exp( - -1 * self.decay_rate * div_res - ) - - return decayed_lr - - -class ExponentialDecay(LearningRateDecay): - r""" - :api_attr: imperative - - Applies exponential decay to the learning rate. - - The algorithm can be described as following. - - .. math:: - - decayed\_learning\_rate = learning\_rate * decay\_rate ^ y - - If staircase is set to False, then: - - .. math:: - - y = \\frac{global\_step}{decay\_steps} - - If staircase is set to True, then: - - .. math:: - - y = math.floor(\\frac{global\_step}{decay\_steps}) - - - Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be - float32 or float64. It also can be set to python int number. - decay_steps(int): The decay step size. It determines the decay cycle. - decay_rate(float): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The - default value is False. - begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. - step(int, optional): The step size used to calculate the new global_step in the description above. - The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as - 'float32', 'float64'. The default value is 'float32'. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - base_lr = 0.1 - with fluid.dygraph.guard(): - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.ExponentialDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - """ - - def __init__( - self, - learning_rate, - decay_steps, - decay_rate, - staircase=False, - begin=0, - step=1, - dtype='float32', - ): - super().__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - - def step(self): - div_res = self.create_lr_var(self.step_num / self.decay_steps) - if self.staircase: - div_res = paddle.floor(div_res) - - decayed_lr = self.learning_rate * (self.decay_rate**div_res) - - return decayed_lr - - -class InverseTimeDecay(LearningRateDecay): - r""" - :api_attr: imperative - - Applies inverse time decay to the initial learning rate. 
- - The algorithm can be described as following. - If staircase is set to False, then: - - .. math:: - - decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * \\frac{global\_step}{decay\_step}} - - If staircase is set to True, then: - - .. math:: - - decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * math.floor(\\frac{global\_step}{decay\_step})} - - Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be - float32 or float64. It also can be set to python int number. - decay_steps(int): The decay step size. It determines the decay cycle. - decay_rate(float): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The - default value is False. - begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. - step(int, optional): The step size used to calculate the new global_step in the description above. - The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be - 'float32', 'float64'. The default value is 'float32'. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - base_lr = 0.1 - with fluid.dygraph.guard(): - emb = paddle.nn.Embedding(10, 10) - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.InverseTimeDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list = emb.parameters()) - - """ - - def __init__( - self, - learning_rate, - decay_steps, - decay_rate, - staircase=False, - begin=0, - step=1, - dtype='float32', - ): - super().__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - - def step(self): - div_res = self.create_lr_var(self.step_num / self.decay_steps) - if self.staircase: - div_res = paddle.floor(div_res) - - decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res) - - return decayed_lr - - class PolynomialDecay(LearningRateDecay): r""" :api_attr: imperative diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 4e01297d76a81..37f61d351e622 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -166,8 +166,8 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): if in_dygraph_mode(): - decay = imperate_lr.ExponentialDecay( - learning_rate, decay_steps, decay_rate, staircase + decay = paddle.optimizer.lr.ExponentialDecay( + learning_rate, decay_rate ) return decay else: @@ -228,8 +228,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): if in_dygraph_mode(): - decay = imperate_lr.NaturalExpDecay( - learning_rate, decay_steps, decay_rate, staircase + decay = paddle.optimizer.lr.NaturalExpDecay( + learning_rate, decay_rate ) return decay else: @@ -288,8 +288,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): if in_dygraph_mode(): - decay = imperate_lr.InverseTimeDecay( - learning_rate, decay_steps, decay_rate, staircase + decay = 
paddle.optimizer.lr.InverseTimeDecay( + learning_rate, decay_rate ) return decay else: diff --git a/test/dygraph_to_static/test_basic_api_transformation.py b/test/dygraph_to_static/test_basic_api_transformation.py index ea6cb541ff99e..0abe6bcb5194a 100644 --- a/test/dygraph_to_static/test_basic_api_transformation.py +++ b/test/dygraph_to_static/test_basic_api_transformation.py @@ -342,8 +342,8 @@ def dyfunc_CosineDecay(): def dyfunc_ExponentialDecay(): base_lr = 0.1 - exponential_decay = fluid.dygraph.ExponentialDecay( - learning_rate=base_lr, decay_steps=10000, decay_rate=0.5, staircase=True + exponential_decay = paddle.optimizer.lr.ExponentialDecay( + learning_rate=base_lr, gamma=0.5 ) lr = exponential_decay() return lr @@ -351,8 +351,8 @@ def dyfunc_ExponentialDecay(): def dyfunc_InverseTimeDecay(): base_lr = 0.1 - inverse_time_decay = fluid.dygraph.InverseTimeDecay( - learning_rate=base_lr, decay_steps=10000, decay_rate=0.5, staircase=True + inverse_time_decay = paddle.optimizer.lr.InverseTimeDecay( + learning_rate=base_lr, gamma=0.5 ) lr = inverse_time_decay() return lr @@ -360,8 +360,8 @@ def dyfunc_InverseTimeDecay(): def dyfunc_NaturalExpDecay(): base_lr = 0.1 - natural_exp_decay = fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, decay_steps=10000, decay_rate=0.5, staircase=True + natural_exp_decay = paddle.optimizer.lr.NaturalExpDecay( + learning_rate=base_lr, gamma=0.5 ) lr = natural_exp_decay() return lr @@ -424,16 +424,79 @@ class TestDygraphBasicApi_ExponentialDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): self.dygraph_func = dyfunc_ExponentialDecay + def get_dygraph_output(self): + with fluid.dygraph.guard(): + fluid.default_startup_program.random_seed = SEED + fluid.default_main_program.random_seed = SEED + res = self.dygraph_func() + return res + + def get_static_output(self): + startup_program = fluid.Program() + startup_program.random_seed = SEED + main_program = fluid.Program() + main_program.random_seed = SEED + with fluid.program_guard(main_program, startup_program): + static_out = dygraph_to_static_func(self.dygraph_func)() + static_out = paddle.to_tensor(static_out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + static_res = exe.run(main_program, fetch_list=static_out) + return static_res[0] + class TestDygraphBasicApi_InverseTimeDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): self.dygraph_func = dyfunc_InverseTimeDecay + def get_dygraph_output(self): + with fluid.dygraph.guard(): + fluid.default_startup_program.random_seed = SEED + fluid.default_main_program.random_seed = SEED + res = self.dygraph_func() + return res + + def get_static_output(self): + startup_program = fluid.Program() + startup_program.random_seed = SEED + main_program = fluid.Program() + main_program.random_seed = SEED + with fluid.program_guard(main_program, startup_program): + static_out = dygraph_to_static_func(self.dygraph_func)() + static_out = paddle.to_tensor(static_out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + static_res = exe.run(main_program, fetch_list=static_out) + return static_res[0] + class TestDygraphBasicApi_NaturalExpDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): self.dygraph_func = dyfunc_NaturalExpDecay + def get_dygraph_output(self): + with fluid.dygraph.guard(): + fluid.default_startup_program.random_seed = SEED + fluid.default_main_program.random_seed = SEED + res = self.dygraph_func() + return res + + def get_static_output(self): + startup_program = fluid.Program() + 
startup_program.random_seed = SEED + main_program = fluid.Program() + main_program.random_seed = SEED + with fluid.program_guard(main_program, startup_program): + static_out = dygraph_to_static_func(self.dygraph_func)() + static_out = paddle.to_tensor(static_out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + static_res = exe.run(main_program, fetch_list=static_out) + return static_res[0] + class TestDygraphBasicApi_NoamDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): diff --git a/test/legacy_test/dist_ctr.py b/test/legacy_test/dist_ctr.py index f6b33319a9798..4056e5bc2285e 100644 --- a/test/legacy_test/dist_ctr.py +++ b/test/legacy_test/dist_ctr.py @@ -117,11 +117,9 @@ def get_model(self, batch_size=2): use_lr_decay = bool(os.getenv('LR_DECAY', 0)) lr = 0.0001 if use_lr_decay: - lr = fluid.layers.exponential_decay( + lr = paddle.optimizer.lr.ExponentialDecay( learning_rate=0.0001, - decay_steps=10000, - decay_rate=0.999, - staircase=True, + gamma=0.999, ) sgd_optimizer = fluid.optimizer.SGD( diff --git a/test/legacy_test/test_dist_fleet_ps5.py b/test/legacy_test/test_dist_fleet_ps5.py index 84ae9492b6511..5eeab8dac7443 100644 --- a/test/legacy_test/test_dist_fleet_ps5.py +++ b/test/legacy_test/test_dist_fleet_ps5.py @@ -195,11 +195,9 @@ def test(self): loss, acc, _ = self.net() optimizer = fluid.optimizer.Adam( - learning_rate=fluid.layers.exponential_decay( + learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=base_lr, - decay_steps=500, - decay_rate=0.969, - staircase=True, + gamma=0.969, ) ) diff --git a/test/legacy_test/test_dist_transpiler.py b/test/legacy_test/test_dist_transpiler.py index 2c83cb2a718e6..ed23ecd294e44 100644 --- a/test/legacy_test/test_dist_transpiler.py +++ b/test/legacy_test/test_dist_transpiler.py @@ -307,11 +307,9 @@ def net_conf(self): cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( + learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=1.0, - decay_steps=2100, - decay_rate=0.1, - staircase=True, + gamma=0.1, ) ) sgd_optimizer.minimize(avg_cost) @@ -444,11 +442,9 @@ def net_conf(self): avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( + learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=1.0, - decay_steps=2100, - decay_rate=0.1, - staircase=True, + gamma=0.1, ) ) sgd_optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_imperative_optimizer.py b/test/legacy_test/test_imperative_optimizer.py index adecc79758d13..514dd318ed238 100644 --- a/test/legacy_test/test_imperative_optimizer.py +++ b/test/legacy_test/test_imperative_optimizer.py @@ -499,25 +499,26 @@ def test_lr_decay_natural_exp(self): loss = paddle.mean(b) base_lr = 1.0 - adam = fluid.optimizer.Adam( - fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, - decay_steps=3, - decay_rate=0.5, - staircase=True, - ), - parameter_list=linear.parameters(), + scheduler = paddle.optimizer.lr.NaturalExpDecay( + learning_rate=base_lr, + gamma=0.5, ) - - np.testing.assert_allclose( - adam.current_step_lr(), 1.0, rtol=1e-06, atol=0.0 + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=linear.parameters(), ) + np.testing.assert_allclose(adam.get_lr(), 1.0, rtol=1e-06, atol=0.0) + ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)] + counter = 0 for i in range(5): adam.minimize(loss) - lr = adam.current_step_lr() - + 
lr = adam.get_lr() + counter += 1 + if counter % 3 == 0: + adam.step() + scheduler.step() np.testing.assert_allclose(lr, ret[i], rtol=1e-06, atol=0.0) def test_set_lr(self): @@ -550,14 +551,12 @@ def test_set_lr(self): np.testing.assert_allclose(lr, 0.7, rtol=1e-06, atol=0.0) with self.assertRaises(RuntimeError): - adam = fluid.optimizer.Adam( - fluid.dygraph.NaturalExpDecay( + adam = paddle.optimizer.Adam( + paddle.optimizer.lr.NaturalExpDecay( learning_rate=0.1, - decay_steps=3, - decay_rate=0.5, - staircase=True, + gamma=0.5, ), - parameter_list=linear.parameters(), + parameters=linear.parameters(), ) adam.set_lr(0.01) diff --git a/test/legacy_test/test_learning_rate_scheduler.py b/test/legacy_test/test_learning_rate_scheduler.py index a6c2f5793a603..f1cc1fe81b7ae 100644 --- a/test/legacy_test/test_learning_rate_scheduler.py +++ b/test/legacy_test/test_learning_rate_scheduler.py @@ -123,11 +123,9 @@ def test_LR_state_dict(self): linear = paddle.nn.Linear(10, 10) input = fluid.dygraph.to_variable(x) - Exponential_scheduler = fluid.dygraph.ExponentialDecay( + Exponential_scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, + gamma=0.5, ) Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3) Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau( @@ -161,11 +159,9 @@ def test_LR_state_dict(self): paddle.save(linear.state_dict(), "save_path.pdparams") - Exponential_scheduler_test = fluid.dygraph.ExponentialDecay( + Exponential_scheduler_test = paddle.optimizer.lr.ExponentialDecay( learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, + gamma=0.5, ) Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3) Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau( @@ -180,9 +176,9 @@ def test_LR_state_dict(self): ) adam_test.set_dict(opt_state) self.assertEqual( - adam_test._learning_rate.step_num, - adam1._learning_rate.step_num, - "epoch_num is different before and after set_dict", + adam_test._learning_rate.last_epoch, + adam1._learning_rate.last_epoch, + "last_epoch is different before and after set_dict", ) paddle.save(adam2.state_dict(), "save_path.pdopt")
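
Usage note: the paddle.optimizer.lr schedulers this patch switches to take only learning_rate and gamma; the old decay_steps/staircase arguments are gone, and the decay interval is now controlled by how often scheduler.step() is called (which is why test_lr_decay_natural_exp above steps the scheduler every third iteration to reproduce staircase=True with decay_steps=3). Below is a minimal sketch of the new usage pattern; the toy paddle.nn.Linear model, the random data, and the training loop are illustrative assumptions, not code taken from the patch.

    # Illustrative sketch only: the model, data, and loop are made up;
    # the scheduler/optimizer wiring mirrors what this patch migrates to.
    import numpy as np
    import paddle

    linear = paddle.nn.Linear(10, 10)

    # 2.0-style scheduler: only learning_rate and gamma are required.
    scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=1.0, gamma=0.5)
    adam = paddle.optimizer.Adam(
        learning_rate=scheduler, parameters=linear.parameters()
    )

    for epoch in range(3):
        x = paddle.to_tensor(
            np.random.uniform(-1, 1, [4, 10]).astype('float32')
        )
        loss = paddle.mean(linear(x))
        loss.backward()
        adam.step()
        adam.clear_grad()
        scheduler.step()  # lr = 1.0 * exp(-0.5 * epoch), stepped once per epoch
        print(adam.get_lr())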