From de60c1d10f86a9ee6a9af725007ad35148da75fe Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Tue, 27 Jun 2023 11:46:53 +0800 Subject: [PATCH] replace NaturalExpDecay, ExponentialDecay, InverseTimeDecay with 2.0 version (#54424) * remove the NaturalExpDecay in fluid * fix bug * remove the ExponentialDecay in fluid * remove the InverseTimeDecay in fluid * remove the InverseTimeDecay class * fix bug --- .../distributed/passes/ps_server_pass.py | 7 +- .../fluid/dygraph/learning_rate_scheduler.py | 252 ------------------ .../fluid/layers/learning_rate_scheduler.py | 12 +- .../test_basic_api_transformation.py | 75 +++++- test/legacy_test/dist_ctr.py | 6 +- test/legacy_test/test_dist_fleet_ps5.py | 6 +- test/legacy_test/test_dist_transpiler.py | 12 +- test/legacy_test/test_imperative_optimizer.py | 37 ++- .../test_learning_rate_scheduler.py | 18 +- 9 files changed, 111 insertions(+), 314 deletions(-) diff --git a/python/paddle/distributed/passes/ps_server_pass.py b/python/paddle/distributed/passes/ps_server_pass.py index 3cc1a14dbcd91..4e4377f328f3d 100755 --- a/python/paddle/distributed/passes/ps_server_pass.py +++ b/python/paddle/distributed/passes/ps_server_pass.py @@ -18,7 +18,6 @@ from paddle.fluid.layers.learning_rate_scheduler import ( exponential_decay, inverse_time_decay, - natural_exp_decay, noam_decay, ) from paddle.optimizer.lr import ( @@ -112,9 +111,9 @@ def _get_lr_scheduler_program(self, lr_scheduler, lr_decay_steps): with paddle.static.program_guard( decay_main_program, decay_startup_program ): - lr = natural_exp_decay( - 1.0, lr_decay_steps, lr_scheduler.gamma, True - ) + lr = paddle.optimizer.lr.NaturalExpDecay( + 1.0, lr_scheduler.gamma + ).get_lr() lr_name = lr.name logging.warn( "NaturalExpDecay is set, staircase = True, global learning rate decay step is [ %d ], Change decay steps as follow: \n" diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 2da10b2e52dce..079bf865df6f0 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -24,9 +24,6 @@ __all__ = [ 'NoamDecay', 'PiecewiseDecay', - 'NaturalExpDecay', - 'ExponentialDecay', - 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay', 'LinearLrWarmup', @@ -197,255 +194,6 @@ def step(self): return self.create_lr_var(self.vars[len(self.values) - 1]) -class NaturalExpDecay(LearningRateDecay): - r""" - :api_attr: imperative - - Applies natural exponential decay to the initial learning rate. - - The algorithm can be described as following. - - .. math:: - - decayed\_learning\_rate = learning\_rate * e^{y} - - If staircase is set to False, then: - - .. math:: - - y = - decay\_rate * \\frac{global\_step}{decay\_steps} - - If staircase is set to True, then: - - .. math:: - - y = - decay\_rate * math.floor(\\frac{global\_step}{decay\_steps}) - - Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be - float32 or float64. It also can be set to python int number. - decay_steps(int): The decay step size. It determines the decay cycle. - decay_rate(int): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The - default value is False. - begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. 
- step(int, optional): The step size used to calculate the new global_step in the description above. - The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as - 'float32', 'float64'. The default value is 'float32'. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - base_lr = 0.1 - with fluid.dygraph.guard(): - emb = paddle.nn.Embedding(10, 10) - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list=emb.parameters()) - - """ - - def __init__( - self, - learning_rate, - decay_steps, - decay_rate, - staircase=False, - begin=0, - step=1, - dtype='float32', - ): - super().__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - - def step(self): - div_res = self.create_lr_var(self.step_num / self.decay_steps) - if self.staircase: - div_res = paddle.floor(div_res) - decayed_lr = self.learning_rate * paddle.exp( - -1 * self.decay_rate * div_res - ) - - return decayed_lr - - -class ExponentialDecay(LearningRateDecay): - r""" - :api_attr: imperative - - Applies exponential decay to the learning rate. - - The algorithm can be described as following. - - .. math:: - - decayed\_learning\_rate = learning\_rate * decay\_rate ^ y - - If staircase is set to False, then: - - .. math:: - - y = \\frac{global\_step}{decay\_steps} - - If staircase is set to True, then: - - .. math:: - - y = math.floor(\\frac{global\_step}{decay\_steps}) - - - Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be - float32 or float64. It also can be set to python int number. - decay_steps(int): The decay step size. It determines the decay cycle. - decay_rate(float): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The - default value is False. - begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. - step(int, optional): The step size used to calculate the new global_step in the description above. - The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be set as - 'float32', 'float64'. The default value is 'float32'. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - base_lr = 0.1 - with fluid.dygraph.guard(): - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.ExponentialDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - """ - - def __init__( - self, - learning_rate, - decay_steps, - decay_rate, - staircase=False, - begin=0, - step=1, - dtype='float32', - ): - super().__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - - def step(self): - div_res = self.create_lr_var(self.step_num / self.decay_steps) - if self.staircase: - div_res = paddle.floor(div_res) - - decayed_lr = self.learning_rate * (self.decay_rate**div_res) - - return decayed_lr - - -class InverseTimeDecay(LearningRateDecay): - r""" - :api_attr: imperative - - Applies inverse time decay to the initial learning rate. 
- - The algorithm can be described as following. - If staircase is set to False, then: - - .. math:: - - decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * \\frac{global\_step}{decay\_step}} - - If staircase is set to True, then: - - .. math:: - - decayed\_learning\_rate = \\frac{learning\_rate}{1 + decay\_rate * math.floor(\\frac{global\_step}{decay\_step})} - - Parameters: - learning_rate(Variable|float): The initial learning rate. If the type - is Variable, it's a tensor with shape [1], the data type can be - float32 or float64. It also can be set to python int number. - decay_steps(int): The decay step size. It determines the decay cycle. - decay_rate(float): The decay rate. - staircase(bool, optional): If set to True, decay the learning rate at discrete intervals. The - default value is False. - begin(int, optional): The begin step. The initial value of global_step described above. The default value is 0. - step(int, optional): The step size used to calculate the new global_step in the description above. - The default value is 1. - dtype(str, optional): The data type used to create the learning rate variable. The data type can be - 'float32', 'float64'. The default value is 'float32'. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - base_lr = 0.1 - with fluid.dygraph.guard(): - emb = paddle.nn.Embedding(10, 10) - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.InverseTimeDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True), - parameter_list = emb.parameters()) - - """ - - def __init__( - self, - learning_rate, - decay_steps, - decay_rate, - staircase=False, - begin=0, - step=1, - dtype='float32', - ): - super().__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.decay_steps = decay_steps - self.decay_rate = decay_rate - self.staircase = staircase - - def step(self): - div_res = self.create_lr_var(self.step_num / self.decay_steps) - if self.staircase: - div_res = paddle.floor(div_res) - - decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res) - - return decayed_lr - - class PolynomialDecay(LearningRateDecay): r""" :api_attr: imperative diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 4e01297d76a81..37f61d351e622 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -166,8 +166,8 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): if in_dygraph_mode(): - decay = imperate_lr.ExponentialDecay( - learning_rate, decay_steps, decay_rate, staircase + decay = paddle.optimizer.lr.ExponentialDecay( + learning_rate, decay_rate ) return decay else: @@ -228,8 +228,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): if in_dygraph_mode(): - decay = imperate_lr.NaturalExpDecay( - learning_rate, decay_steps, decay_rate, staircase + decay = paddle.optimizer.lr.NaturalExpDecay( + learning_rate, decay_rate ) return decay else: @@ -288,8 +288,8 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): if in_dygraph_mode(): - decay = imperate_lr.InverseTimeDecay( - learning_rate, decay_steps, decay_rate, staircase + decay = 
paddle.optimizer.lr.InverseTimeDecay( + learning_rate, decay_rate ) return decay else: diff --git a/test/dygraph_to_static/test_basic_api_transformation.py b/test/dygraph_to_static/test_basic_api_transformation.py index ea6cb541ff99e..0abe6bcb5194a 100644 --- a/test/dygraph_to_static/test_basic_api_transformation.py +++ b/test/dygraph_to_static/test_basic_api_transformation.py @@ -342,8 +342,8 @@ def dyfunc_CosineDecay(): def dyfunc_ExponentialDecay(): base_lr = 0.1 - exponential_decay = fluid.dygraph.ExponentialDecay( - learning_rate=base_lr, decay_steps=10000, decay_rate=0.5, staircase=True + exponential_decay = paddle.optimizer.lr.ExponentialDecay( + learning_rate=base_lr, gamma=0.5 ) lr = exponential_decay() return lr @@ -351,8 +351,8 @@ def dyfunc_ExponentialDecay(): def dyfunc_InverseTimeDecay(): base_lr = 0.1 - inverse_time_decay = fluid.dygraph.InverseTimeDecay( - learning_rate=base_lr, decay_steps=10000, decay_rate=0.5, staircase=True + inverse_time_decay = paddle.optimizer.lr.InverseTimeDecay( + learning_rate=base_lr, gamma=0.5 ) lr = inverse_time_decay() return lr @@ -360,8 +360,8 @@ def dyfunc_InverseTimeDecay(): def dyfunc_NaturalExpDecay(): base_lr = 0.1 - natural_exp_decay = fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, decay_steps=10000, decay_rate=0.5, staircase=True + natural_exp_decay = paddle.optimizer.lr.NaturalExpDecay( + learning_rate=base_lr, gamma=0.5 ) lr = natural_exp_decay() return lr @@ -424,16 +424,79 @@ class TestDygraphBasicApi_ExponentialDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): self.dygraph_func = dyfunc_ExponentialDecay + def get_dygraph_output(self): + with fluid.dygraph.guard(): + fluid.default_startup_program.random_seed = SEED + fluid.default_main_program.random_seed = SEED + res = self.dygraph_func() + return res + + def get_static_output(self): + startup_program = fluid.Program() + startup_program.random_seed = SEED + main_program = fluid.Program() + main_program.random_seed = SEED + with fluid.program_guard(main_program, startup_program): + static_out = dygraph_to_static_func(self.dygraph_func)() + static_out = paddle.to_tensor(static_out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + static_res = exe.run(main_program, fetch_list=static_out) + return static_res[0] + class TestDygraphBasicApi_InverseTimeDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): self.dygraph_func = dyfunc_InverseTimeDecay + def get_dygraph_output(self): + with fluid.dygraph.guard(): + fluid.default_startup_program.random_seed = SEED + fluid.default_main_program.random_seed = SEED + res = self.dygraph_func() + return res + + def get_static_output(self): + startup_program = fluid.Program() + startup_program.random_seed = SEED + main_program = fluid.Program() + main_program.random_seed = SEED + with fluid.program_guard(main_program, startup_program): + static_out = dygraph_to_static_func(self.dygraph_func)() + static_out = paddle.to_tensor(static_out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + static_res = exe.run(main_program, fetch_list=static_out) + return static_res[0] + class TestDygraphBasicApi_NaturalExpDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): self.dygraph_func = dyfunc_NaturalExpDecay + def get_dygraph_output(self): + with fluid.dygraph.guard(): + fluid.default_startup_program.random_seed = SEED + fluid.default_main_program.random_seed = SEED + res = self.dygraph_func() + return res + + def get_static_output(self): + startup_program = fluid.Program() + 
startup_program.random_seed = SEED + main_program = fluid.Program() + main_program.random_seed = SEED + with fluid.program_guard(main_program, startup_program): + static_out = dygraph_to_static_func(self.dygraph_func)() + static_out = paddle.to_tensor(static_out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + static_res = exe.run(main_program, fetch_list=static_out) + return static_res[0] + class TestDygraphBasicApi_NoamDecay(TestDygraphBasicApi_CosineDecay): def setUp(self): diff --git a/test/legacy_test/dist_ctr.py b/test/legacy_test/dist_ctr.py index f6b33319a9798..4056e5bc2285e 100644 --- a/test/legacy_test/dist_ctr.py +++ b/test/legacy_test/dist_ctr.py @@ -117,11 +117,9 @@ def get_model(self, batch_size=2): use_lr_decay = bool(os.getenv('LR_DECAY', 0)) lr = 0.0001 if use_lr_decay: - lr = fluid.layers.exponential_decay( + lr = paddle.optimizer.lr.ExponentialDecay( learning_rate=0.0001, - decay_steps=10000, - decay_rate=0.999, - staircase=True, + gamma=0.999, ) sgd_optimizer = fluid.optimizer.SGD( diff --git a/test/legacy_test/test_dist_fleet_ps5.py b/test/legacy_test/test_dist_fleet_ps5.py index 84ae9492b6511..5eeab8dac7443 100644 --- a/test/legacy_test/test_dist_fleet_ps5.py +++ b/test/legacy_test/test_dist_fleet_ps5.py @@ -195,11 +195,9 @@ def test(self): loss, acc, _ = self.net() optimizer = fluid.optimizer.Adam( - learning_rate=fluid.layers.exponential_decay( + learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=base_lr, - decay_steps=500, - decay_rate=0.969, - staircase=True, + gamma=0.969, ) ) diff --git a/test/legacy_test/test_dist_transpiler.py b/test/legacy_test/test_dist_transpiler.py index 2c83cb2a718e6..ed23ecd294e44 100644 --- a/test/legacy_test/test_dist_transpiler.py +++ b/test/legacy_test/test_dist_transpiler.py @@ -307,11 +307,9 @@ def net_conf(self): cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( + learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=1.0, - decay_steps=2100, - decay_rate=0.1, - staircase=True, + gamma=0.1, ) ) sgd_optimizer.minimize(avg_cost) @@ -444,11 +442,9 @@ def net_conf(self): avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( + learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=1.0, - decay_steps=2100, - decay_rate=0.1, - staircase=True, + gamma=0.1, ) ) sgd_optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_imperative_optimizer.py b/test/legacy_test/test_imperative_optimizer.py index adecc79758d13..514dd318ed238 100644 --- a/test/legacy_test/test_imperative_optimizer.py +++ b/test/legacy_test/test_imperative_optimizer.py @@ -499,25 +499,26 @@ def test_lr_decay_natural_exp(self): loss = paddle.mean(b) base_lr = 1.0 - adam = fluid.optimizer.Adam( - fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, - decay_steps=3, - decay_rate=0.5, - staircase=True, - ), - parameter_list=linear.parameters(), + scheduler = paddle.optimizer.lr.NaturalExpDecay( + learning_rate=base_lr, + gamma=0.5, ) - - np.testing.assert_allclose( - adam.current_step_lr(), 1.0, rtol=1e-06, atol=0.0 + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=linear.parameters(), ) + np.testing.assert_allclose(adam.get_lr(), 1.0, rtol=1e-06, atol=0.0) + ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)] + counter = 0 for i in range(5): adam.minimize(loss) - lr = adam.current_step_lr() - + 
lr = adam.get_lr() + counter += 1 + if counter % 3 == 0: + adam.step() + scheduler.step() np.testing.assert_allclose(lr, ret[i], rtol=1e-06, atol=0.0) def test_set_lr(self): @@ -550,14 +551,12 @@ def test_set_lr(self): np.testing.assert_allclose(lr, 0.7, rtol=1e-06, atol=0.0) with self.assertRaises(RuntimeError): - adam = fluid.optimizer.Adam( - fluid.dygraph.NaturalExpDecay( + adam = paddle.optimizer.Adam( + paddle.optimizer.lr.NaturalExpDecay( learning_rate=0.1, - decay_steps=3, - decay_rate=0.5, - staircase=True, + gamma=0.5, ), - parameter_list=linear.parameters(), + parameters=linear.parameters(), ) adam.set_lr(0.01) diff --git a/test/legacy_test/test_learning_rate_scheduler.py b/test/legacy_test/test_learning_rate_scheduler.py index a6c2f5793a603..f1cc1fe81b7ae 100644 --- a/test/legacy_test/test_learning_rate_scheduler.py +++ b/test/legacy_test/test_learning_rate_scheduler.py @@ -123,11 +123,9 @@ def test_LR_state_dict(self): linear = paddle.nn.Linear(10, 10) input = fluid.dygraph.to_variable(x) - Exponential_scheduler = fluid.dygraph.ExponentialDecay( + Exponential_scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, + gamma=0.5, ) Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3) Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau( @@ -161,11 +159,9 @@ def test_LR_state_dict(self): paddle.save(linear.state_dict(), "save_path.pdparams") - Exponential_scheduler_test = fluid.dygraph.ExponentialDecay( + Exponential_scheduler_test = paddle.optimizer.lr.ExponentialDecay( learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, + gamma=0.5, ) Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3) Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau( @@ -180,9 +176,9 @@ def test_LR_state_dict(self): ) adam_test.set_dict(opt_state) self.assertEqual( - adam_test._learning_rate.step_num, - adam1._learning_rate.step_num, - "epoch_num is different before and after set_dict", + adam_test._learning_rate.last_epoch, + adam1._learning_rate.last_epoch, + "last_epoch is different before and after set_dict", ) paddle.save(adam2.state_dict(), "save_path.pdopt")
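
Usage note: the paddle.optimizer.lr schedulers this patch switches to take only learning_rate and gamma; the old decay_steps/staircase arguments are gone, and the decay interval is now controlled by how often scheduler.step() is called (which is why test_lr_decay_natural_exp above steps the scheduler every third iteration to reproduce staircase=True with decay_steps=3). Below is a minimal sketch of the new usage pattern; the toy paddle.nn.Linear model, the random data, and the training loop are illustrative assumptions, not code taken from the patch.

    # Illustrative sketch only: the model, data, and loop are made up;
    # the scheduler/optimizer wiring mirrors what this patch migrates to.
    import numpy as np
    import paddle

    linear = paddle.nn.Linear(10, 10)

    # 2.0-style scheduler: only learning_rate and gamma are required.
    scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=1.0, gamma=0.5)
    adam = paddle.optimizer.Adam(
        learning_rate=scheduler, parameters=linear.parameters()
    )

    for epoch in range(3):
        x = paddle.to_tensor(
            np.random.uniform(-1, 1, [4, 10]).astype('float32')
        )
        loss = paddle.mean(linear(x))
        loss.backward()
        adam.step()
        adam.clear_grad()
        scheduler.step()  # lr = 1.0 * exp(-0.5 * epoch), stepped once per epoch
        print(adam.get_lr())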