From 6cc0fc2372e8cf164db6664a480891d8ac065f50 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Fri, 21 Aug 2020 17:41:52 +0800 Subject: [PATCH] update api && add ut --- .../fluid/tests/unittests/test_adam_op.py | 19 +- .../fluid/tests/unittests/test_adamw_op.py | 14 +- ...st_fleet_graph_execution_meta_optimizer.py | 121 ++- .../unittests/test_imperative_optimizer_v2.py | 728 ++++++++++++++++++ python/paddle/optimizer/adam.py | 4 +- python/paddle/optimizer/adamw.py | 4 +- python/paddle/optimizer/optimizer.py | 26 +- python/paddle/optimizer/rmsprop.py | 3 +- 8 files changed, 894 insertions(+), 25 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 09eea7cc89997..a6841feb79657 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -458,15 +458,24 @@ def test_adam_op_with_state_dict(self): adam.set_state_dict(state_dict) #learning_rate is Decay - from paddle.fluid.regularizer import L2Decay + learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120) adam = paddle.optimizer.Adam( - learning_rate=0.01, - weight_decay=L2Decay(0.001), + learning_rate=learning_rate, + weight_decay=fluid.regularizer.L2Decay(0.001), parameters=emb.parameters()) state_dict = adam.state_dict() adam.set_state_dict(state_dict) + #leanrning_rate is Tensor + learning_rate = np.array([0.01]).astype("float32") + learning_rate = paddle.to_tensor(learning_rate) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, parameters=emb.parameters()) + + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + params = adam.get_opti_var_name_list() assert (params is not None) @@ -478,12 +487,12 @@ def test_adam_op_with_set_lr(self): lr = 0.01 adam.set_lr(lr) - cur_lr = adam.current_step_lr() + cur_lr = adam.get_lr() assert (lr == cur_lr) lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32') adam.set_lr(lr_var) - cur_lr = adam.current_step_lr() + cur_lr = adam.get_lr() assert (np.float32(lr) == cur_lr) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index ec25fc3eea9eb..0c2cd7ec9c7ed 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -19,7 +19,7 @@ class TestAdamWOp(unittest.TestCase): - def test_adamw_opi_dygraph(self): + def test_adamw_op_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_variable(value) @@ -34,6 +34,18 @@ def test_adamw_opi_dygraph(self): adam.step() adam.clear_gradients() + def test_adamw_op_coverage(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + adam = paddle.optimizer.AdamW( + learning_rate=0.0, + parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + assert (adam.__str__ is not None) + def test_adamw_op(self): place = fluid.CPUPlace() shape = [2, 3, 8, 8] diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 26e004164eb95..25039b8c9e015 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -111,7 +111,126 @@ def node_func(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True - optimizer = paddle.fluidoptimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace()) + exe.run(paddle.fluid.default_startup_program()) + + import numpy as np + + def gen_data(): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + for i in range(10): + cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + proc_a.join() + proc_b.join() + + def test_graph_execution_optimizer_not_apply_v2(self): + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + proc_a.join() + proc_b.join() + + def test_graph_execution_optimizer(self): + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36001", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36002", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.nccl_comm_num = 2 + strategy.sync_nccl_allreduce = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py new file mode 100644 index 0000000000000..9f75c92b185ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -0,0 +1,728 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np +import six +import itertools + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer +from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer +from paddle.fluid.dygraph import Linear +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope + +# Note(wangzhongpu) +# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. + + +class MLP(fluid.Layer): + def __init__(self, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._fc1 = Linear(784, 10) + self._fc2 = Linear(10, 10) + + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y + + +class TestImperativeOptimizerBase(unittest.TestCase): + def setUp(self): + self.batch_num = 20 + + def get_optimizer_dygraph(self, parameter_list): + raise NotImplementedError() + + def get_optimizer(self): + raise NotImplementedError() + + def reader_decorator(self, reader): + def _reader_imple(): + for item in reader(): + image = np.array(item[0]).reshape(1, 784) + label = np.array(item[1]).astype('int64').reshape(1) + yield image, label + + return _reader_imple + + def _check_exception(self, exception_message, place=None): + seed = 90 + batch_size = 128 + if place == None: + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + with fluid.dygraph.guard(place): + try: + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + except Exception as e: + assert str(e) == exception_message + + def _check_mlp(self, place=None): + seed = 90 + batch_size = 128 + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + + batch_py_reader = fluid.io.PyReader(capacity=1) + batch_py_reader.decorate_sample_list_generator( + paddle.batch( + self.reader_decorator(paddle.dataset.mnist.train()), + batch_size=batch_size, + drop_last=True), + places=fluid.CPUPlace()) + + dy_param_init_value = {} + for batch_id, data in enumerate(batch_py_reader()): + if batch_id >= self.batch_num: + break + + img = data[0] + label = data[1] + label.stop_gradient = True + + img = fluid.layers.reshape(img, shape=[batch_size, -1]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + dy_out = avg_loss.numpy() + + if batch_id == 0: + for param in mlp.parameters(): + dy_param_init_value[param.name] = param.numpy() + + avg_loss.backward() + optimizer.minimize(avg_loss) + mlp.clear_gradients() + dy_param_value = {} + for param in mlp.parameters(): + dy_param_value[param.name] = param.numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + exe = fluid.Executor(place) + + mlp = MLP() + optimizer = self.get_optimizer() + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + img = fluid.layers.reshape(img, shape=[batch_size, 784]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in mlp.parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= self.batch_num: + break + + static_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + + self.assertTrue(np.allclose(static_out, dy_out)) + + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + + +class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + bd = [3, 6, 9] + optimizer = SGDOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, + values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + bd = [3, 6, 9] + optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = Adam( + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_adam(self): + self._check_mlp() + + +class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + return optimizer + + def test_sgd_cycle(self): + self.cycle = True + self._check_mlp() + + def test_sgd(self): + self.cycle = False + self._check_mlp() + + +class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestOptimizerLearningRate(unittest.TestCase): + def test_constant_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.001, rtol=1e-06, atol=0.0)) + + for i in range(10): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, 0.001, rtol=1e-06, atol=0.0)) + + def test_lr_decay(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + + adam = paddle.optimizer.Adam( + fluid.dygraph.PiecewiseDecay(bd, value, 0), + parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.2, rtol=1e-06, atol=0.0)) + + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + + def test_lr_decay_natural_exp(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + base_lr = 1.0 + + adam = paddle.optimizer.Adam( + fluid.dygraph.NaturalExpDecay( + learning_rate=base_lr, + decay_steps=3, + decay_rate=0.5, + staircase=True), + parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 1.0, rtol=1e-06, atol=0.0)) + + ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)] + for i in range(5): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + + def test_set_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue( + np.allclose( + lr, lr_list[i], rtol=1e-06, atol=0.0)) + + lr_var = fluid.layers.create_global_var( + shape=[1], value=0.7, dtype='float32') + adam.set_lr(lr_var) + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0)) + + with self.assertRaises(RuntimeError): + adam = paddle.optimizer.Adam( + fluid.dygraph.NaturalExpDecay( + learning_rate=0.1, + decay_steps=3, + decay_rate=0.5, + staircase=True), + parameters=linear.parameters()) + adam.set_lr(0.01) + + +class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = MomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_momentum(self): + self._check_mlp() + + +class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LarsMomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_larsmomentum(self): + self._check_mlp() + + +class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_adagrad(self): + self._check_mlp() + + +class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdamaxOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdamaxOptimizer(learning_rate=0.2) + return optimizer + + def test_adamax(self): + self._check_mlp() + + +class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DpsgdOptimizer( + learning_rate=0.01, + clip=10.0, + batch_size=16.0, + sigma=1.0, + parameter_list=parameter_list) + optimizer._seed = 100 + return optimizer + + def get_optimizer(self): + optimizer = DpsgdOptimizer( + learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) + optimizer._seed = 100 + return optimizer + + def test_dpsgd(self): + self._check_mlp(place=fluid.CPUPlace()) + + +class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DecayedAdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = DecayedAdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_decayadagrad(self): + self._check_mlp() + + +class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdadeltaOptimizer( + learning_rate=0.0003, + epsilon=1.0e-6, + rho=0.95, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdadeltaOptimizer( + learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) + return optimizer + + def test_adadelta(self): + self._check_mlp() + + +class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = RMSPropOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = RMSPropOptimizer(learning_rate=0.1) + return optimizer + + def test_rmsprop(self): + self._check_mlp() + + +class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = FtrlOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = FtrlOptimizer(learning_rate=0.1) + return optimizer + + def test_ftrl(self): + self._check_mlp() + + +def exclude_fn(param): + return param.name.endswith('.b_0') + + +class TestImperativeLambOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LambOptimizer( + learning_rate=0.002, + exclude_from_weight_decay_fn=exclude_fn, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LambOptimizer( + learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn) + return optimizer + + def test_lamb(self): + self._check_mlp() + + +class TestImperativeModelAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ModelAverage( + 0.15, min_average_window=10000, max_average_window=12500) + return optimizer + + def test_modelaverage(self): + exception_message = "In dygraph, don't support ModelAverage." + self._check_exception(exception_message) + + +class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DGCMomentumOptimizer( + learning_rate=0.0001, + momentum=0.9, + rampup_step=1000, + rampup_begin_step=1252, + sparsity=[0.999, 0.999]) + return optimizer + + def test_dgcmomentum(self): + exception_message = "In dygraph, don't support DGCMomentumOptimizer." + self._check_exception(exception_message) + + +class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ExponentialMovingAverage(0.999) + return optimizer + + def test_exponentialmoving(self): + exception_message = "In dygraph, don't support ExponentialMovingAverage." + self._check_exception(exception_message) + + +class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = PipelineOptimizer(optimizer) + return optimizer + + def test_pipline(self): + exception_message = "In dygraph, don't support PipelineOptimizer." + self._check_exception(exception_message) + + +class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) + return optimizer + + def test_lookahead(self): + exception_message = "In dygraph, don't support LookaheadOptimizer." + self._check_exception(exception_message) + + +class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = RecomputeOptimizer(optimizer) + return optimizer + + def test_recompute(self): + exception_message = "In dygraph, don't support RecomputeOptimizer." + self._check_exception(exception_message) + + +class TestImperativeOptimizerList(unittest.TestCase): + def test_parameter_list(self): + with fluid.dygraph.guard(): + linear_1 = Linear(10, 10) + linear_2 = Linear(10, 10) + + sgd = SGDOptimizer( + 1.0, + parameter_list=itertools.chain(linear_1.parameters(), + linear_2.parameters())) + + in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + in_data = fluid.dygraph.to_variable(in_np) + + y = linear_1(in_data) + y = linear_2(y) + loss = fluid.layers.reduce_mean(y) + loss.backward() + sgd.minimize(loss) + + self.assertTrue( + len(sgd._parameter_list) == + len(linear_1.parameters() + linear_2.parameters())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index b829bcd6047be..4ba5ecddb974e 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -45,8 +45,8 @@ class Adam(Optimizer): Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index cc46e8bdf9944..7d47aa3f20291 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -145,8 +145,8 @@ class AdamW(DecoupledWeightDecay, Adam): Args: - learning_rate (float|Tensor, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. The default value is 0.001. + learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index ff894805c0c7f..4b8882f5fc28f 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -52,8 +52,8 @@ class Optimizer(object): but need to use one of it's implementation. Args: - learning_rate (float|Tensor): The learning rate used to update ``Parameter``. - It can be a float value or a ``Tensor`` with a float type. + learning_rate (float|Tensor|LearningRateDecay): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. @@ -161,7 +161,7 @@ def __init__(self, self._opti_name_list = [] self._accumulators_holder = {} self._param_device_map = dict() - self.clear_grad = self.clear_gradients + self.clear_gradients = self.clear_grad @framework.dygraph_only def state_dict(self): @@ -366,7 +366,7 @@ def set_lr(self, value): lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] for i in range(5): adam.set_lr(lr_list[i]) - lr = adam.current_step_lr() + lr = adam.get_lr() print("current lr is {}".format(lr)) # Print: # current lr is 0.2 @@ -380,7 +380,7 @@ def set_lr(self, value): lr_var = paddle.create_global_var( shape=[1], value=0.7, dtype='float32') adam.set_lr(lr_var) - lr = adam.current_step_lr() + lr = adam.get_lr() print("current lr is {}".format(lr)) # Print: # current lr is 0.7 @@ -416,7 +416,7 @@ def set_lr(self, value): self._learning_rate_map[framework.default_main_program()] = value @framework.dygraph_only - def current_step_lr(self): + def get_lr(self): """ :api_attr: imperative @@ -435,7 +435,7 @@ def current_step_lr(self): paddle.disable_static() emb = paddle.nn.Embedding([10, 10]) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) - lr = adam.current_step_lr() + lr = adam.get_lr() print(lr) # 0.001 # example2: PiecewiseDecay is used, return the step learning rate @@ -452,13 +452,13 @@ def current_step_lr(self): parameters=linear.parameters()) # first step: learning rate is 0.2 - np.allclose(adam.current_step_lr(), 0.2, rtol=1e-06, atol=0.0) # True + np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True # learning rate for different steps ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] for i in range(12): adam.step() - lr = adam.current_step_lr() + lr = adam.get_lr() np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True """ @@ -778,7 +778,7 @@ def backward(self, out = linear(a) out.backward() adam.step() - adam.clear_gradients() + adam.clear_grad() """ act_no_grad_set = None if framework.in_dygraph_mode(): @@ -899,7 +899,7 @@ def _get_no_grad_set(self, loss, no_grad_set=None): return no_grad_set @framework.dygraph_only - def clear_gradients(self): + def clear_grad(self): """ Clear the gradients of all optimized parameters for model. @@ -921,7 +921,7 @@ def clear_gradients(self): out = linear(a) out.backward() adam.step() - adam.clear_gradients() + adam.clear_grad() """ for p in self._parameter_list: @@ -1021,7 +1021,7 @@ def step(self): out = linear(a) out.backward() adam.step() - adam.clear_gradients() + adam.clear_grad() """ parameter_list = self._parameter_list self._dtype = None diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 155a04b42db88..9ed82646e6ef0 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -69,7 +69,8 @@ class RMSProp(Optimizer): Parameters: - learning_rate(float): Global learning rate. + learning_rate (float|Tensor|LearningRateDecay): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. rho(float): rho is :math: `\\rho` in equation, default is 0.95. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6.