From eeda90d6747bce7b10867dde0a8085693cf2e608 Mon Sep 17 00:00:00 2001 From: MRXLT Date: Sun, 23 Aug 2020 20:56:24 +0800 Subject: [PATCH] [WIP] update optimizer for 2.0 (#26288) refine Optimizer/Adam/Admax/RMSProp && add Admw * buf fix * update comment * unify arguments place; notest * fix ut, test=develop * bug fix * fix conflicts, test=develop * add examples code * bug fix * fix comments * fix sample code * add sample code for Optimizer * add adamax ut, test=develop * fix rmsprop ut, test=develop * add ut for optimizer.py and adamw.py * remove TestAdamOptimizerBetaVariable * update api && add ut * update doc && fix ut * add ut Co-authored-by: mapingshuo --- python/paddle/fluid/optimizer.py | 4 +- .../fluid/tests/unittests/test_adam_op.py | 144 ++- .../fluid/tests/unittests/test_adamax_api.py | 67 ++ .../fluid/tests/unittests/test_adamw_op.py | 81 ++ .../test_dist_fleet_a_sync_optimizer_async.py | 4 +- .../test_dist_fleet_a_sync_optimizer_geo.py | 2 +- .../test_dist_fleet_a_sync_optimizer_sync.py | 2 +- .../tests/unittests/test_dist_fleet_ps2.py | 2 +- .../test_fleet_amp_meta_optimizer.py | 2 +- .../test_fleet_dgc_meta_optimizer.py | 8 +- ...est_fleet_gradient_merge_meta_optimizer.py | 2 +- ...st_fleet_graph_execution_meta_optimizer.py | 121 ++- .../unittests/test_fleet_graph_executor.py | 2 +- .../test_fleet_lamb_meta_optimizer.py | 7 +- .../test_fleet_lars_meta_optimizer.py | 5 +- .../test_fleet_localsgd_meta_optimizer.py | 2 +- .../test_fleet_pipeline_meta_optimizer.py | 2 +- .../test_fleet_recompute_meta_optimizer.py | 2 +- .../unittests/test_imperative_optimizer_v2.py | 728 +++++++++++++ .../unittests/test_imperative_save_load_v2.py | 917 ++++++++++++++++ .../tests/unittests/test_retain_graph.py | 4 +- .../fluid/tests/unittests/test_rmsprop_op.py | 55 + python/paddle/optimizer/__init__.py | 32 +- python/paddle/optimizer/adam.py | 246 +++++ python/paddle/optimizer/adamax.py | 192 ++++ python/paddle/optimizer/adamw.py | 233 ++++ python/paddle/optimizer/optimizer.py | 995 ++++++++++++++++++ python/paddle/optimizer/rmsprop.py | 207 ++++ 28 files changed, 3992 insertions(+), 76 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_adamax_api.py create mode 100644 python/paddle/fluid/tests/unittests/test_adamw_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py create mode 100644 python/paddle/optimizer/adam.py create mode 100644 python/paddle/optimizer/adamax.py create mode 100644 python/paddle/optimizer/adamw.py create mode 100644 python/paddle/optimizer/optimizer.py create mode 100644 python/paddle/optimizer/rmsprop.py diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 0d6aa46065d87..a2a5a85cc0a29 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -40,6 +40,7 @@ from functools import reduce from .wrapped_decorator import signature_safe_contextmanager from .. 
import compat as cpt +import paddle __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', @@ -3690,7 +3691,8 @@ def train_reader(): def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): if framework.in_dygraph_mode(): raise Exception("In dygraph, don't support PipelineOptimizer.") - if not isinstance(optimizer, Optimizer): + if not isinstance(optimizer, Optimizer) and not isinstance( + optimizer, paddle.optimizer.Optimizer): raise ValueError("The 'optimizer' parameter for " "PipelineOptimizer must be an instance of " "Optimizer, but the given type is {}.".format( diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 7a7099b7113c8..990499858ca52 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -20,6 +20,7 @@ from paddle.fluid import core from paddle.fluid.op import Operator import paddle.fluid as fluid +import paddle class TestAdamOp1(OpTest): @@ -401,46 +402,111 @@ def test_check_output(self): self.check_output() -class TestAdamOptimizerBetaVariable(unittest.TestCase): - def test_adam_optimizer(self): - def test_with_place(place, shape): - exe = fluid.Executor(place) - - train_prog = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(train_prog, startup): - with fluid.unique_name.guard(): - data = fluid.data(name="data", shape=shape) - conv = fluid.layers.conv2d(data, 8, 3) - loss = fluid.layers.reduce_mean(conv) - - beta1 = fluid.layers.create_global_var( - shape=[1], - value=0.85, - dtype='float32', - persistable=True) - beta2 = fluid.layers.create_global_var( - shape=[1], - value=0.95, - dtype='float32', - persistable=True) - opt = fluid.optimizer.Adam( - learning_rate=1e-5, beta1=beta1, beta2=beta2) - opt.minimize(loss) - - exe.run(startup) - data_np = np.random.random(shape).astype('float32') - rets = exe.run(train_prog, - feed={"data": data_np}, - fetch_list=[loss]) - assert rets[0] is not None - +class TestAdamOpV2(unittest.TestCase): + def test_adam_op(self): + place = fluid.CPUPlace() shape = [2, 3, 8, 8] - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for place in places: - test_with_place(place, shape) + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = fluid.layers.reduce_mean(conv) + + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True) + betas = [beta1, beta2] + opt = paddle.optimizer.Adam( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + def test_adam_op_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def 
test_adam_op_with_state_dict(self): + + import paddle + paddle.disable_static() + emb = paddle.nn.Embedding([10, 10]) + + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + + #learning_rate is Decay + learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, + weight_decay=fluid.regularizer.L2Decay(0.001), + parameters=emb.parameters()) + lr = adam.get_lr() + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + + #leanrning_rate is Tensor + with self.assertRaises(TypeError): + learning_rate = np.array([0.01]).astype("float32") + learning_rate = paddle.to_tensor(learning_rate) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, parameters=emb.parameters()) + + params = adam.get_opti_var_name_list() + assert (params is not None) + + def test_adam_with_grad_clip(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = fluid.dygraph.to_variable(value) + linear = fluid.Linear(13, 5, dtype="float32") + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + adam = paddle.optimizer.Adam( + 0.1, parameters=linear.parameters(), grad_clip=clip) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adam_op_with_set_lr(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + lr = 0.01 + adam.set_lr(lr) + cur_lr = adam.get_lr() + assert (lr == cur_lr) + + lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32') + adam.set_lr(lr_var) + cur_lr = adam.get_lr() + assert (np.float32(lr) == cur_lr) + + with self.assertRaises(TypeError): + lr = int(1) + adam.set_lr(lr) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py new file mode 100644 index 0000000000000..f6946dc80b5e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid + + +class TestAdamaxAPI(unittest.TestCase): + def test_adamax_api_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + adam = paddle.optimizer.Adamax( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adamax_api(self): + place = fluid.CPUPlace() + shape = [2, 3, 8, 8] + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = paddle.mean(conv) + beta1 = 0.85 + beta2 = 0.95 + opt = paddle.optimizer.Adamax( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py new file mode 100644 index 0000000000000..ddb70d6e6400c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import numpy as np +import paddle.fluid as fluid + + +class TestAdamWOp(unittest.TestCase): + def test_adamw_op_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_adamw_op_coverage(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_variable(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + adam = paddle.optimizer.AdamW( + learning_rate=0.0, + parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + assert (adam.__str__() is not None) + + def test_adamw_op(self): + place = fluid.CPUPlace() + shape = [2, 3, 8, 8] + exe = fluid.Executor(place) + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + with fluid.unique_name.guard(): + data = fluid.data(name="data", shape=shape) + conv = fluid.layers.conv2d(data, 8, 3) + loss = paddle.mean(conv) + + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True) + betas = [beta1, beta2] + opt = paddle.optimizer.AdamW( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8) + opt.minimize(loss) + + exe.run(startup) + data_np = np.random.random(shape).astype('float32') + rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) + assert rets[0] is not None + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index 687143f8e6f59..9df55a6b873e2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -100,7 +100,7 @@ def test_a_sync_optimizer_pserver(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py index 635e93245ccad..59ca41a11e325 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py @@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True strategy.a_sync_configs = 
{"k_steps": 100} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index 23f0d8f45b452..e0993e022e1b9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -47,7 +47,7 @@ def test_gradient_merge_optimizer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index 61eaed9072b4e..e7b10be2349cc 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -165,7 +165,7 @@ def test(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 22a1434ae251a..38c3903306e6e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -51,7 +51,7 @@ def test_amp_optimizer(self): "custom_black_list": ['tanh'], } - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py index b43687ce1cdab..55d4ff7726aac 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -60,7 +60,8 @@ def test_dgc_optimizer(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -72,7 +73,7 @@ def test_dgc_not_apply_with_adam(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -87,7 +88,8 @@ def test_dgc_not_apply_with_one_worker(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = 
self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py index 581f8becbbff1..af72df5186876 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py @@ -44,7 +44,7 @@ def test_gradient_merge_optimizer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.gradient_merge = True strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 8b2c32c2f0236..cac2698d33615 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -58,7 +58,7 @@ def node_func(): avg_cost = paddle.fluid.layers.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -118,10 +118,129 @@ def node_func(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace()) + exe.run(paddle.fluid.default_startup_program()) + + import numpy as np + + def gen_data(): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + for i in range(10): + cost_val = exe.run(feed=gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + proc_a.join() + proc_b.join() + + def test_graph_execution_optimizer_not_apply_v2(self): + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36003", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36004", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36003,127.0.0.1:36004", + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], 
dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) + + proc_a = launch_func(node_func, node_a) + proc_a.start() + proc_b = launch_func(node_func, node_b) + proc_b.start() + proc_a.join() + proc_b.join() + + def test_graph_execution_optimizer(self): + node_a = { + "PADDLE_TRAINER_ID": "0", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36001", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "http_proxy": "", + "https_proxy": "" + } + + node_b = { + "PADDLE_TRAINER_ID": "1", + "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:36002", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36001,127.0.0.1:36002", + "http_proxy": "", + "https_proxy": "" + } + + def node_func(): + import paddle.distributed.fleet as fleet + import paddle.fluid.incubate.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.nccl_comm_num = 2 + strategy.sync_nccl_allreduce = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) exe = paddle.fluid.Executor(place=paddle.fluid.CPUPlace()) exe.run(paddle.fluid.default_startup_program()) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index 4d92c6f70541d..69f5b134888b0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -60,7 +60,7 @@ def node_func(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 strategy.sync_nccl_allreduce = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index 134aea363b55e..3f140f53b043b 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -62,7 +62,7 @@ def test_lamb_optimizer(self): startup_prog = fluid.Program() train_prog = 
fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -75,7 +75,8 @@ def test_lamb_not_apply_with_momentum(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.1, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -88,7 +89,7 @@ def test_lamb_exclude_fn(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) strategy.lamb_configs = { 'lamb_weight_decay': 0.01, 'exclude_from_weight_decay': ['.b_0'], diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index b15db0b12d001..3caa1a4eac0bf 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -62,7 +62,8 @@ def test_lars_optimizer(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -75,7 +76,7 @@ def test_lars_not_apply_with_adam(self): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 86098d42b823b..07b988bf87520 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -46,7 +46,7 @@ def test_localsgd_optimizer(self): config['k_steps'] = 1 strategy.localsgd_configs = config - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py index ca969bc4032b1..adbb1268c6f4d 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py @@ -53,7 +53,7 @@ def test_pipeline_optimizer(self): strategy.pipeline = True strategy.pipeline_configs = {'micro_batch': 2} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = 
paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py index 95e1c3a360257..a42010a4eaa50 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py @@ -45,7 +45,7 @@ def test_recompute_optimizer(self): strategy.recompute = True strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]} - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py new file mode 100644 index 0000000000000..9f75c92b185ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -0,0 +1,728 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np +import six +import itertools + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer, Adam, MomentumOptimizer, LarsMomentumOptimizer, AdagradOptimizer, AdamaxOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, AdadeltaOptimizer, RMSPropOptimizer, FtrlOptimizer, LambOptimizer +from paddle.fluid.optimizer import ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer +from paddle.fluid.dygraph import Linear +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope + +# Note(wangzhongpu) +# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. 
+ + +class MLP(fluid.Layer): + def __init__(self, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._fc1 = Linear(784, 10) + self._fc2 = Linear(10, 10) + + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y + + +class TestImperativeOptimizerBase(unittest.TestCase): + def setUp(self): + self.batch_num = 20 + + def get_optimizer_dygraph(self, parameter_list): + raise NotImplementedError() + + def get_optimizer(self): + raise NotImplementedError() + + def reader_decorator(self, reader): + def _reader_imple(): + for item in reader(): + image = np.array(item[0]).reshape(1, 784) + label = np.array(item[1]).astype('int64').reshape(1) + yield image, label + + return _reader_imple + + def _check_exception(self, exception_message, place=None): + seed = 90 + batch_size = 128 + if place == None: + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + with fluid.dygraph.guard(place): + try: + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + except Exception as e: + assert str(e) == exception_message + + def _check_mlp(self, place=None): + seed = 90 + batch_size = 128 + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mlp = MLP() + optimizer = self.get_optimizer_dygraph( + parameter_list=mlp.parameters()) + + batch_py_reader = fluid.io.PyReader(capacity=1) + batch_py_reader.decorate_sample_list_generator( + paddle.batch( + self.reader_decorator(paddle.dataset.mnist.train()), + batch_size=batch_size, + drop_last=True), + places=fluid.CPUPlace()) + + dy_param_init_value = {} + for batch_id, data in enumerate(batch_py_reader()): + if batch_id >= self.batch_num: + break + + img = data[0] + label = data[1] + label.stop_gradient = True + + img = fluid.layers.reshape(img, shape=[batch_size, -1]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + dy_out = avg_loss.numpy() + + if batch_id == 0: + for param in mlp.parameters(): + dy_param_init_value[param.name] = param.numpy() + + avg_loss.backward() + optimizer.minimize(avg_loss) + mlp.clear_gradients() + dy_param_value = {} + for param in mlp.parameters(): + dy_param_value[param.name] = param.numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + if place == None: + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + + exe = fluid.Executor(place) + + mlp = MLP() + optimizer = self.get_optimizer() + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + img = fluid.layers.reshape(img, shape=[batch_size, 784]) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in mlp.parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + 
static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= self.batch_num: + break + + static_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + + self.assertTrue(np.allclose(static_out, dy_out)) + + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + + +class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + bd = [3, 6, 9] + optimizer = SGDOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, + values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + bd = [3, 6, 9] + optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = Adam( + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_adam(self): + self._check_mlp() + + +class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle), 
+ parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + return optimizer + + def test_sgd_cycle(self): + self.cycle = True + self._check_mlp() + + def test_sgd(self): + self.cycle = False + self._check_mlp() + + +class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = SGDOptimizer( + learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000), + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestOptimizerLearningRate(unittest.TestCase): + def test_constant_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.001, rtol=1e-06, atol=0.0)) + + for i in range(10): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, 0.001, rtol=1e-06, atol=0.0)) + + def test_lr_decay(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + + adam = paddle.optimizer.Adam( + fluid.dygraph.PiecewiseDecay(bd, value, 0), + parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 0.2, rtol=1e-06, atol=0.0)) + + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + + def test_lr_decay_natural_exp(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + base_lr = 1.0 + + adam = paddle.optimizer.Adam( + fluid.dygraph.NaturalExpDecay( + learning_rate=base_lr, + decay_steps=3, + decay_rate=0.5, + staircase=True), + parameters=linear.parameters()) + + self.assertTrue( + np.allclose( + adam.get_lr(), 1.0, rtol=1e-06, atol=0.0)) + + ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)] + for i in range(5): + adam.minimize(loss) + lr = adam.get_lr() + + self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0)) + + def test_set_lr(self): + with fluid.dygraph.guard(): + a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + linear = 
fluid.dygraph.nn.Linear(10, 10) + + a = fluid.dygraph.to_variable(a) + + b = linear(a) + + loss = fluid.layers.reduce_mean(b) + + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue( + np.allclose( + lr, lr_list[i], rtol=1e-06, atol=0.0)) + + lr_var = fluid.layers.create_global_var( + shape=[1], value=0.7, dtype='float32') + adam.set_lr(lr_var) + adam.minimize(loss) + lr = adam.get_lr() + self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0)) + + with self.assertRaises(RuntimeError): + adam = paddle.optimizer.Adam( + fluid.dygraph.NaturalExpDecay( + learning_rate=0.1, + decay_steps=3, + decay_rate=0.5, + staircase=True), + parameters=linear.parameters()) + adam.set_lr(0.01) + + +class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = MomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_momentum(self): + self._check_mlp() + + +class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LarsMomentumOptimizer( + learning_rate=0.001, momentum=0.9, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) + return optimizer + + def test_larsmomentum(self): + self._check_mlp() + + +class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_adagrad(self): + self._check_mlp() + + +class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdamaxOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdamaxOptimizer(learning_rate=0.2) + return optimizer + + def test_adamax(self): + self._check_mlp() + + +class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DpsgdOptimizer( + learning_rate=0.01, + clip=10.0, + batch_size=16.0, + sigma=1.0, + parameter_list=parameter_list) + optimizer._seed = 100 + return optimizer + + def get_optimizer(self): + optimizer = DpsgdOptimizer( + learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) + optimizer._seed = 100 + return optimizer + + def test_dpsgd(self): + self._check_mlp(place=fluid.CPUPlace()) + + +class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DecayedAdagradOptimizer( + learning_rate=0.2, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = DecayedAdagradOptimizer(learning_rate=0.2) + return optimizer + + def test_decayadagrad(self): + self._check_mlp() + + +class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = AdadeltaOptimizer( + learning_rate=0.0003, + epsilon=1.0e-6, + 
rho=0.95, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = AdadeltaOptimizer( + learning_rate=0.0003, epsilon=1.0e-6, rho=0.95) + return optimizer + + def test_adadelta(self): + self._check_mlp() + + +class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = RMSPropOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = RMSPropOptimizer(learning_rate=0.1) + return optimizer + + def test_rmsprop(self): + self._check_mlp() + + +class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = FtrlOptimizer( + learning_rate=0.1, parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = FtrlOptimizer(learning_rate=0.1) + return optimizer + + def test_ftrl(self): + self._check_mlp() + + +def exclude_fn(param): + return param.name.endswith('.b_0') + + +class TestImperativeLambOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = LambOptimizer( + learning_rate=0.002, + exclude_from_weight_decay_fn=exclude_fn, + parameter_list=parameter_list) + return optimizer + + def get_optimizer(self): + optimizer = LambOptimizer( + learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn) + return optimizer + + def test_lamb(self): + self._check_mlp() + + +class TestImperativeModelAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ModelAverage( + 0.15, min_average_window=10000, max_average_window=12500) + return optimizer + + def test_modelaverage(self): + exception_message = "In dygraph, don't support ModelAverage." + self._check_exception(exception_message) + + +class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = DGCMomentumOptimizer( + learning_rate=0.0001, + momentum=0.9, + rampup_step=1000, + rampup_begin_step=1252, + sparsity=[0.999, 0.999]) + return optimizer + + def test_dgcmomentum(self): + exception_message = "In dygraph, don't support DGCMomentumOptimizer." + self._check_exception(exception_message) + + +class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = ExponentialMovingAverage(0.999) + return optimizer + + def test_exponentialmoving(self): + exception_message = "In dygraph, don't support ExponentialMovingAverage." + self._check_exception(exception_message) + + +class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = PipelineOptimizer(optimizer) + return optimizer + + def test_pipline(self): + exception_message = "In dygraph, don't support PipelineOptimizer." + self._check_exception(exception_message) + + +class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) + return optimizer + + def test_lookahead(self): + exception_message = "In dygraph, don't support LookaheadOptimizer." 
+ self._check_exception(exception_message) + + +class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): + def get_optimizer_dygraph(self, parameter_list): + optimizer = paddle.optimizer.SGD(learning_rate=0.5, + parameter_list=parameter_list) + optimizer = RecomputeOptimizer(optimizer) + return optimizer + + def test_recompute(self): + exception_message = "In dygraph, don't support RecomputeOptimizer." + self._check_exception(exception_message) + + +class TestImperativeOptimizerList(unittest.TestCase): + def test_parameter_list(self): + with fluid.dygraph.guard(): + linear_1 = Linear(10, 10) + linear_2 = Linear(10, 10) + + sgd = SGDOptimizer( + 1.0, + parameter_list=itertools.chain(linear_1.parameters(), + linear_2.parameters())) + + in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + in_data = fluid.dygraph.to_variable(in_np) + + y = linear_1(in_data) + y = linear_2(y) + loss = fluid.layers.reduce_mean(y) + loss.backward() + sgd.minimize(loss) + + self.assertTrue( + len(sgd._parameter_list) == + len(linear_1.parameters() + linear_2.parameters())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py new file mode 100644 index 0000000000000..4ab35a21aff43 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -0,0 +1,917 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.dygraph.nn import Embedding, Linear +import paddle.fluid.framework as framework +from paddle.optimizer import Adam +from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay +from test_imperative_base import new_program_scope +import numpy as np +import six +import paddle + + +class SimpleLSTMRNN(fluid.Layer): + def __init__(self, + hidden_size, + num_steps, + num_layers=2, + init_scale=0.1, + dropout=None): + super(SimpleLSTMRNN, self).__init__() + self._hidden_size = hidden_size + self._num_layers = num_layers + self._init_scale = init_scale + self._dropout = dropout + self._input = None + self._num_steps = num_steps + self.cell_array = [] + self.hidden_array = [] + self.weight_1_arr = [] + self.weight_2_arr = [] + self.bias_arr = [] + self.mask_array = [] + + for i in range(self._num_layers): + weight_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 2, self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)) + self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) + bias_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.Constant(0.0)) + self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) + + def forward(self, input_embedding, init_hidden=None, init_cell=None): + self.cell_array = [] + self.hidden_array = [] + + for i in range(self._num_layers): + pre_hidden = fluid.layers.slice( + init_hidden, axes=[0], starts=[i], ends=[i + 1]) + pre_cell = fluid.layers.slice( + init_cell, axes=[0], starts=[i], ends=[i + 1]) + pre_hidden = fluid.layers.reshape( + pre_hidden, shape=[-1, self._hidden_size]) + pre_cell = fluid.layers.reshape( + pre_cell, shape=[-1, self._hidden_size]) + self.hidden_array.append(pre_hidden) + self.cell_array.append(pre_cell) + + res = [] + for index in range(self._num_steps): + self._input = fluid.layers.slice( + input_embedding, axes=[1], starts=[index], ends=[index + 1]) + self._input = fluid.layers.reshape( + self._input, shape=[-1, self._hidden_size]) + for k in range(self._num_layers): + pre_hidden = self.hidden_array[k] + pre_cell = self.cell_array[k] + weight_1 = self.weight_1_arr[k] + bias = self.bias_arr[k] + + nn = fluid.layers.concat([self._input, pre_hidden], 1) + gate_input = fluid.layers.matmul(x=nn, y=weight_1) + + gate_input = fluid.layers.elementwise_add(gate_input, bias) + i, j, f, o = fluid.layers.split( + gate_input, num_or_sections=4, dim=-1) + c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( + i) * fluid.layers.tanh(j) + m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + self.hidden_array[k] = m + self.cell_array[k] = c + self._input = m + + if self._dropout is not None and self._dropout > 0.0: + self._input = fluid.layers.dropout( + self._input, + dropout_prob=self._dropout, + dropout_implementation='upscale_in_train') + res.append( + fluid.layers.reshape( + self._input, shape=[1, -1, self._hidden_size])) + real_res = fluid.layers.concat(res, 0) + real_res = 
fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + last_hidden = fluid.layers.concat(self.hidden_array, 1) + last_hidden = fluid.layers.reshape( + last_hidden, shape=[-1, self._num_layers, self._hidden_size]) + last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_cell = fluid.layers.concat(self.cell_array, 1) + last_cell = fluid.layers.reshape( + last_cell, shape=[-1, self._num_layers, self._hidden_size]) + last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + return real_res, last_hidden, last_cell + + +class PtbModel(fluid.Layer): + def __init__(self, + hidden_size, + vocab_size, + num_layers=2, + num_steps=20, + init_scale=0.1, + dropout=None): + super(PtbModel, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.init_scale = init_scale + self.num_layers = num_layers + self.num_steps = num_steps + self.dropout = dropout + self.simple_lstm_rnn = SimpleLSTMRNN( + hidden_size, + num_steps, + num_layers=num_layers, + init_scale=init_scale, + dropout=dropout) + self.embedding = Embedding( + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False, + param_attr=fluid.ParamAttr( + name='embedding_para', + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))) + + self.softmax_weight = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.hidden_size, self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + self.softmax_bias = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + + def forward(self, input, label, init_hidden, init_cell): + init_h = fluid.layers.reshape( + init_hidden, shape=[self.num_layers, -1, self.hidden_size]) + + init_c = fluid.layers.reshape( + init_cell, shape=[self.num_layers, -1, self.hidden_size]) + + x_emb = self.embedding(input) + x_emb = fluid.layers.reshape( + x_emb, shape=[-1, self.num_steps, self.hidden_size]) + if self.dropout is not None and self.dropout > 0.0: + x_emb = fluid.layers.dropout( + x_emb, + dropout_prob=self.drop_out, + dropout_implementation='upscale_in_train') + rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h, + init_c) + rnn_out = fluid.layers.reshape( + rnn_out, shape=[-1, self.num_steps, self.hidden_size]) + + projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = fluid.layers.elementwise_add(projection, self.softmax_bias) + projection = fluid.layers.reshape( + projection, shape=[-1, self.vocab_size]) + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=label, soft_label=False) + loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = fluid.layers.reduce_mean(loss, dim=[0]) + loss = fluid.layers.reduce_sum(loss) + + return loss, last_hidden, last_cell + + +class TestDygraphPtbRnn(unittest.TestCase): + def setUp(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # 
this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + self.opti_dict = adam.state_dict() + self.base_opti = {} + for k, v in self.opti_dict.items(): + if isinstance(v, core.VarBase): + self.base_opti[v.name] = v.numpy() + self.assertTrue(np.sum(np.abs(v.numpy())) != 0) + else: + self.base_opti[k] = v + + fluid.save_dygraph(self.opti_dict, "./test_dy") + + self.state_dict = ptb_model.state_dict() + + self.model_base = {} + for k, v in self.state_dict.items(): + np_t = v.numpy() + self.model_base[k] = np_t + + paddle.save(self.state_dict, "./test_dy") + + def testLoadAndSetVarBase(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + 
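As an illustrative aside (not part of the patch itself): the loop above exercises the legacy-style `adam.minimize(dy_loss)` / `ptb_model.clear_gradients()` calls, while the 2.0 `Optimizer` added in this PR also exposes `step()` and `clear_grad()` (with `clear_gradients` kept as an alias). Assuming the same `adam`, `dy_loss` and `ptb_model` objects as in this test, the update step could presumably also be written as:

    dy_loss.backward()
    adam.step()        # apply the gradients accumulated by backward()
    adam.clear_grad()  # same method as adam.clear_gradients()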
if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + para_state_dict, opti_state_dict = paddle.load("./test_dy") + adam.set_state_dict(opti_state_dict) + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + for k, v in state_dict.items(): + np_t = v.numpy() + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(para_state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetVariable(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + adam.set_state_dict(self.opti_dict) + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + 
self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + for k, v in state_dict.items(): + np_t = v.numpy() + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(self.state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetNumpy(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [1.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + new_lr = 1.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + for i in range(batch_num): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + if i == 0: + for param in ptb_model.parameters(): + dy_param_init[param.name] = param.numpy() + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + if i == batch_num - 1: + for param in ptb_model.parameters(): + dy_param_updated[param.name] = param.numpy() + + # check optimizer + opti_dict = adam.state_dict() + np_opti_dict = {} + # set to zero + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + np_t = v.numpy() + np_opti_dict[v.name] = np_t + var = v.value().get_tensor() + var.set(np.zeros_like(np_t), place) + self.assertTrue(np.sum(np.abs(v.numpy())) == 0) + else: + np_opti_dict[k] = v + + if isinstance(adam._learning_rate, LearningRateDecay): + adam._learning_rate.step_num = 0 + + adam.set_state_dict(np_opti_dict) + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if isinstance(v, core.VarBase): + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name])) + else: + self.assertEqual(v, self.base_opti[k]) + + # check parameter + state_dict = ptb_model.state_dict() + np_state_dict = {} + for k, v in state_dict.items(): + np_t = v.numpy() + np_state_dict[k] = np_t + var = v.value().get_tensor() + + var.set(np.zeros_like(np_t), place) + + ptb_model.set_dict(np_state_dict) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetVariableBeforeTrain(self): + seed = 
90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=0.0, + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + adam.set_state_dict(self.opti_dict) + ptb_model.set_dict(self.state_dict) + + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def testLoadAndSetVarBaseBeforeTrain(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [0.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + # set lr to zero not update parameter + new_lr = 0.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=0.0, + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + state_dict, opti_dict = fluid.load_dygraph("./test_dy") + adam.set_state_dict(opti_dict) + ptb_model.set_dict(state_dict) + + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = 
to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + # check parameter + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def testSetNumpyBeforeTrain(self): + seed = 90 + hidden_size = 10 + vocab_size = 1000 + num_layers = 1 + num_steps = 3 + init_scale = 0.1 + batch_size = 4 + batch_num = 200 + + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + # TODO: marsyang1993 Change seed to + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale) + + bd = [] + lr_arr = [0.0] + # this a fake lr decay strategy + for i in range(1, 10): + bd.append(100 * i) + # set lr to 0.0, not update parameter + new_lr = 0.0 + lr_arr.append(new_lr) + + place = fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + adam = Adam( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + beta1=0.8, + beta2=0.6, + parameters=ptb_model.parameters()) + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + np_opti_dict = {} + np_state_dict = {} + + for k, v in self.opti_dict.items(): + if isinstance(v, core.VarBase): + np_opti_dict[v.name] = v.numpy() + else: + np_opti_dict[k] = v + + for k, v in self.state_dict.items(): + np_state_dict[k] = v.numpy() + + adam.set_state_dict(np_opti_dict) + ptb_model.set_dict(np_state_dict) + for i in range(1): + x_data = np.arange(12).reshape(4, 3).astype('int64') + y_data = np.arange(1, 13).reshape(4, 3).astype('int64') + y_data = y_data.reshape((-1, 1)) + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + x = to_variable(x_data) + y = to_variable(y_data) + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + dy_loss.backward() + adam.minimize(dy_loss) + ptb_model.clear_gradients() + + opti_dict = adam.state_dict() + for k, v in opti_dict.items(): + if k == "global_step": + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] + 1)) + + if k.find("beta1_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta1)) + if k.find("beta2_pow_acc_0") > 0: + self.assertTrue( + np.array_equal(v.numpy(), self.base_opti[v.name] * + adam._beta2)) + + # check parameter + + state_dict = ptb_model.state_dict() + + for k, v in state_dict.items(): + new_t = v.numpy() + + base_t = self.model_base[k] + self.assertTrue(np.array_equal(new_t, base_t)) + + def 
testOnlyLoadParams(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy')) + + self.assertTrue(opti_state_dict == None) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdparams')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy.pdopt')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 5f3e8ff737a04..9abbee173852b 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -96,8 +96,8 @@ def run_retain(self, need_retain): g = Generator() d = Discriminator() - optim_g = paddle.optimizer.Adam(parameter_list=g.parameters()) - optim_d = paddle.optimizer.Adam(parameter_list=d.parameters()) + optim_g = paddle.optimizer.Adam(parameters=g.parameters()) + optim_d = paddle.optimizer.Adam(parameters=d.parameters()) gan_criterion = paddle.nn.MSELoss() l1_criterion = paddle.nn.L1Loss() diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index eb12bc7417673..0f225758ced3b 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -20,6 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator import paddle.fluid as fluid +import paddle def create_selected_rows_and_tensor(scope, place, height, row_num, @@ -222,5 +223,59 @@ def test_rmsprop(self): size=size) +class TestRMSPropV2(unittest.TestCase): + def test_rmsprop_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. 
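As an illustrative aside (not part of the patch itself): the comment above should hold for the other 2.0 optimizers introduced in this PR as well, since they share the unified `parameters=` / `weight_decay=` arguments. A minimal sketch of the same toy step with `Adamax` swapped in (any of `Adam`, `Adamax`, `AdamW`, `RMSProp` is expected to work the same way):

    import numpy as np
    import paddle

    paddle.disable_static()
    value = np.arange(26).reshape(2, 13).astype("float32")
    a = paddle.to_tensor(value)
    linear = paddle.nn.Linear(13, 5, dtype="float32")
    # Same pattern as the test above; only the optimizer class differs.
    opt = paddle.optimizer.Adamax(
        learning_rate=0.01, parameters=linear.parameters(), weight_decay=0.01)
    out = linear(a)
    out.backward()
    opt.step()
    opt.clear_grad()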
+ adam = paddle.optimizer.RMSProp( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_rmsprop(self): + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises(ValueError, paddle.optimizer.RMSProp, None) + self.assertRaises( + ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None) + self.assertRaises( + ValueError, + paddle.optimizer.RMSProp, + learning_rate=0.1, + epsilon=None) + self.assertRaises( + ValueError, + paddle.optimizer.RMSProp, + learning_rate=0.1, + momentum=None) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 4dc3cf397aea5..7159baeb30512 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -14,21 +14,25 @@ __all__ = [ 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', - 'Adamax', 'AdamaxOptimizer', 'AdamOptimizer', 'DecayedAdagrad', - 'DecayedAdagradOptimizer', 'DGCMomentumOptimizer', 'Dpsgd', - 'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', - 'LambOptimizer', 'LarsMomentum', 'LarsMomentumOptimizer', - 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer', - 'PipelineOptimizer', 'RecomputeOptimizer', 'RMSPropOptimizer', 'SGD', - 'SGDOptimizer' + 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', + 'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer', + 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer', + 'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer', + 'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer', + 'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer' ] -from ..fluid.optimizer import SGD, Momentum, Adagrad, Adam, Adamax, Dpsgd, DecayedAdagrad, \ - Ftrl, SGDOptimizer, MomentumOptimizer, AdagradOptimizer, \ - AdamOptimizer, AdamaxOptimizer, DpsgdOptimizer, \ - DecayedAdagradOptimizer, RMSPropOptimizer, FtrlOptimizer, Adadelta, \ - AdadeltaOptimizer, ModelAverage, LarsMomentum, \ - LarsMomentumOptimizer, DGCMomentumOptimizer, LambOptimizer, \ +from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \ + Ftrl, Adadelta, \ + SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\ + DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \ + ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\ ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \ - RecomputeOptimizer + RecomputeOptimizer, LarsMomentumOptimizer + +from .optimizer import Optimizer +from .adam import Adam +from .adamw import AdamW +from .adamax import Adamax +from .rmsprop import 
RMSProp
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
new file mode 100644
index 0000000000000..0da8053fe8a34
--- /dev/null
+++ b/python/paddle/optimizer/adam.py
@@ -0,0 +1,246 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable
+
+__all__ = ["Adam"]
+
+
+class Adam(Optimizer):
+ """
+ The Adam optimizer implements the optimization technique described at the end
+ of section 2 of the `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
+ which can dynamically adjust the learning rate of each parameter using
+ the 1st moment estimates and the 2nd moment estimates of the gradient.
+
+ The parameter ``param_out`` update rule with gradient ``grad``:
+
+ .. math::
+
+ t & = t + 1
+
+ moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+
+ moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+ learning\_rate & = learning\_rate * \\
+ \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+
+ param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+
+ Related paper: `Adam: A Method for Stochastic Optimization <https://arxiv.org/abs/1412.6980>`_
+
+ Args:
+ learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+ It can be a float value or a LearningRateDecay. The default value is 0.001.
+ beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
+ It should be a float number or a Tensor with shape [1] and data type as float32.
+ The default value is 0.9.
+ beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
+ It should be a float number or a Tensor with shape [1] and data type as float32.
+ The default value is 0.999.
+ epsilon (float, optional): A small float value for numerical stability.
+ The default value is 1e-08.
+ parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, at this time all parameters will be updated.
+ weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+ It can be a float value as the coefficient of L2 regularization or \
+ :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+ If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
+ the regularization setting here in the optimizer will be ignored for this parameter. \
+ Otherwise, the regularization setting here in the optimizer will take effect. \
+ Default None, meaning there is no regularization.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). 
Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + out.backward() + adam.step() + adam.clear_grad() + + .. code-block:: python + + # Adam with beta1/beta2 as Tensor and weight_decay as float + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + + """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(Adam, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adam" + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + self._lazy_mode = lazy_mode + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + # Create accumulator tensors for first and second moments + for p in parameters: + self._add_accumulator(self._moment1_acc_str, p) + self._add_accumulator(self._moment2_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment1 = self._get_accumulator(self._moment1_acc_str, + param_and_grad[0]) + moment2 = self._get_accumulator(self._moment2_acc_str, + param_and_grad[0]) + beta1_pow_acc = 
self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + # create the adam optimize op + + if framework.in_dygraph_mode(): + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + _, _, _, _, _ = core.ops.adam( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, + moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, + 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', + 1000, 'beta1', _beta1, 'beta2', _beta2) + + return None + + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "LearningRate": [lr], + "Moment1": [moment1], + "Moment2": [moment2], + "Beta1Pow": [beta1_pow_acc], + "Beta2Pow": [beta2_pow_acc] + } + outputs = { + "ParamOut": [param_and_grad[0]], + "Moment1Out": [moment1], + "Moment2Out": [moment2], + "Beta1PowOut": [beta1_pow_acc], + "Beta2PowOut": [beta2_pow_acc], + } + attrs = { + "epsilon": self._epsilon, + "lazy_mode": self._lazy_mode, + "min_row_size_to_use_multithread": 1000 + } + + if isinstance(self._beta1, Variable): + inputs['Beta1Tensor'] = self._beta1 + else: + attrs['beta1'] = self._beta1 + if isinstance(self._beta2, Variable): + inputs['Beta2Tensor'] = self._beta2 + else: + attrs['beta2'] = self._beta2 + + adam_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + + return adam_op diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py new file mode 100644 index 0000000000000..73a78b17cbba5 --- /dev/null +++ b/python/paddle/optimizer/adamax.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable, name_scope + +__all__ = ["Adamax"] + + +class Adamax(Optimizer): + """ + The Adamax optimizer is implemented based on the Adamax Optimization + in Section 7 of `Adam paper `_. + The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, + which makes the learning rate update algorithm more stable and simple. + + The parameter ``param_out`` update rule with gradient ``grad``: + + .. 
math:: + + t & = t + 1 + + moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad + + inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) + + learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} + + param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + + Related paper: `Adam: A Method for Stochastic Optimization `_ + + The original paper does not have an ``epsilon`` attribute, + it is added here for numerical stability to prevent the division by 0 error. + + Args: + learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. The default value is 0.001. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. + The default value is 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + The default value is 1e-08. + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + **Notes**: + **Currently, Adamax doesn't support sparse parameter optimization.** + + Examples: + .. 
code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adamax(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + + """ + _moment_acc_str = "moment" + _inf_norm_acc_str = "inf_norm" + _beta1_pow_acc_str = "beta1_pow_acc" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(Adamax, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name) + self.type = "adamax" + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + + def _create_accumulators(self, block, parameters): + # Create accumulator tensors for first moment and infinity norm + for p in parameters: + self._add_accumulator(self._moment_acc_str, p) + self._add_accumulator(self._inf_norm_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=self._beta1, + shape=[1]) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) + inf_norm = self._get_accumulator(self._inf_norm_acc_str, + param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + # create the adamax optimize op + adamax_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._create_param_lr(param_and_grad), + "Moment": moment, + "InfNorm": inf_norm, + "Beta1Pow": beta1_pow_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": moment, + "InfNormOut": inf_norm + }, + attrs={ + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon + }, + stop_gradient=True) + + return adamax_op + + def _finish_update(self, block, parameters_and_grads): + """Update Beta1 Power accumulator + """ + assert isinstance(block, framework.Block) + for param, grad in parameters_and_grads: + if grad is None or param.trainable is False: + continue + with param.block.program._optimized_guard( + [param, grad]), name_scope('adamax'): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}, + stop_gradient=True) diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py new file mode 100644 index 0000000000000..f498fcbffa24e --- /dev/null +++ b/python/paddle/optimizer/adamw.py @@ -0,0 +1,233 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from .adam import Adam +from ..fluid import framework +import paddle +__all__ = ['AdamW'] + + +class DecoupledWeightDecay(object): + def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs): + if not isinstance(coeff, float) and \ + not isinstance(coeff, framework.Variable): + raise TypeError("coeff should be float or Tensor.") + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._coeff = coeff + super(DecoupledWeightDecay, self).__init__(**kwargs) + + def _scale_parameters(self, params_and_grads): + """ + Adds weight decay ops. + scaled_parameter = parameter * coeff + + Args: + params_and_grads: A list of (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. + """ + if isinstance(self._coeff, float) and self._coeff == 0.0: + return + + scaled_params = [] + for param, grad in params_and_grads: + # If no gradient then we don't need to do anything + if grad is None: + continue + if self._apply_decay_param_fun is not None \ + and not self._apply_decay_param_fun(param.name): + continue + + if isinstance(self._coeff, float): + assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \ + "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype) + else: + assert self._coeff.dtype == param.dtype, \ + "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype) + + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + assert param.name not in self._params_name + scaled_params.append((param, grad, param * self._coeff)) + self._params_name.add(param.name) + return scaled_params + + def backward(self, **kargs): + return super(DecoupledWeightDecay, self).backward(**kargs) + + def _apply_optimize(self, **kargs): + return super(DecoupledWeightDecay, self)._apply_optimize(**kargs) + + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + params_grads = self.backward( + loss=loss, + startup_program=startup_program, + parameters=parameters, + no_grad_set=no_grad_set) + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + paddle.fluid.layers.assign(input=updated_param, output=param) + + optimize_ops = self._apply_optimize( + loss=loss, + params_grads=params_grads, + startup_program=startup_program) + return optimize_ops, params_grads + + @framework.dygraph_only + def step(self): + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in 
scaled_params:
+ param, grad, scaled_param = p_grad_sgrad
+ with param.block.program._optimized_guard(
+ [param, grad]), framework.name_scope('weight decay'):
+ updated_param = paddle.fluid.layers.elementwise_sub(
+ x=param, y=scaled_param)
+ paddle.fluid.layers.assign(input=updated_param, output=param)
+ optimize_ops = self._apply_optimize(
+ loss=None, startup_program=None, params_grads=params_grads)
+
+ def __str__(self):
+ return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
+
+
+class AdamW(DecoupledWeightDecay, Adam):
+ """
+ The AdamW optimizer is implemented based on the AdamW Optimization
+ in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/abs/1711.05101>`_.
+ It resolves the problem of L2 regularization failure in the Adam optimizer.
+
+ .. math::
+
+ t & = t + 1
+
+ moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+
+ moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+ learning\_rate & = learning\_rate * \\
+ \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+
+ param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param)
+
+
+ Args:
+ learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+ It can be a float value or a LearningRateDecay. The default value is 0.001.
+ parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \
+ This parameter is required in dygraph mode. \
+ The default value is None in static mode, at this time all parameters will be updated.
+ beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
+ It should be a float number or a Tensor with shape [1] and data type as float32.
+ The default value is 0.9.
+ beta2 (float|Tensor, optional): The exponential decay rate for the 2nd moment estimates.
+ It should be a float number or a Tensor with shape [1] and data type as float32.
+ The default value is 0.999.
+ epsilon (float, optional): A small float value for numerical stability.
+ The default value is 1e-08.
+ weight_decay (float|Tensor, optional): The weight decay coefficient, it can be a float or a Tensor.
+ The default value is 0.0.
+ apply_decay_param_fun (function|None): If it is not None,
+ only the tensors whose names make apply_decay_param_fun(name)==True
+ will be decayed. It only works when we want to specify tensors.
+ Default: None.
+ grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+ some derived class of ``GradientClipBase`` . There are three clipping strategies
+ ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+ :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+ name (str, optional): Normally there is no need for user to set this property.
+ For more information, please refer to :ref:`api_guide_Name`.
+ The default value is None.
+ lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
+ The accumulators are updated at every step. Every element of the two moving-average
+ is updated in both dense mode and sparse mode. If the size of parameter is very large,
+ then the update may be very slow. Lazy mode only updates the elements that have
+ gradients in the current mini-batch, so it can be much faster. But this mode has
+ different semantics from the original Adam algorithm and may lead to different results.
+ The default value is False. 
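As an illustrative aside (not part of the patch itself), to make the decoupled decay concrete: `_scale_parameters` and `minimize`/`step` above first subtract `coeff * param` from the parameter and only then run the plain Adam update, instead of folding the decay into the gradient the way L2 regularization does. A rough numpy sketch of one such step, mirroring the implementation above; all names (`adamw_step`, `m`, `v`, `t`) are purely illustrative:

    import numpy as np

    def adamw_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999,
                   eps=1e-8, coeff=0.01):
        # Decoupled weight decay, applied directly to the parameter
        # (mirrors updated_param = param - param * coeff above).
        param = param - coeff * param
        # Plain Adam update on the decayed parameter.
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad * grad
        lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        param = param - lr_t * m / (np.sqrt(v) + eps)
        return param, m, v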
+ **Notes**: + **Currently, AdamW doesn't support sparse parameter optimization.** + + Examples: + .. code-block:: python + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.AdamW(learning_rate=0.1, + parameters=linear.parameters(), + beta1=beta1, + beta2=beta2, + weight_decay=0.01) + out.backward() + adam.step() + adam.clear_grad() + + """ + + def __init__(self, + learning_rate=0.001, + parameters=None, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=0.0, + apply_decay_param_fun=None, + grad_clip=None, + name=None, + lazy_mode=False): + args_dict = { + "learning_rate": learning_rate, + "parameters": parameters, + "beta1": beta1, + "beta2": beta2, + "epsilon": epsilon, + "grad_clip": grad_clip, + "name": name, + "lazy_mode": lazy_mode + } + super(AdamW, self).__init__( + weight_decay, + apply_decay_param_fun=apply_decay_param_fun, + **args_dict) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py new file mode 100644 index 0000000000000..2c2f6f1ce7e14 --- /dev/null +++ b/python/paddle/optimizer/optimizer.py @@ -0,0 +1,995 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import six +import logging +from collections import defaultdict + +from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard + +from ..fluid import framework +from ..fluid import layers +from ..fluid import unique_name +from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name +from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops +from ..fluid.framework import program_guard +from ..fluid.initializer import Constant +from ..fluid.layer_helper import LayerHelper +from ..fluid.layers import ops +from ..fluid.regularizer import append_regularization_ops +from ..fluid.dygraph import base as imperative_base +from ..fluid.dygraph import no_grad +from ..fluid.dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay +from paddle.fluid import core +from paddle.fluid.layers import tensor +from functools import reduce +from ..fluid.wrapped_decorator import signature_safe_contextmanager +from .. import compat as cpt + +__all__ = ['Optimizer'] + + +class Optimizer(object): + """Optimizer Base class. + + Define the common interface of an optimizer. + User should not use this class directly, + but need to use one of it's implementation. 
+ + Args: + learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``. + It can be a float value or a LearningRateDecay. + parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \ + some derived class of ``GradientClipBase`` . There are three cliping strategies \ + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , \ + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + + Returns: + Base class for optimizer. + + Examples: + .. code-block:: python + + #Take the subclass adam as an example + #Optimizer + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + out.backward() + adam.step() + adam.clear_grad() + + """ + + @imperative_base.no_grad() + def __init__(self, + learning_rate, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None): + self._parameter_list = list( + parameters) if parameters is not None else None + self._name = name + if framework.in_dygraph_mode(): + if not isinstance(learning_rate, float) and \ + not isinstance(learning_rate, LearningRateDecay): + raise TypeError( + "learning rate should be float or LearningRateDecay, got %s here" + % type(learning_rate)) + if self._parameter_list is None: + raise AttributeError( + "parameters argument given to the Optimizer should not be None in dygraph mode." + ) + if weight_decay is not None: + for param in self._parameter_list: + if param.regularizer is not None: + logging.info( + "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " + "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
+ % weight_decay.__str__()) + break + else: + if not isinstance(learning_rate, float) and \ + not isinstance(learning_rate, framework.Variable): + raise TypeError( + "learning rate should be float or Tensor, got %s here" % + type(learning_rate)) + + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + if isinstance(weight_decay, float): + from ..fluid.regularizer import L2Decay + self.regularization = L2Decay(weight_decay) + else: + self.regularization = weight_decay + self._grad_clip = grad_clip + self._learning_rate = learning_rate + # the learning rate type should be inferenced from loss + self._dtype = None + # each program should have a independent learning rate + # program -> tensor(learning_rate) + self._learning_rate_map = dict() + if isinstance(self._learning_rate, framework.Variable): + self._learning_rate_map[framework.default_main_program( + )] = self._learning_rate + # Dictionary of accumulators. Some optimizer subclasses need to + # allocate and manage extra tensors associated with the parameters + # to train. These tensors are called accumulators. + # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} + self._accumulators = defaultdict(lambda: dict()) + self.helper = None + self._opti_name_list = [] + self._accumulators_holder = {} + self._param_device_map = dict() + self.clear_gradients = self.clear_grad + + @framework.dygraph_only + def state_dict(self): + ''' + Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be include in state dict. + If the optimizer never be called(minimize function), the state_dict is empty. + + Args: + None + + Returns: + state_dict(dict) : dict contains all the Tensor used by optimizer + + Examples: + .. code-block:: python + + import paddle + paddle.disable_static() + emb = paddle.nn.Embedding([10, 10]) + + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) + state_dict = adam.state_dict() + + ''' + state_dict = {} + for k, v in self._accumulators.items(): + for para_name, var_tmp in v.items(): + state_dict[var_tmp.name] = var_tmp + # global step if use lr decay + if isinstance(self._learning_rate, LearningRateDecay): + state_dict["LR_Scheduler"] = self._learning_rate.state_dict() + + if not isinstance(self._learning_rate, _LearningRateEpochDecay): + var_tmp = None + var_temp = framework._varbase_creator( + None, name='global_step', dtype='int32') + + tensor.fill_constant( + [1], "int32", self._learning_rate.step_num, out=var_temp) + + state_dict['global_step'] = var_temp + return state_dict + + @framework.dygraph_only + def set_state_dict(self, state_dict): + ''' + Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. + + Args: + state_dict(dict) : Dict contains all the Tensor needed by optimizer + Return: + None + + Examples: + .. 
code-block:: python
+
+ import paddle
+ paddle.disable_static()
+ emb = paddle.nn.Embedding([10, 10])
+
+ state_dict = emb.state_dict()
+ paddle.framework.save(state_dict, "paddle_dy")
+
+ adam = paddle.optimizer.Adam(learning_rate=paddle.nn.functional.noam_decay( 100, 10000),
+ parameters=emb.parameters())
+ state_dict = adam.state_dict()
+ paddle.framework.save(state_dict, "paddle_dy")
+
+ para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy")
+
+ adam.set_state_dict(opti_state_dict)
+
+ '''
+
+ if isinstance(self._learning_rate, LearningRateDecay):
+ self._learning_rate.set_dict(state_dict["LR_Scheduler"])
+
+ if not isinstance(self._learning_rate, _LearningRateEpochDecay):
+ assert 'global_step' in state_dict, \
+ 'Global step not found in state dict: when LearningRateDecay is used in dygraph, global_step must be in state_dict'
+ global_step = state_dict['global_step']
+
+ if isinstance(global_step, Variable):
+ step_np = global_step
+ step_np = np.array(step_np.value().get_tensor())
+ assert step_np.shape == (1,), \
+ "global step shape should be (1,), but the loaded shape is {}".format( step_np.shape )
+
+ self._learning_rate.step_num = int(step_np[0])
+ elif isinstance(global_step, np.ndarray):
+ assert global_step.shape == (1,), \
+ "global step shape should be (1,), but the loaded shape is {}".format( global_step.shape )
+ self._learning_rate.step_num = global_step[0]
+ else:
+ raise RuntimeError(
+ "Type not supported, the value in state dict must be [VarBase, Tensor, numpy], but the type is ",
+ type(global_step))
+
+ self._accumulators_holder = state_dict
+ for k, v in self._accumulators.items():
+ for para_name, var_tmp in v.items():
+ assert var_tmp.name in state_dict, \
+ "optimizer Tensor {} not found".format( var_tmp.name )
+ var = var_tmp.value()
+ tensor = var.get_tensor()
+ model_np = np.array(tensor)
+
+ load_para = state_dict[var_tmp.name]
+
+ if isinstance(load_para, Variable):
+ load_para_np = load_para.numpy()
+ elif isinstance(load_para, core.VarBase):
+ load_para_np = load_para.numpy()
+ elif isinstance(load_para, np.ndarray):
+ load_para_np = load_para
+ else:
+ raise RuntimeError("State dict type {} not supported".format(
+ str(type(load_para))))
+
+ assert model_np.shape == load_para_np.shape, \
+ "Parameter shape does not match: Dygraph Parameter [ {} ] needs a tensor with shape {} but the loaded tensor has shape {}".format(
+ var_tmp.name, model_np.shape, load_para_np.shape)
+
+ assert model_np.dtype == load_para_np.dtype, \
+ "Parameter dtype does not match: Dygraph Parameter [ {} ] needs a tensor with dtype {} but the loaded tensor has dtype {}".format(
+ var_tmp.name, model_np.dtype, load_para_np.dtype)
+
+ tensor.set(load_para_np, framework._current_expected_place())
+
+ def get_opti_var_name_list(self):
+ return self._opti_name_list
+
+ def _create_global_learning_rate(self):
+ if imperative_base.enabled():
+ # create learning rate tensor
+ if isinstance(self._learning_rate, float):
+ lr = self._global_learning_rate()
+
+ if isinstance(lr, framework.Variable):
+ return
+ else:
+ self._learning_rate_map[framework.default_main_program(
+ )] = layers.create_global_var(
+ name=unique_name.generate("learning_rate"),
+ shape=[1],
+ value=float(self._learning_rate),
+ dtype='float32' if self._dtype is None else self._dtype,
+ persistable=True)
+ # get learning rate Tensor from LearningRateDecay
+ elif isinstance(self._learning_rate, LearningRateDecay):
+ self._learning_rate_map[framework.default_main_program(
+ )] = self._learning_rate()
+ else:
+ raise TypeError(
+ "optimizer's learning rate must be float or LearningRateDecay"
+ )
+ else:
+ lr 
= self._global_learning_rate() + + if isinstance(lr, framework.Variable): + return + else: + if not isinstance(self._learning_rate, float): + raise TypeError( + "learning rate Tensor is create outside optimizer," + "can not create new learning rate Tensor for new program" + ) + + # create learning rate in the current main program + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32' if self._dtype is None else self._dtype, + persistable=True) + + @framework.dygraph_only + def set_lr(self, value): + """ + :api_attr: imperative + + Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay, + this API cannot be invoked, because it will lead to conflict. + + Args: + value (float|Tensor): the value of learning rate + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) + + # set learning rate manually by python float value + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.get_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 + + + # set learning rate manually by framework Tensor + lr_var = paddle.create_global_var( + shape=[1], value=0.7, dtype='float32') + adam.set_lr(lr_var) + lr = adam.get_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.7 + + + + """ + if not isinstance(value, (framework.Variable, float)): + raise TypeError( + "The type of 'value' in optimizer.set_lr must be (float, Tensor), but received %s." + % (type(value))) + if isinstance(self._learning_rate, LearningRateDecay): + raise RuntimeError( + "optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict." + ) + if isinstance(value, float): + self._learning_rate = value + current_lr = self._global_learning_rate() + if current_lr is not None: + global_block = framework.default_main_program().global_block() + global_block.append_op( + type='fill_constant', + outputs={'Out': [current_lr]}, + attrs={ + 'dtype': current_lr.dtype, + 'shape': list(current_lr.shape), + 'value': float(value) + }, + stop_gradient=True) + else: + assert len(value.shape) == 1 and value.shape[ + 0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]" + self._learning_rate_map[framework.default_main_program()] = value + + @framework.dygraph_only + def get_lr(self): + """ + :api_attr: imperative + + Get current step learning rate. The return value is all the same When LearningRateDecay is not used, + otherwise return the step learning rate. + + Returns: + float: The learning rate of the current step. + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + # example1: LearningRateDecay is not used, return value is all the same + paddle.disable_static() + emb = paddle.nn.Embedding([10, 10]) + adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) + lr = adam.get_lr() + print(lr) # 0.001 + + # example2: PiecewiseDecay is used, return the step learning rate + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.reduce_mean(out) + + bd = [2, 4, 6, 8] + value = [0.2, 0.4, 0.6, 0.8, 1.0] + adam = paddle.optimizer.Adam(paddle.PiecewiseDecay(bd, value, 0), + parameters=linear.parameters()) + + # first step: learning rate is 0.2 + np.allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0) # True + + # learning rate for different steps + ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0] + for i in range(12): + adam.step() + lr = adam.get_lr() + np.allclose(lr, ret[i], rtol=1e-06, atol=0.0) # True + + """ + current_lr = self._global_learning_rate() + if isinstance(current_lr, framework.Variable): + return self._global_learning_rate().numpy()[0] + + if isinstance(self._learning_rate, float): + return self._learning_rate + elif isinstance(self._learning_rate, _LearningRateEpochDecay): + step_lr = self._learning_rate() + return step_lr.numpy()[0] + else: + step_lr = self._learning_rate.step() + if isinstance(step_lr, (float, int)): + return step_lr + else: + return step_lr.numpy()[0] + + def _global_learning_rate(self, program=None): + """ + get global decayed learning rate + :return: + """ + if program is None: + program = framework.default_main_program() + return self._learning_rate_map.get(program, None) + + def _append_optimize_op(self, block, param_and_grad): + """ append optimize operator to block and return all the added optimize_op + """ + raise NotImplementedError( + "Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\"" + ) + + def _create_param_lr(self, param_and_grad): + # create learning rate tensor for every parameter + param = param_and_grad[0] + param_lr = param.optimize_attr['learning_rate'] + if type(param_lr) == Variable: + return param_lr + else: + if param_lr == 1.0: + return self._global_learning_rate() + else: + with default_main_program()._lr_schedule_guard( + is_with_opt=True), framework.name_scope( + 'scale_with_param_lr'): + return self._global_learning_rate() * param_lr + + def _create_accumulators(self, block, parameters): + """Create all accumulators needed by the parameters + + Args: + block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer + """ + pass + + def _finish_update(self, block, parameters_and_grads): + """Finish any custom updates needed + before completing an optimization step + + Args: + block: the block in which the loss tensor is present + parameters: list of parameter tensors for the optimizer + + Returns: + None + """ + pass + + def _add_accumulator(self, + name, + param, + dtype=None, + fill_value=0.0, + shape=None, + type=None, + device=None): + """Utility function to add an accumulator for a parameter + + Args: + block: the block in which the loss tensor is present + name: name of the accumulator + param: parameter tensor for which accumulator is to be added + dtype: data type of the accumulator tensor + fill_value: value to initialize the accumulator tensor + """ + if self._name is 
not None: + name = self._name + "_" + name + if (name in self._accumulators and + param.name in self._accumulators[name]): + if framework.in_dygraph_mode(): + return self._accumulators[name][param.name] + raise Exception("Accumulator {} already exists for parameter {}". + format(name, param.name)) + if shape == None: + shape = param.shape + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_" + name + var_name = unique_name.generate(var_name) + self._opti_name_list.append(var_name) + + var = self.helper.create_global_variable( + name=var_name, + persistable=True, + dtype=dtype or param.dtype, + type=param.type if type is None else type, + shape=shape, + belong_to_optimizer=True) + if device is None: + device = self._get_device_for_param(param.name) + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value))) + + if framework.in_dygraph_mode(): + if len(self._accumulators_holder) > 0: + assert var_name in self._accumulators_holder, \ + "Optimizer set error, {} should in state dict".format( var_name ) + var.set_value(self._accumulators_holder[var_name]) + + self._accumulators[name][param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + + Args: + name: name of the accumulator + param: parameter tensor for which accumulator is to be fetched + + Returns: + accumulator tensor for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + if (name not in self._accumulators or + param.name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, param.name)) + return self._accumulators[name][param.name] + + def _update_param_device_map(self, parameters_and_grads, target_block): + for param_and_grad in parameters_and_grads: + if param_and_grad[0].trainable is True: + param_name = param_and_grad[0].name + ops = target_block.ops + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + ) + for op in ops: + input_arg_names = op.input_arg_names + if param_name in input_arg_names: + self._param_device_map[param_name] = op.attr( + device_attr_name) + break + + def _get_device_for_param(self, param_name): + device = None + if param_name in self._param_device_map: + device = self._param_device_map[param_name] + return device + + def _create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to tensors. + + Args: + parameters_and_grads(list(tuple(Tensor, Tensor))): + a list of (tensor, gradient) pair to update. + + Returns: + return_op_list: a list of operators that will complete one step of + optimization. This will include parameter update ops, global step + update ops and any other custom ops required by subclasses to manage + their internal state. + """ + # This is a default implementation of create_optimization_pass that + # can be shared by most optimizers. This implementation assumes that + # the subclass will implement the _append_optimize_op method and the + # _initialize_tensors method. The subclass can extend the + # _create_accumulators method if it needs to create accumulators + # for parameters and extend _finish_update method to add custom ops. 
+ + # Allways called under program_guard use global block as loss block + # But if current block is in control flow, append optimize op in the + # grad block of current block + + global_block = framework.default_main_program().global_block() + target_block = global_block + current_block = framework.default_main_program().current_block() + if current_block.idx != global_block.idx: + assert current_block.backward_block_idx != -1, \ + "current block is not global_block, but it doesn't have backward block." + target_block = framework.default_main_program().blocks[ + current_block.backward_block_idx] + + start = len(target_block.ops) + self.helper = LayerHelper(self.__class__.__name__) + self._update_param_device_map(parameters_and_grads, target_block) + self._create_accumulators( + target_block, + [p[0] for p in parameters_and_grads if p[0].trainable]) + self._create_global_learning_rate() + + if framework.in_dygraph_mode(): + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + if param_and_grad[0].trainable is True: + self._append_optimize_op(target_block, param_and_grad) + else: + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: + device = self._get_device_for_param(param_and_grad[0] + .name) + with device_guard(device): + optimize_op = self._append_optimize_op( + target_block, param_and_grad) + + # Get custom finish ops for subclasses + # FIXME: Need to fix this once we figure out how to handle dependencies + self._finish_update(target_block, parameters_and_grads) + + end = len(target_block.ops) + return target_block._slice_ops(start, end) + + def _append_dgc_ops(self, param_and_grad): + pass + + def backward(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None, + callbacks=None): + """ + The first part of ``minimize``, do auto-diff to append backward operations for + the current program. + + Args: + loss (Tensor): ``loss`` tensor to run optimizations. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameters``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need + to be updated. The default value is None. + callbacks (list, optional): list of callable objects to run when appending backward + operator for one parameter. The default value is None. + + Return: + list: list of (param, grad) tensor pairs, param is ``Parameter``, + grad is the gradient value corresponding to the parameter. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. 
+ adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + """ + act_no_grad_set = None + if framework.in_dygraph_mode(): + pass + else: + act_no_grad_set = self._get_no_grad_set(loss, no_grad_set) + + self._dtype = loss.dtype + if framework.in_dygraph_mode(): + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + # create gradient tensor + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + else: + if callbacks is None: + callbacks = [error_clip_callback] + else: + assert (isinstance(callbacks, list)) + program = loss.block.program + assert len(loss.shape) == 1 and loss.shape[0] == 1, \ + "The loss.shape should be (1L,), but the current loss.shape is {}. " \ + "Maybe that you should call paddle.mean to process the current loss.".format( + loss.shape) + parameter_list = parameters if parameters \ + else self._parameter_list + with program_guard(program, startup_program): + params_grads = append_backward(loss, parameter_list, + act_no_grad_set, callbacks) + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. + self._append_dgc_ops(params_grads) + return params_grads + + def apply_gradients(self, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + + Args: + params_grads (list): list of (param, grad) pair to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + optimizer = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters()) + params_grads = optimizer.backward(loss) + optimizer.apply_gradients(params_grads) + + """ + + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + # 'optimizer(grad_clip)' or 'set_gradient_clip' + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) + else: + + params_grads = append_gradient_clip_ops(params_grads) + + # Add regularization if any + params_grads = append_regularization_ops(params_grads, + self.regularization) + + optimize_ops = self._create_optimization_pass(params_grads) + return optimize_ops + + def _apply_optimize(self, loss, startup_program, params_grads): + """ + Second part of `minimize`, appending optimization operators for + given `params_grads` pairs. + Args: + loss (Tensor): loss tensor to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameters`. + params_grads (list): list of (param, grad) pair to do optimization. + Returns: + list: A list of operators appended to the current program. 
+ """ + if framework.in_dygraph_mode(): + with program_guard(framework.default_main_program(), + framework.default_startup_program()): + if self._grad_clip is not None: + params_grads = self._grad_clip(params_grads) + params_grads = append_regularization_ops(params_grads, + self.regularization) + optimize_ops = self._create_optimization_pass(params_grads) + else: + program = loss.block.program + with program_guard(program, startup_program): + optimize_ops = self.apply_gradients(params_grads) + return optimize_ops + + def _get_no_grad_set(self, loss, no_grad_set=None): + no_grad_set = _get_no_grad_set_name(no_grad_set) + parameters = loss.block.program.global_block().all_parameters() + param_no_trainable = set( + [param.name for param in parameters if param.trainable is False]) + # If the parameter is no trainable, it should not have a gradient. + no_grad_set.update(param_no_trainable) + + return no_grad_set + + @framework.dygraph_only + def clear_grad(self): + """ + Clear the gradients of all optimized parameters for model. + + Returns: + None + + Examples: + .. code-block:: python + + import numpy as np + import paddle + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + + """ + for p in self._parameter_list: + if p.trainable: + p.clear_gradient() + + @imperative_base.no_grad() + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + """ + Add operations to minimize ``loss`` by updating ``parameters``. + + Args: + loss (Tensor): A ``Tensor`` containing the value to minimize. + startup_program (Program, optional): :ref:`api_fluid_Program` for + initializing parameters in ``parameters``. The default value + is None, at this time :ref:`api_fluid_default_startup_program` will be used. + parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update + to minimize ``loss``. The default value is None, at this time all parameters + will be updated. + no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need + to be updated. The default value is None. + + Returns: + tuple: tuple (optimize_ops, params_grads), A list of operators appended + by minimize and a list of (param, grad) tensor pairs, param is + ``Parameter``, grad is the gradient value corresponding to the parameter. + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and + ``fetch_list`` before run, see details in ``Executor``. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + + place = fluid.CPUPlace() + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + adam_optimizer = paddle.optimizer.Adam(0.01) + adam_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + """ + assert isinstance(loss, Variable), "The loss should be an Tensor." + + parameter_list = parameters if parameters \ + else self._parameter_list + params_grads = self.backward( + loss, + startup_program=startup_program, + parameters=parameter_list, + no_grad_set=no_grad_set) + + optimize_ops = self._apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + + return optimize_ops, params_grads + + @framework.dygraph_only + def step(self): + """ + Execute the optimizer once. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5, dtype="float32") + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + """ + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py new file mode 100644 index 0000000000000..0bc4c9bfd53dc --- /dev/null +++ b/python/paddle/optimizer/rmsprop.py @@ -0,0 +1,207 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .optimizer import Optimizer +from ..fluid import core +from ..fluid import framework +from ..fluid.framework import Variable + +__all__ = ["RMSProp"] + + +class RMSProp(Optimizer): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning + rate method. The original slides proposed RMSProp: Slide 29 of + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . + + The original equation is as follows: + + .. 
math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates the moving average of the squared gradient for
+    each weight; the gradient is then divided by :math:`\\sqrt{r(w,t)}`.
+
+    In some cases, adding a momentum term :math:`\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    .. math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    If centered is True:
+
+    .. math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    where :math:`\\rho` is a hyperparameter (typical values are 0.9, 0.95
+    and so on), :math:`\\beta` is the momentum term, and :math:`\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in the range
+    from 1e-4 to 1e-8.
+
+
+    Parameters:
+        learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``.
+            It can be a float value or a LearningRateDecay.
+        rho(float): :math:`\\rho` in the equation, default is 0.95.
+        epsilon(float): :math:`\\epsilon` in the equation is a smoothing term to
+            avoid division by zero, default is 1e-6.
+        momentum(float): :math:`\\beta` in the equation is the momentum term,
+            default is 0.0.
+        centered(bool): If True, gradients are normalized by the estimated variance of
+            the gradient; if False, by the uncentered second moment. Setting this to
+            True may help with training, but is slightly more expensive in terms of
+            computation and memory. Defaults to False.
+        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as the coefficient of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in the optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in the optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str, optional): This parameter is used by developers to print debugging information. \
+            For details, please refer to :ref:`api_guide_Name`. Default is None.
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon or momentum is None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.disable_static()
+            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            linear = paddle.nn.Linear(10, 10)
+            inp = paddle.to_tensor(inp)
+            out = linear(inp)
+            loss = paddle.mean(out)
+
+            rmsprop = paddle.optimizer.RMSProp(learning_rate=0.1,
+                                               parameters=linear.parameters(),
+                                               weight_decay=0.01)
+            out.backward()
+            rmsprop.step()
+            rmsprop.clear_grad()
+
+    """
+
+    _momentum_acc_str = "momentum"
+    _mean_square_acc_str = "mean_square"
+    _mean_grad_acc_str = "mean_grad"
+
+    def __init__(self,
+                 learning_rate,
+                 rho=0.95,
+                 epsilon=1.0e-6,
+                 momentum=0.0,
+                 centered=False,
+                 parameters=None,
+                 weight_decay=None,
+                 grad_clip=None,
+                 name=None):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if momentum is None:
+            raise ValueError("momentum is not set.")
+
+        super(RMSProp, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name)
+
+        self.type = "rmsprop"
+        self._rho = rho
+        self._epsilon = epsilon
+        self._momentum = momentum
+        self._centered = centered
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._momentum_acc_str, p)
+            self._add_accumulator(self._mean_square_acc_str, p)
+            self._add_accumulator(self._mean_grad_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        momentum_acc = self._get_accumulator(self._momentum_acc_str,
+                                             param_and_grad[0])
+        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
+                                                param_and_grad[0])
+        mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
+                                              param_and_grad[0])
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "MeanGrad": mean_grad_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc,
+                "MeanGradOut": mean_grad_acc
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum,
+                "centered": self._centered
+            },
+            stop_gradient=True)
+
+        return rmsprop_op
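
For readers tracing the new RMSProp class, the docstring equations above describe the update that the appended "rmsprop" op performs on the Param/Moment/MeanSquare/MeanGrad tensors. The following NumPy sketch restates those equations only; it is not the kernel itself, and the function and argument names are made up for illustration:

    import numpy as np

    def rmsprop_update(w, grad, mean_square, mean_grad, moment,
                       lr=0.1, rho=0.95, momentum=0.0, epsilon=1e-6,
                       centered=False):
        # mean_square -> "mean_square" accumulator, r(w, t) in the docstring
        # mean_grad   -> "mean_grad" accumulator,   g(w, t) (centered only)
        # moment      -> "momentum" accumulator,    v(w, t)
        mean_square = rho * mean_square + (1.0 - rho) * grad * grad
        if centered:
            mean_grad = rho * mean_grad + (1.0 - rho) * grad
            denom = np.sqrt(mean_square - mean_grad ** 2 + epsilon)
        else:
            denom = np.sqrt(mean_square + epsilon)
        moment = momentum * moment + lr * grad / denom
        w = w - moment
        return w, mean_square, mean_grad, moment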
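
Pulling the pieces of the new paddle.optimizer API together, here is a minimal dygraph sketch of the intended step/clear_grad/state_dict flow. It only recombines calls that appear in the docstring examples above (paddle.framework.save/load are used exactly as in the set_state_dict example); the save path and variable names are illustrative:

    import numpy as np
    import paddle

    paddle.disable_static()
    inp = paddle.to_tensor(
        np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32"))
    linear = paddle.nn.Linear(10, 10)
    rmsprop = paddle.optimizer.RMSProp(
        learning_rate=0.1, parameters=linear.parameters())

    for _ in range(3):
        loss = paddle.mean(linear(inp))
        loss.backward()
        rmsprop.step()        # applies the rmsprop op to every trainable parameter
        rmsprop.clear_grad()  # clear gradients before the next iteration

    # save and restore both the layer parameters and the optimizer accumulators
    paddle.framework.save(linear.state_dict(), "rmsprop_dy")
    paddle.framework.save(rmsprop.state_dict(), "rmsprop_dy")
    para_state_dict, opti_state_dict = paddle.framework.load("rmsprop_dy")
    rmsprop.set_state_dict(opti_state_dict)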