[WIP] update optimizer for 2.0 (#26288)
refine Optimizer/Adam/Adamax/RMSProp && add AdamW

* bug fix

* update comment

* unify argument placement; notest

* fix ut, test=develop

* bug fix

* fix conflicts, test=develop

* add examples code

* bug fix

* fix comments

* fix sample code

* add sample code for Optimizer

* add adamax ut, test=develop

* fix rmsprop ut, test=develop

* add ut for optimizer.py and adamw.py

* remove TestAdamOptimizerBetaVariable

* update api && add ut

* update doc && fix ut

* add ut

Co-authored-by: mapingshuo <mps2012@yeah.net>
MRXLT and mapingshuo authored Aug 23, 2020
1 parent e2b82e0 commit eeda90d
Showing 28 changed files with 3,992 additions and 76 deletions.
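
As a quick orientation before the per-file hunks, here is a minimal dygraph sketch of the refactored 2.0-style optimizer API that the new unit tests in this commit exercise. The argument names (parameters, weight_decay, grad_clip) and methods (step, clear_gradients, state_dict, set_lr) are taken from the test code below; treat this as an illustration of the intended usage, not as final documentation.

import numpy as np
import paddle
import paddle.fluid as fluid

# Mirrors the dygraph usage exercised by test_adam_op.py in this diff.
paddle.disable_static()
linear = paddle.nn.Linear(13, 5)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
adam = paddle.optimizer.Adam(
    learning_rate=0.01,
    parameters=linear.parameters(),
    weight_decay=0.01,
    grad_clip=clip)

x = paddle.to_tensor(np.arange(26).reshape(2, 13).astype("float32"))
loss = paddle.mean(linear(x))
loss.backward()
adam.step()
adam.clear_gradients()

# Optimizer state can be captured and restored, and the learning rate adjusted.
state = adam.state_dict()
adam.set_state_dict(state)
adam.set_lr(0.001)
assert adam.get_lr() == 0.001
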
4 changes: 3 additions & 1 deletion python/paddle/fluid/optimizer.py
@@ -40,6 +40,7 @@
from functools import reduce
from .wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
import paddle

__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -3690,7 +3691,8 @@ def train_reader():
def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
if framework.in_dygraph_mode():
raise Exception("In dygraph, don't support PipelineOptimizer.")
if not isinstance(optimizer, Optimizer):
if not isinstance(optimizer, Optimizer) and not isinstance(
optimizer, paddle.optimizer.Optimizer):
raise ValueError("The 'optimizer' parameter for "
"PipelineOptimizer must be an instance of "
"Optimizer, but the given type is {}.".format(
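
The relaxed isinstance check above lets PipelineOptimizer accept either optimizer family. A minimal sketch of the intent, as my own illustration (built in static mode so neither optimizer needs a parameter list; paddle.enable_static is assumed to be available in this 2.0 codebase):

import paddle
import paddle.fluid as fluid

paddle.enable_static()  # assumption: static mode, so no parameter list is required

# Both the legacy fluid optimizer and the new paddle.optimizer family should
# now satisfy the broadened type check used by PipelineOptimizer.
legacy_opt = fluid.optimizer.SGD(learning_rate=0.01)
v2_opt = paddle.optimizer.Adam(learning_rate=1e-3)
for opt in (legacy_opt, v2_opt):
    assert isinstance(opt, (fluid.optimizer.Optimizer, paddle.optimizer.Optimizer))
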
144 changes: 105 additions & 39 deletions python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -20,6 +20,7 @@
from paddle.fluid import core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
import paddle


class TestAdamOp1(OpTest):
@@ -401,46 +402,111 @@ def test_check_output(self):
self.check_output()


class TestAdamOptimizerBetaVariable(unittest.TestCase):
def test_adam_optimizer(self):
def test_with_place(place, shape):
exe = fluid.Executor(place)

train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
data = fluid.data(name="data", shape=shape)
conv = fluid.layers.conv2d(data, 8, 3)
loss = fluid.layers.reduce_mean(conv)

beta1 = fluid.layers.create_global_var(
shape=[1],
value=0.85,
dtype='float32',
persistable=True)
beta2 = fluid.layers.create_global_var(
shape=[1],
value=0.95,
dtype='float32',
persistable=True)
opt = fluid.optimizer.Adam(
learning_rate=1e-5, beta1=beta1, beta2=beta2)
opt.minimize(loss)

exe.run(startup)
data_np = np.random.random(shape).astype('float32')
rets = exe.run(train_prog,
feed={"data": data_np},
fetch_list=[loss])
assert rets[0] is not None

class TestAdamOpV2(unittest.TestCase):
def test_adam_op(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
test_with_place(place, shape)
exe = fluid.Executor(place)
train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
data = fluid.data(name="data", shape=shape)
conv = fluid.layers.conv2d(data, 8, 3)
loss = fluid.layers.reduce_mean(conv)

beta1 = fluid.layers.create_global_var(
shape=[1], value=0.85, dtype='float32', persistable=True)
beta2 = fluid.layers.create_global_var(
shape=[1], value=0.95, dtype='float32', persistable=True)
betas = [beta1, beta2]
opt = paddle.optimizer.Adam(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
opt.minimize(loss)

exe.run(startup)
data_np = np.random.random(shape).astype('float32')
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None

def test_adam_op_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")

adam = paddle.optimizer.Adam(
learning_rate=0.01, parameters=linear.parameters())
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()

def test_adam_op_with_state_dict(self):

import paddle
paddle.disable_static()
emb = paddle.nn.Embedding([10, 10])

adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters())
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)

# learning_rate is a Decay schedule
learning_rate = fluid.dygraph.CosineDecay(0.1, 10000, 120)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate,
weight_decay=fluid.regularizer.L2Decay(0.001),
parameters=emb.parameters())
lr = adam.get_lr()
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)

# learning_rate is a Tensor
with self.assertRaises(TypeError):
learning_rate = np.array([0.01]).astype("float32")
learning_rate = paddle.to_tensor(learning_rate)
adam = paddle.optimizer.Adam(
learning_rate=learning_rate, parameters=emb.parameters())

params = adam.get_opti_var_name_list()
assert (params is not None)

def test_adam_with_grad_clip(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = fluid.dygraph.to_variable(value)
linear = fluid.Linear(13, 5, dtype="float32")
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
adam = paddle.optimizer.Adam(
0.1, parameters=linear.parameters(), grad_clip=clip)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()

def test_adam_op_with_set_lr(self):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters())

lr = 0.01
adam.set_lr(lr)
cur_lr = adam.get_lr()
assert (lr == cur_lr)

lr_var = paddle.create_global_var(shape=[1], value=lr, dtype='float32')
adam.set_lr(lr_var)
cur_lr = adam.get_lr()
assert (np.float32(lr) == cur_lr)

with self.assertRaises(TypeError):
lr = int(1)
adam.set_lr(lr)


if __name__ == "__main__":
67 changes: 67 additions & 0 deletions python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -0,0 +1,67 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid


class TestAdamaxAPI(unittest.TestCase):
def test_adamax_api_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.Adamax(
learning_rate=0.01,
parameters=linear.parameters(),
weight_decay=0.01)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()

def test_adamax_api(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
exe = fluid.Executor(place)
train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
data = fluid.data(name="data", shape=shape)
conv = fluid.layers.conv2d(data, 8, 3)
loss = paddle.mean(conv)
beta1 = 0.85
beta2 = 0.95
opt = paddle.optimizer.Adamax(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
opt.minimize(loss)

exe.run(startup)
data_np = np.random.random(shape).astype('float32')
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None


if __name__ == "__main__":
unittest.main()
81 changes: 81 additions & 0 deletions python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -0,0 +1,81 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle
import numpy as np
import paddle.fluid as fluid


class TestAdamWOp(unittest.TestCase):
def test_adamw_op_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.AdamW(
learning_rate=0.01,
parameters=linear.parameters(),
apply_decay_param_fun=lambda name: True,
weight_decay=0.01)
out = linear(a)
out.backward()
adam.step()
adam.clear_gradients()

def test_adamw_op_coverage(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
linear = paddle.nn.Linear(13, 5, dtype="float32")
adam = paddle.optimizer.AdamW(
learning_rate=0.0,
parameters=linear.parameters(),
apply_decay_param_fun=lambda name: True,
weight_decay=0.01)
assert (adam.__str__() is not None)

def test_adamw_op(self):
place = fluid.CPUPlace()
shape = [2, 3, 8, 8]
exe = fluid.Executor(place)
train_prog = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
data = fluid.data(name="data", shape=shape)
conv = fluid.layers.conv2d(data, 8, 3)
loss = paddle.mean(conv)

beta1 = fluid.layers.create_global_var(
shape=[1], value=0.85, dtype='float32', persistable=True)
beta2 = fluid.layers.create_global_var(
shape=[1], value=0.95, dtype='float32', persistable=True)
betas = [beta1, beta2]
opt = paddle.optimizer.AdamW(
learning_rate=1e-5,
beta1=beta1,
beta2=beta2,
weight_decay=0.01,
epsilon=1e-8)
opt.minimize(loss)

exe.run(startup)
data_np = np.random.random(shape).astype('float32')
rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
assert rets[0] is not None


if __name__ == "__main__":
unittest.main()
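
The apply_decay_param_fun argument exercised above is the hook for selective decoupled weight decay; judging by the tests, decay is applied only to parameters for which the callback returns True. A hedged sketch of a typical use, exempting bias parameters (the "bias" substring match is an illustrative assumption, not part of this commit, and depends on how parameters are actually named):

import numpy as np
import paddle

paddle.disable_static()
linear = paddle.nn.Linear(13, 5)

# Assumption: returning False from apply_decay_param_fun exempts a parameter
# from weight decay; here any parameter whose name contains "bias" is exempted.
opt = paddle.optimizer.AdamW(
    learning_rate=0.01,
    parameters=linear.parameters(),
    weight_decay=0.01,
    apply_decay_param_fun=lambda name: "bias" not in name)

x = paddle.to_tensor(np.random.rand(2, 13).astype("float32"))
loss = paddle.mean(linear(x))
loss.backward()
opt.step()
opt.clear_gradients()
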
@@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self):

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)

@@ -100,7 +100,7 @@ def test_a_sync_optimizer_pserver(self):

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)

@@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100}
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)

@@ -47,7 +47,7 @@ def test_gradient_merge_optimizer(self):

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)

2 changes: 1 addition & 1 deletion python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -165,7 +165,7 @@ def test(self):

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)

@@ -51,7 +51,7 @@ def test_amp_optimizer(self):
"custom_black_list": ['tanh'],
}

optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
