diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index a63dfd7b091f7..980f6594db15f 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -1,4 +1,5 @@
 // Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -178,6 +179,7 @@ message DistributedStrategy {
   optional bool without_graph_optimization = 30 [ default = false ];
   optional int32 fuse_grad_size_in_num = 31 [ default = 1 ];
   optional bool calc_comm_same_stream = 32 [ default = false ];
+  optional bool asp = 33 [ default = false ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index c4aa921346973..cdab0088aa28a 100644
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -424,6 +425,31 @@ def amp_configs(self, configs):
         check_configs_key(self.strategy.amp_configs, configs, "amp_configs")
         assign_configs_value(self.strategy.amp_configs, configs)
 
+    @property
+    def asp(self):
+        """
+        Indicating whether we are using automatic sparsity training
+        Default Value: False
+
+        Examples:
+
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.asp = True # by default this is false
+
+        """
+        return self.strategy.asp
+
+    @asp.setter
+    @is_strict_auto
+    def asp(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.asp = flag
+        else:
+            print("WARNING: asp should have value of bool type")
+
     @property
     def recompute(self):
         """
diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
index 1788e044fe885..739de0de57725 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +13,7 @@
 # See the License for the specific language governing permissions and
 
 from .amp_optimizer import AMPOptimizer
+from .asp_optimizer import ASPOptimizer
 from .recompute_optimizer import RecomputeOptimizer
 from .gradient_merge_optimizer import GradientMergeOptimizer
 from .graph_execution_optimizer import GraphExecutionOptimizer
diff --git a/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py
new file mode 100644
index 0000000000000..ea9cb1c62bfec
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from paddle.fluid.contrib.sparsity.asp import ASPHelper
+from .meta_optimizer_base import MetaOptimizerBase
+
+__all__ = []
+
+
+class ASPOptimizer(MetaOptimizerBase):
+    def __init__(self, optimizer):
+        super(ASPOptimizer, self).__init__(optimizer)
+        self.inner_opt = optimizer
+        # we do not allow meta optimizer to be inner optimizer currently
+        self.meta_optimizers_white_list = [
+            "AMPOptimizer", "LarsOptimizer", "LambOptimizer",
+            "GraphExecutionOptimizer", "RecomputeOptimizer",
+            "GradientMergeOptimizer"
+        ]
+        self.meta_optimizers_black_list = []
+
+    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
+                        user_defined_strategy):
+        super(ASPOptimizer, self)._set_basic_info(
+            loss, role_maker, user_defined_optimizer, user_defined_strategy)
+
+    def _can_apply(self):
+        if not self.role_maker._is_collective:
+            return False
+
+        if self.user_defined_strategy.asp:
+            return True
+
+        return False
+
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.asp = False
+
+    def _enable_strategy(self, dist_strategy, context):
+        dist_strategy.asp = True
+
+    def minimize_impl(self,
+                      loss,
+                      startup_program=None,
+                      parameter_list=None,
+                      no_grad_set=None):
+
+        optimize_ops, params_grads = ASPHelper._minimize(
+            self.inner_opt,
+            loss,
+            startup_program=startup_program,
+            parameter_list=parameter_list,
+            no_grad_set=no_grad_set)
+
+        return optimize_ops, params_grads
diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py
index fbabc73f37bce..77c61faf23dee 100644
--- a/python/paddle/fluid/contrib/sparsity/asp.py
+++ b/python/paddle/fluid/contrib/sparsity/asp.py
@@ -64,12 +64,15 @@ def decorate(optimizer):
     Examples:
         .. code-block:: python
 
+            import paddle
             import paddle.fluid as fluid
             from paddle.fluid.contrib import sparsity
 
             main_program = fluid.Program()
             startup_program = fluid.Program()
 
+            paddle.enable_static()
+
             with fluid.program_guard(main_program, startup_program):
                 input_data = fluid.layers.data(name='data', shape=[None, 128])
                 label = fluid.layers.data(name='label', shape=[None, 10])
@@ -78,17 +81,14 @@ def decorate(optimizer):
             loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))
 
             optimizer = fluid.optimizer.SGD(learning_rate=0.1)
             optimizer = sparsity.decorate(optimizer)
-            optimizer.minimize(loss, startup_program)
+            # if you want to do sparse training with Fleet, please replace the decorate() call above with:
+            # strategy = paddle.distributed.fleet.DistributedStrategy()
+            # strategy.asp = True
+            # optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
 
-            # When apply distributed training with Fleet
-            import paddle.distributed.fleet as fleet
-
-            optimizer = fluid.optimizer.SGD(learning_rate=0.1)
-            optimizer = sparsity.decorate(optimizer) # Need to be called before `fleet.distributed_optimizer`
-            optimizer = fleet.distributed_optimizer(optimizer)
-            optimizer.minimize(loss, startup_program)
+            optimizer.minimize(loss, startup_program)
 
     """
     return ASPHelper.decorate(optimizer)
 
@@ -126,23 +126,38 @@ def prune_model(place,
     Examples:
         .. code-block:: python
 
+            import paddle
             import paddle.fluid as fluid
+            import paddle.fluid.core as core
             from paddle.fluid.contrib import sparsity
 
+            paddle.enable_static()
+
             main_program = fluid.Program()
             startup_program = fluid.Program()
 
-            place = fluid.CUDAPlace(0)
+            place = paddle.CPUPlace()
+            if core.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
 
             with fluid.program_guard(main_program, startup_program):
                 input_data = fluid.layers.data(name='data', shape=[None, 128])
                 label = fluid.layers.data(name='label', shape=[None, 10])
-                hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None)
+                hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None, name="need_sparse")
+                hidden = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=32, act=None, name="need_dense")
                 prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None)
                 loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))
 
-                optimizer = decorate(fluid.optimizer.SGD(learning_rate=0.1))
-                optimizer.minimize(optimizer, loss, main_program, startup_program)
+                # Set up layers excluded from the ASP workflow.
+                # Please note, excluded_layers must be set before calling `optimizer.minimize()`.
+                sparsity.set_excluded_layers(main_program, ["need_dense"])
+
+                optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+                optimizer = fluid.contrib.mixed_precision.decorator.decorate(optimizer)
+                # Call sparsity.decorate() to wrap minimize() in optimizer, which
+                # will insert necessary masking operations for the ASP workflow.
+                optimizer = sparsity.decorate(optimizer)
+                optimizer.minimize(loss, startup_program)
 
             exe = fluid.Executor(place)
             exe.run(startup_program)
diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt
index f71e04c09aa38..9939a857f9ef3 100644
--- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt
@@ -1,6 +1,14 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
+list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp")
+list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_amp")
+
 foreach(TEST_OP ${TEST_OPS})
   py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
+
+if(WITH_DISTRIBUTE)
+  py_test_modules(test_fleet_with_asp MODULES test_fleet_with_asp ENVS ${dist_ENVS})
+  py_test_modules(test_fleet_with_asp_amp MODULES test_fleet_with_asp_amp ENVS ${dist_ENVS})
+endif()
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py
new file mode 100644
index 0000000000000..34d17f570e427
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import os
+from paddle.fluid.contrib import sparsity
+from paddle.fluid.contrib.sparsity.asp import ASPHelper
+import numpy as np
+cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
+if cuda_visible_devices is None or cuda_visible_devices == "":
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+else:
+    os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0]
+
+paddle.enable_static()
+
+
+class TestFleetWithASP(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
+        os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            input_x = paddle.static.data(
+                name="x", shape=[-1, 32], dtype='float32')
+            input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
+
+            fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
+            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
+            cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
+            avg_cost = paddle.mean(x=cost)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.asp = True
+        return avg_cost, strategy, input_x, input_y
+
+    def test_with_asp(self):
+        fleet.init(is_collective=True)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy, input_x, input_y = self.net(train_prog,
+                                                        startup_prog)
+
+        with fluid.program_guard(train_prog, startup_prog):
+            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+            optimizer = fleet.distributed_optimizer(
+                optimizer, strategy=strategy)
+            optimizer.minimize(avg_cost)
+
+        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
+        exe.run(startup_prog)
+
+        sparsity.prune_model(place, train_prog)
+
+        data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
+        exe.run(train_prog, feed=feeder.feed([data]))
+
+        for param in train_prog.global_block().all_parameters():
+            if ASPHelper._is_supported_layer(train_prog, param.name):
+                mat = np.array(fluid.global_scope().find_var(param.name)
+                               .get_tensor())
+                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py
new file mode 100644
index 0000000000000..c4074b2ae7a3c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.distributed.fleet as fleet
+import paddle.distributed.fleet.base.role_maker as role_maker
+import unittest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import os
+from paddle.fluid.contrib import sparsity
+from paddle.fluid.contrib.sparsity.asp import ASPHelper
+import numpy as np
+cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
+if cuda_visible_devices is None or cuda_visible_devices == "":
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+else:
+    os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0]
+
+paddle.enable_static()
+
+
+class TestFleetWithASP(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
+        os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def net(self, main_prog, startup_prog):
+        with fluid.program_guard(main_prog, startup_prog):
+            input_x = paddle.static.data(
+                name="x", shape=[-1, 32], dtype='float32')
+            input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
+
+            fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
+            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
+            cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
+            avg_cost = paddle.mean(x=cost)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.asp = True
+        return avg_cost, strategy, input_x, input_y
+
+    def test_with_asp_and_amp(self):
+        fleet.init(is_collective=True)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy, input_x, input_y = self.net(train_prog,
+                                                        startup_prog)
+        strategy.amp = True
+
+        with fluid.program_guard(train_prog, startup_prog):
+            optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            optimizer = fleet.distributed_optimizer(
+                optimizer, strategy=strategy)
+            optimizer.minimize(avg_cost)
+
+        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
+        exe.run(startup_prog)
+
+        optimizer.amp_init(place)
+
+        sparsity.prune_model(place, train_prog)
+
+        data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
+        exe.run(train_prog, feed=feeder.feed([data]))
+
+        for param in train_prog.global_block().all_parameters():
+            if ASPHelper._is_supported_layer(train_prog, param.name):
+                mat = np.array(fluid.global_scope().find_var(param.name)
+                               .get_tensor())
+                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+
+    def test_with_asp_and_pure_fp16(self):
+        fleet.init(is_collective=True)
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        with paddle.static.amp.fp16_guard():
+            avg_cost, strategy, \
+                input_x, input_y = self.net(train_prog,
+                                            startup_prog)
+        strategy.amp = True
+        strategy.amp_configs = {'use_pure_fp16': True}
+
+        with fluid.program_guard(train_prog, startup_prog):
+            with paddle.static.amp.fp16_guard():
+                optimizer = paddle.optimizer.Momentum(
+                    learning_rate=0.01, multi_precision=True)
+                optimizer = fleet.distributed_optimizer(
+                    optimizer, strategy=strategy)
+                optimizer.minimize(avg_cost)
+
+        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
+        exe.run(startup_prog)
+
+        optimizer.amp_init(place)
+
+        sparsity.prune_model(place, train_prog)
+
+        data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
+        exe.run(train_prog, feed=feeder.feed([data]))
+
+        for param in train_prog.global_block().all_parameters():
+            if ASPHelper._is_supported_layer(train_prog, param.name):
+                mat = np.array(fluid.global_scope().find_var(param.name)
+                               .get_tensor())
+                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))
+
+
+if __name__ == "__main__":
+    unittest.main()
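
For reference, the end-to-end flow these changes enable looks roughly as follows. This is a minimal sketch distilled from the unit tests above, not an additional file in the patch; the layer sizes, learning rate, and random feed data are illustrative only.

    import numpy as np
    import paddle
    import paddle.fluid as fluid
    import paddle.distributed.fleet as fleet
    from paddle.fluid.contrib import sparsity

    paddle.enable_static()
    fleet.init(is_collective=True)

    train_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
        y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
        fc_1 = fluid.layers.fc(input=x, size=64, act='tanh')
        prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
        avg_cost = paddle.mean(fluid.layers.cross_entropy(input=prediction, label=y))

        # Setting strategy.asp makes fleet.distributed_optimizer() select the new
        # ASPOptimizer, whose minimize_impl() routes through ASPHelper._minimize()
        # so the ASP mask variables and ops are inserted into the programs.
        strategy = fleet.DistributedStrategy()
        strategy.asp = True
        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
        optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # Prune the supported weights to the 2:4 (n=2, m=4) sparse pattern before training.
    sparsity.prune_model(place, train_prog)

    data = {"x": np.random.randn(64, 32).astype('float32'),
            "y": np.random.randint(2, size=(64, 1)).astype('int64')}
    exe.run(train_prog, feed=data)

After this, every parameter that ASPHelper._is_supported_layer() recognizes should satisfy sparsity.check_sparsity(mat.T, n=2, m=4), which is exactly what the tests above assert.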