From 85d88b8d08495663d5fa7631de004bdadb103c4e Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Mon, 9 Aug 2021 22:35:04 +0800 Subject: [PATCH 01/10] Add no_sync in data parallel for dynamic graph --- paddle/fluid/imperative/reducer.cc | 10 +- paddle/fluid/imperative/reducer.h | 1 + python/paddle/fluid/dygraph/parallel.py | 14 +- .../unittests/parallel_dygraph_no_sync.py | 163 +++++++++++++++++ .../parallel_dygraph_no_sync_control_flow.py | 164 +++++++++++++++++ .../parallel_dygraph_no_sync_unused_params.py | 167 ++++++++++++++++++ ...t_parallel_dygraph_dataparallel_no_sync.py | 100 +++++++++++ 7 files changed, 616 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 0f6676ed48f34..f5ef439520187 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -527,6 +527,7 @@ void Reducer::TraverseBackwardGraph( void Reducer::PrepareForBackward( const std::vector> &outputs) { VLOG(3) << "after forward, then reset count for backward."; + grad_need_hooks_ = true; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](Group &group) { group.pending_ = group.variable_indices_.size(); @@ -599,6 +600,10 @@ void Reducer::AddDistHook(size_t var_index) { "than %d, but it is %d", variable_locators_.size(), var_index)); + if (!grad_need_hooks_) { + return; + } + VLOG(3) << "Var[" << var_index << "] [" << vars_[var_index]->GradVarBase()->Name() << "] arrived and triggered disthook"; @@ -692,8 +697,8 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { auto var_base = vars_[var_index]->GradVarBase(); auto tensor = var_base->MutableVar()->GetMutable(); - TensorCopy(*tensor, place_, *dev_ctx, &group_tensor); - group_tensor.Resize({static_cast(length)}); + group_tensor.ShareDataWith(*tensor).Resize( + {static_cast(length)}); } else { group_tensor.Resize({static_cast(length)}); operators::math::set_constant(*dev_ctx, &group_tensor, 0.0); @@ -942,6 +947,7 @@ bool Reducer::HasGrad(size_t var_index) { void Reducer::FinalizeBackward() { groups_need_finalize_ = false; + grad_need_hooks_ = false; #ifdef PADDLE_WITH_XPU_BKCL { std::unique_lock lock(mutex_); diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 8392ab2c704d5..f86be6bbdff23 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -197,6 +197,7 @@ class Reducer { std::unordered_map node_deps_; std::unordered_map var_index_map_; std::vector unused_vars_; + bool grad_need_hooks_{false}; bool has_marked_unused_vars_{false}; bool find_unused_vars_each_step_{false}; bool find_unused_vars_once_{true}; diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index a905e1dba8467..d494a810ca91c 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -19,6 +19,7 @@ from collections import OrderedDict import itertools import warnings +from contextlib import contextmanager import paddle from paddle.fluid import core @@ -483,6 +484,7 @@ def __init__(self, self._layers = layers self.find_unused_parameters = 
find_unused_parameters + self.require_backward_grad_sync = True # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by @@ -576,9 +578,19 @@ def _find_varbase(self, obj): return itertools.chain(*map(self._find_varbase, obj.values())) return [] + @contextmanager + def no_sync(self): + old_require_backward_grad_sync = self.require_backward_grad_sync + self.require_backward_grad_sync = False + try: + yield + finally: + self.require_backward_grad_sync = old_require_backward_grad_sync + def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) - if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad: + if self._strategy.nranks > 1 and framework._dygraph_tracer( + )._has_grad and self.require_backward_grad_sync: self._reducer.prepare_for_backward( list(self._find_varbase(outputs))) return outputs diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py new file mode 100644 index 0000000000000..e876a82078559 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py @@ -0,0 +1,163 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import contextlib +import unittest +import numpy as np +import six +import pickle +import random + +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.dygraph.nn import Linear +from test_dist_base import print_to_err, print_to_out, runtime_main, TestParallelDyGraphRunnerBase + +seed = 90 +RUN_STEP = 50 +batch_size = 4 +batch_num = 1000 + + +class SimpleNet(fluid.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.net_a = Linear(input_dim=10, output_dim=20) + self.net_b = Linear(input_dim=20, output_dim=5) + self.net_c = Linear(input_dim=5, output_dim=10) + + def forward(self, x): + x = self.net_a(x) + x = self.net_b(x) + x = self.net_c(x) + return x + + +class TestNoSync(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = paddle.to_tensor(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + def run_trainer(self, args): + if fluid.core.is_compiled_with_cuda(): + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + else: + assert ("Only support CUDAPlace for now.") + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + model, train_reader, opt = self.get_model() + + dist.init_parallel_env() + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if not args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=False) + else: + model = paddle.DataParallel(model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") + print_to_err(type(self).__name__, "begin to run dygraph training") + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + if step_id % 10 != 0: + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: + loss = self.run_one_loop(model, opt, data) + loss.backward() + opt.minimize(loss) + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + + if not args.accumulate_gradient: + model.clear_gradients() + print_to_out(out_losses) + + def run_trainer_with_spawn(self, args): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + args.trainer_id = paddle.distributed.get_rank() + + dist.init_parallel_env() + model, train_reader, opt = self.get_model() + if args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=True) + else: + model = paddle.DataParallel(model, find_unused_parameters=False) + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + if step_id % 10 != 0: + with model.no_sync(): + loss = self.run_one_loop(model, opt, 
data) + loss.backward() + else: + loss = self.run_one_loop(model, opt, data) + loss.backward() + opt.minimize(loss) + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + model.clear_gradients() + print_to_out(out_losses) + return out_losses + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +if __name__ == "__main__": + runtime_main(TestNoSync) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py new file mode 100644 index 0000000000000..fc1de79559566 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py @@ -0,0 +1,164 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import contextlib +import unittest +import numpy as np +import six +import pickle +import random + +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.dygraph.nn import Linear +from test_dist_base import print_to_err, print_to_out, runtime_main, TestParallelDyGraphRunnerBase + +seed = 90 +RUN_STEP = 50 +batch_size = 4 +batch_num = 1000 + + +class SimpleNetControlFlow(fluid.Layer): + def __init__(self): + super(SimpleNetControlFlow, self).__init__() + self.net_a = Linear(input_dim=10, output_dim=20) + self.net_b = Linear(input_dim=20, output_dim=5) + self.net_c = Linear(input_dim=5, output_dim=10) + self.step = 0 + + def forward(self, x): + self.step = self.step + 1 + x = self.net_a(x) + if self.step > 20: + x.stop_gradient = True + x = self.net_b(x) + x = self.net_c(x) + return x + + +class TestNoSyncControlFlow(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNetControlFlow() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = paddle.to_tensor(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + def run_trainer(self, args): + if fluid.core.is_compiled_with_cuda(): + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + else: + assert ("Only support CUDAPlace for now.") + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + model, train_reader, opt = self.get_model() + + dist.init_parallel_env() + print_to_err( + 
type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if not args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=False) + else: + model = paddle.DataParallel(model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") + print_to_err(type(self).__name__, "begin to run dygraph training") + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + if step_id % 10 != 0: + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: + loss = self.run_one_loop(model, opt, data) + loss.backward() + opt.minimize(loss) + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + + if not args.accumulate_gradient: + model.clear_gradients() + print_to_out(out_losses) + + def run_trainer_with_spawn(self, args): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + args.trainer_id = paddle.distributed.get_rank() + + dist.init_parallel_env() + model, train_reader, opt = self.get_model() + model = paddle.DataParallel(model, find_unused_parameters=True) + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + if step_id % 10 != 0: + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: + loss = self.run_one_loop(model, opt, data) + loss.backward() + opt.minimize(loss) + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + model.clear_gradients() + print_to_out(out_losses) + return out_losses + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +if __name__ == "__main__": + runtime_main(TestNoSyncControlFlow) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py new file mode 100644 index 0000000000000..7a4b2357c7058 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py @@ -0,0 +1,167 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import contextlib +import unittest +import numpy as np +import six +import pickle +import random + +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.dygraph.nn import Linear +from test_dist_base import print_to_err, print_to_out, runtime_main, TestParallelDyGraphRunnerBase + +seed = 90 +RUN_STEP = 50 +batch_size = 4 +batch_num = 1000 + + +class SimpleNetUnusedParam(fluid.Layer): + def __init__(self): + super(SimpleNetUnusedParam, self).__init__() + self.net_a = Linear(input_dim=10, output_dim=20) + self.net_b = Linear(input_dim=20, output_dim=5) + self.net_c = Linear(input_dim=5, output_dim=10) + + self.net_d = Linear(input_dim=20, output_dim=10) + + def forward(self, x): + x = self.net_a(x) + x.stop_gradient = True + x = self.net_b(x) + x = self.net_c(x) + return x + + +class TestNoSyncUnusedParam(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNetUnusedParam() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = paddle.to_tensor(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + def run_trainer(self, args): + if fluid.core.is_compiled_with_cuda(): + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + else: + assert ("Only support CUDAPlace for now.") + + with fluid.dygraph.guard(place): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + model, train_reader, opt = self.get_model() + + dist.init_parallel_env() + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if not args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=False) + else: + model = paddle.DataParallel(model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") + print_to_err(type(self).__name__, "begin to run dygraph training") + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + if step_id % 10 != 0: + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: + loss = self.run_one_loop(model, opt, data) + loss.backward() + opt.minimize(loss) + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + + if not args.accumulate_gradient: + model.clear_gradients() + print_to_out(out_losses) + + def run_trainer_with_spawn(self, args): + paddle.disable_static() + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed(seed) + args.trainer_id = paddle.distributed.get_rank() + + paddle.distributed.init_parallel_env() + model, train_reader, opt = self.get_model() + if args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=True) + else: + model = paddle.DataParallel(model, find_unused_parameters=False) + + out_losses = [] + for step_id, data in 
enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + if step_id % 10 != 0: + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: + loss = self.run_one_loop(model, opt, data) + loss.backward() + opt.minimize(loss) + print_to_err( + type(self).__name__, + "loss at step %d: %f" % (step_id, loss.numpy())) + out_losses.append(loss.numpy()) + model.clear_gradients() + print_to_out(out_losses) + return out_losses + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +if __name__ == "__main__": + runtime_main(TestNoSyncUnusedParam) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py new file mode 100644 index 0000000000000..1488b53eb0a52 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_no_sync import TestNoSync +from parallel_dygraph_no_sync_unused_params import TestNoSyncUnusedParam +from parallel_dygraph_no_sync_control_flow import TestNoSyncControlFlow + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphNoSync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._find_unused_parameters = False + + def test_no_sync(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_no_sync.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoSyncUnusedParam(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._find_unused_parameters = True + + def test_no_sync(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_no_sync_unused_params.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoSyncControlFlow(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._find_unused_parameters = True + + def test_no_sync(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_no_sync_control_flow.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoSyncSpawn(TestDistSpawnRunner): + def test_no_sync_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + 
self.check_dist_result_with_spawn(test_class=TestNoSync, delta=1e-5) + + +class TestParallelDygraphNoSyncUnusedParamSpawn(TestDistSpawnRunner): + def test_no_sync_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestNoSyncUnusedParam, delta=1e-5) + + +class TestParallelDygraphNoSyncControlFlowSpawn(TestDistSpawnRunner): + def test_no_sync_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestNoSyncControlFlow, delta=1e-5) + + +if __name__ == "__main__": + unittest.main() From 139bf61ac6eb8975fa2730aae7dad1b2ac1c7041 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Mon, 9 Aug 2021 23:21:39 +0800 Subject: [PATCH 02/10] modify UT of no_sync --- .../fluid/tests/unittests/parallel_dygraph_no_sync.py | 6 +++--- .../unittests/parallel_dygraph_no_sync_control_flow.py | 6 +++--- .../unittests/parallel_dygraph_no_sync_unused_params.py | 6 +++--- .../unittests/test_parallel_dygraph_dataparallel_no_sync.py | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py index e876a82078559..f23664aa88d0f 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py @@ -31,7 +31,7 @@ from test_dist_base import print_to_err, print_to_out, runtime_main, TestParallelDyGraphRunnerBase seed = 90 -RUN_STEP = 50 +RUN_STEP = 20 batch_size = 4 batch_num = 1000 @@ -97,7 +97,7 @@ def run_trainer(self, args): data = self._get_data(data, args) if step_id == RUN_STEP: break - if step_id % 10 != 0: + if step_id % 3 != 0: with model.no_sync(): loss = self.run_one_loop(model, opt, data) loss.backward() @@ -133,7 +133,7 @@ def run_trainer_with_spawn(self, args): data = self._get_data(data, args) if step_id == RUN_STEP: break - if step_id % 10 != 0: + if step_id % 3 != 0: with model.no_sync(): loss = self.run_one_loop(model, opt, data) loss.backward() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py index fc1de79559566..2a65a38dfeb29 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py @@ -31,7 +31,7 @@ from test_dist_base import print_to_err, print_to_out, runtime_main, TestParallelDyGraphRunnerBase seed = 90 -RUN_STEP = 50 +RUN_STEP = 20 batch_size = 4 batch_num = 1000 @@ -101,7 +101,7 @@ def run_trainer(self, args): data = self._get_data(data, args) if step_id == RUN_STEP: break - if step_id % 10 != 0: + if step_id % 3 != 0: with model.no_sync(): loss = self.run_one_loop(model, opt, data) loss.backward() @@ -134,7 +134,7 @@ def run_trainer_with_spawn(self, args): data = self._get_data(data, args) if step_id == RUN_STEP: break - if step_id % 10 != 0: + if step_id % 3 != 0: with model.no_sync(): loss = self.run_one_loop(model, opt, data) loss.backward() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py index 7a4b2357c7058..1ee6ed528ac27 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py +++ 
b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py @@ -31,7 +31,7 @@ from test_dist_base import print_to_err, print_to_out, runtime_main, TestParallelDyGraphRunnerBase seed = 90 -RUN_STEP = 50 +RUN_STEP = 20 batch_size = 4 batch_num = 1000 @@ -100,7 +100,7 @@ def run_trainer(self, args): data = self._get_data(data, args) if step_id == RUN_STEP: break - if step_id % 10 != 0: + if step_id % 3 != 0: with model.no_sync(): loss = self.run_one_loop(model, opt, data) loss.backward() @@ -137,7 +137,7 @@ def run_trainer_with_spawn(self, args): data = self._get_data(data, args) if step_id == RUN_STEP: break - if step_id % 10 != 0: + if step_id % 3 != 0: with model.no_sync(): loss = self.run_one_loop(model, opt, data) loss.backward() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py index 1488b53eb0a52..362b5219372ad 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py @@ -51,7 +51,7 @@ def _setup_config(self): self._dygraph = True self._find_unused_parameters = True - def test_no_sync(self): + def test_no_sync_ununsed_param(self): if fluid.core.is_compiled_with_cuda(): self.check_with_place( "parallel_dygraph_no_sync_unused_params.py", @@ -67,7 +67,7 @@ def _setup_config(self): self._dygraph = True self._find_unused_parameters = True - def test_no_sync(self): + def test_no_sync_control_flow(self): if fluid.core.is_compiled_with_cuda(): self.check_with_place( "parallel_dygraph_no_sync_control_flow.py", From 07834f1363bb6f7bcb7b75e1efab893430e8efd6 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 10 Aug 2021 00:42:14 +0800 Subject: [PATCH 03/10] delete test_parallel_dygraph_dataparallel_no_sync.py --- ...t_parallel_dygraph_dataparallel_no_sync.py | 100 ------------------ 1 file changed, 100 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py deleted file mode 100644 index 362b5219372ad..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_no_sync.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import os -import sys -import unittest - -import paddle.fluid as fluid -from test_dist_base import TestDistBase -from spawn_runner_base import TestDistSpawnRunner -from parallel_dygraph_no_sync import TestNoSync -from parallel_dygraph_no_sync_unused_params import TestNoSyncUnusedParam -from parallel_dygraph_no_sync_control_flow import TestNoSyncControlFlow - -flag_name = os.path.splitext(__file__)[0] - - -class TestParallelDygraphNoSync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._dygraph = True - self._find_unused_parameters = False - - def test_no_sync(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_no_sync.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphNoSyncUnusedParam(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._dygraph = True - self._find_unused_parameters = True - - def test_no_sync_ununsed_param(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_no_sync_unused_params.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphNoSyncControlFlow(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._nccl2_mode = True - self._dygraph = True - self._find_unused_parameters = True - - def test_no_sync_control_flow(self): - if fluid.core.is_compiled_with_cuda(): - self.check_with_place( - "parallel_dygraph_no_sync_control_flow.py", - delta=1e-5, - check_error_log=True, - log_name=flag_name) - - -class TestParallelDygraphNoSyncSpawn(TestDistSpawnRunner): - def test_no_sync_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn(test_class=TestNoSync, delta=1e-5) - - -class TestParallelDygraphNoSyncUnusedParamSpawn(TestDistSpawnRunner): - def test_no_sync_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestNoSyncUnusedParam, delta=1e-5) - - -class TestParallelDygraphNoSyncControlFlowSpawn(TestDistSpawnRunner): - def test_no_sync_with_spawn(self): - if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): - self.check_dist_result_with_spawn( - test_class=TestNoSyncControlFlow, delta=1e-5) - - -if __name__ == "__main__": - unittest.main() From e97105198bd56765726b6d5e04af5c76d8eb83b3 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 10 Aug 2021 00:43:08 +0800 Subject: [PATCH 04/10] add test_parallel_dygraph_no_sync.py --- .../test_parallel_dygraph_no_sync.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py new file mode 100644 index 0000000000000..362b5219372ad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_no_sync import TestNoSync +from parallel_dygraph_no_sync_unused_params import TestNoSyncUnusedParam +from parallel_dygraph_no_sync_control_flow import TestNoSyncControlFlow + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphNoSync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._find_unused_parameters = False + + def test_no_sync(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_no_sync.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoSyncUnusedParam(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._find_unused_parameters = True + + def test_no_sync_ununsed_param(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_no_sync_unused_params.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoSyncControlFlow(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._find_unused_parameters = True + + def test_no_sync_control_flow(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_no_sync_control_flow.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoSyncSpawn(TestDistSpawnRunner): + def test_no_sync_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn(test_class=TestNoSync, delta=1e-5) + + +class TestParallelDygraphNoSyncUnusedParamSpawn(TestDistSpawnRunner): + def test_no_sync_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestNoSyncUnusedParam, delta=1e-5) + + +class TestParallelDygraphNoSyncControlFlowSpawn(TestDistSpawnRunner): + def test_no_sync_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestNoSyncControlFlow, delta=1e-5) + + +if __name__ == "__main__": + unittest.main() From c3ac5ecae1aaefb220f382efad89bc1111d8379e Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 10 Aug 2021 11:23:18 +0800 Subject: [PATCH 05/10] modify run_trainer_with_spawn in UTs --- .../unittests/parallel_dygraph_no_sync.py | 48 +++++++++++-------- .../parallel_dygraph_no_sync_control_flow.py | 44 ++++++++++------- .../parallel_dygraph_no_sync_unused_params.py | 48 +++++++++++-------- 3 files changed, 85 insertions(+), 55 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py index f23664aa88d0f..2f12c1562fa54 100644 --- 
a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py @@ -81,24 +81,28 @@ def run_trainer(self, args): random.seed(seed) model, train_reader, opt = self.get_model() - dist.init_parallel_env() - print_to_err( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") - if not args.find_unused_parameters: - model = paddle.DataParallel(model, find_unused_parameters=False) - else: - model = paddle.DataParallel(model, find_unused_parameters=True) - print_to_err(type(self).__name__, "model built in dygraph") - print_to_err(type(self).__name__, "begin to run dygraph training") - + if args.update_method == "nccl2": + dist.init_parallel_env() + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if not args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=False) + else: + model = paddle.DataParallel(model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] + print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): data = self._get_data(data, args) if step_id == RUN_STEP: break if step_id % 3 != 0: - with model.no_sync(): + if args.update_method == "nccl2": + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: loss = self.run_one_loop(model, opt, data) loss.backward() else: @@ -119,14 +123,16 @@ def run_trainer_with_spawn(self, args): fluid.default_main_program().random_seed = seed np.random.seed(seed) random.seed(seed) - args.trainer_id = paddle.distributed.get_rank() + args.trainer_id = dist.get_rank() - dist.init_parallel_env() + if args.update_method == "nccl2": + dist.init_parallel_env() model, train_reader, opt = self.get_model() - if args.find_unused_parameters: - model = paddle.DataParallel(model, find_unused_parameters=True) - else: - model = paddle.DataParallel(model, find_unused_parameters=False) + if args.update_method == "nccl2": + if args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=True) + else: + model = paddle.DataParallel(model, find_unused_parameters=False) out_losses = [] for step_id, data in enumerate(train_reader()): @@ -134,7 +140,11 @@ def run_trainer_with_spawn(self, args): if step_id == RUN_STEP: break if step_id % 3 != 0: - with model.no_sync(): + if args.update_method == "nccl2": + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: loss = self.run_one_loop(model, opt, data) loss.backward() else: diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py index 2a65a38dfeb29..cb505ba36ed29 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py @@ -47,7 +47,7 @@ def __init__(self): def forward(self, x): self.step = self.step + 1 x = self.net_a(x) - if self.step > 20: + if self.step > 10: x.stop_gradient = True x = self.net_b(x) x = self.net_c(x) @@ -85,24 +85,28 @@ def run_trainer(self, args): random.seed(seed) model, train_reader, opt = self.get_model() - dist.init_parallel_env() - print_to_err( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") - if not args.find_unused_parameters: - model = paddle.DataParallel(model, 
find_unused_parameters=False) - else: - model = paddle.DataParallel(model, find_unused_parameters=True) - print_to_err(type(self).__name__, "model built in dygraph") - print_to_err(type(self).__name__, "begin to run dygraph training") - + if args.update_method == "nccl2": + dist.init_parallel_env() + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if not args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=False) + else: + model = paddle.DataParallel(model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] + print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): data = self._get_data(data, args) if step_id == RUN_STEP: break if step_id % 3 != 0: - with model.no_sync(): + if args.update_method == "nccl2": + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: loss = self.run_one_loop(model, opt, data) loss.backward() else: @@ -123,11 +127,13 @@ def run_trainer_with_spawn(self, args): fluid.default_main_program().random_seed = seed np.random.seed(seed) random.seed(seed) - args.trainer_id = paddle.distributed.get_rank() + args.trainer_id = dist.get_rank() - dist.init_parallel_env() + if args.update_method == "nccl2": + dist.init_parallel_env() model, train_reader, opt = self.get_model() - model = paddle.DataParallel(model, find_unused_parameters=True) + if args.update_method == "nccl2": + model = paddle.DataParallel(model, find_unused_parameters=True) out_losses = [] for step_id, data in enumerate(train_reader()): @@ -135,7 +141,11 @@ def run_trainer_with_spawn(self, args): if step_id == RUN_STEP: break if step_id % 3 != 0: - with model.no_sync(): + if args.update_method == "nccl2": + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: loss = self.run_one_loop(model, opt, data) loss.backward() else: diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py index 1ee6ed528ac27..d45e0be5c29fb 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py @@ -84,24 +84,28 @@ def run_trainer(self, args): random.seed(seed) model, train_reader, opt = self.get_model() - dist.init_parallel_env() - print_to_err( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") - if not args.find_unused_parameters: - model = paddle.DataParallel(model, find_unused_parameters=False) - else: - model = paddle.DataParallel(model, find_unused_parameters=True) - print_to_err(type(self).__name__, "model built in dygraph") - print_to_err(type(self).__name__, "begin to run dygraph training") - + if args.update_method == "nccl2": + dist.init_parallel_env() + print_to_err( + type(self).__name__, + "begin to prepare context in dygraph with nccl2") + if not args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=False) + else: + model = paddle.DataParallel(model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] + print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): data = self._get_data(data, args) if step_id == RUN_STEP: break if step_id % 3 != 0: - with 
model.no_sync(): + if args.update_method == "nccl2": + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: loss = self.run_one_loop(model, opt, data) loss.backward() else: @@ -123,14 +127,16 @@ def run_trainer_with_spawn(self, args): fluid.default_main_program().random_seed = seed np.random.seed(seed) random.seed(seed) - args.trainer_id = paddle.distributed.get_rank() + args.trainer_id = dist.get_rank() - paddle.distributed.init_parallel_env() + if args.update_method == "nccl2": + dist.init_parallel_env() model, train_reader, opt = self.get_model() - if args.find_unused_parameters: - model = paddle.DataParallel(model, find_unused_parameters=True) - else: - model = paddle.DataParallel(model, find_unused_parameters=False) + if args.update_method == "nccl2": + if args.find_unused_parameters: + model = paddle.DataParallel(model, find_unused_parameters=True) + else: + model = paddle.DataParallel(model, find_unused_parameters=False) out_losses = [] for step_id, data in enumerate(train_reader()): @@ -138,7 +144,11 @@ def run_trainer_with_spawn(self, args): if step_id == RUN_STEP: break if step_id % 3 != 0: - with model.no_sync(): + if args.update_method == "nccl2": + with model.no_sync(): + loss = self.run_one_loop(model, opt, data) + loss.backward() + else: loss = self.run_one_loop(model, opt, data) loss.backward() else: From 4a5cc9aa62e4d31fea48172a2c875c60b3e8a7fb Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 10 Aug 2021 12:24:20 +0800 Subject: [PATCH 06/10] Add UT of complex control flow in no_sync --- paddle/fluid/imperative/reducer.cc | 1 + .../fluid/tests/unittests/CMakeLists.txt | 6 + .../unittests/parallel_dygraph_no_sync.py | 10 +- .../parallel_dygraph_no_sync_control_flow.py | 12 +- ...parallel_dygraph_no_sync_gradient_check.py | 138 ++++++++++++++++++ .../parallel_dygraph_no_sync_unused_params.py | 12 +- ...parallel_dygraph_no_sync_gradient_check.py | 29 ++++ 7 files changed, 194 insertions(+), 14 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index f5ef439520187..56f92cc31e445 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -912,6 +912,7 @@ void Reducer::ProcessUnusedDenseVars() { // 3. create grad var base or get grad var base auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); + grad_var_base_tmp->SharedVar()->SetIsEmpty(false); // 4. 
set grad tensor auto *dest_grad_tensor = diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 007221ca4f9ca..d6d6992ed91b9 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -23,6 +23,8 @@ list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync_gradient_check) list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) @@ -186,6 +188,8 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_no_sync_gradient_check) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel) @@ -902,6 +906,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) + set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py index 2f12c1562fa54..f9d6c697b2bb3 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py @@ -87,9 +87,11 @@ def run_trainer(self, args): type(self).__name__, "begin to prepare context in dygraph with nccl2") if not args.find_unused_parameters: - model = paddle.DataParallel(model, find_unused_parameters=False) + model = paddle.DataParallel( + model, find_unused_parameters=False) else: - model = paddle.DataParallel(model, find_unused_parameters=True) + model = paddle.DataParallel( + model, find_unused_parameters=True) print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") @@ -126,9 +128,9 @@ def run_trainer_with_spawn(self, args): args.trainer_id = dist.get_rank() if args.update_method == "nccl2": - dist.init_parallel_env() + dist.init_parallel_env() model, train_reader, opt = self.get_model() - if args.update_method == "nccl2": + if args.update_method == "nccl2": if args.find_unused_parameters: model = paddle.DataParallel(model, find_unused_parameters=True) else: diff --git 
a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py index cb505ba36ed29..392f75b35df5d 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py @@ -91,10 +91,12 @@ def run_trainer(self, args): type(self).__name__, "begin to prepare context in dygraph with nccl2") if not args.find_unused_parameters: - model = paddle.DataParallel(model, find_unused_parameters=False) + model = paddle.DataParallel( + model, find_unused_parameters=False) else: - model = paddle.DataParallel(model, find_unused_parameters=True) - print_to_err(type(self).__name__, "model built in dygraph") + model = paddle.DataParallel( + model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): @@ -130,9 +132,9 @@ def run_trainer_with_spawn(self, args): args.trainer_id = dist.get_rank() if args.update_method == "nccl2": - dist.init_parallel_env() + dist.init_parallel_env() model, train_reader, opt = self.get_model() - if args.update_method == "nccl2": + if args.update_method == "nccl2": model = paddle.DataParallel(model, find_unused_parameters=True) out_losses = [] diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py new file mode 100644 index 0000000000000..642ea14d8a87d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_gradient_check.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear + +paddle.seed(1024) +np.random.seed(2021) + +batch = 1 +in_dim = 10 +out_dim = 20 + + +class SimpleNet(fluid.Layer): + def __init__(self, train_id): + super(SimpleNet, self).__init__() + self.w1 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.w2 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.share_net = Linear(out_dim, 1) + + self.unused_param = self.create_parameter( + shape=[out_dim, in_dim], dtype="float32") + + # just for test sync_params_buffers + self.register_buffer("queue", paddle.randn([10, 5])) + self.queue = paddle.nn.functional.normalize(self.queue, axis=0) + self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) + + self.trainer_id = train_id + + def forward(self, x): + is_use = (paddle.equal_all( + x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and + self.trainer_id == 1) + + if is_use: + tmp = paddle.matmul(x, self.w1) + else: + tmp = paddle.matmul(x, self.w2) + + return self.share_net(tmp) + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + self.trainer_id = dist.get_rank() + dist.init_parallel_env() + + model_a = SimpleNet(self.trainer_id) + model_b = SimpleNet(self.trainer_id) + + state_dict = model_a.state_dict() + model_b.set_state_dict(state_dict) + + model_a = paddle.DataParallel(model_a, find_unused_parameters=True) + model_b = paddle.DataParallel(model_b, find_unused_parameters=True) + + ones_input = paddle.ones(shape=(batch, in_dim)) + ones_input.stop_gradient = True + + for step_id in range(1, 31): + random_input = paddle.rand(shape=(batch, in_dim)) + random_input.stop_gradient = True + + if step_id % 5 != 0: + with model_a.no_sync(): + self.dp_layer(step_id, model_a, model_b, random_input, + ones_input) + else: + self.dp_layer(step_id, model_a, model_b, random_input, + ones_input) + + self.check_gradient(model_a.parameters()) + self.check_gradient(model_b.parameters()) + + self.check_acc(model_a._layers.w1.grad, model_b._layers.w1.grad) + self.check_acc(model_a._layers.w2.grad, model_b._layers.w2.grad) + + model_a.clear_gradients() + model_b.clear_gradients() + + def dp_layer(self, step_id, model_a, model_b, random_input, ones_input): + if step_id % 2 == 0: + out_a = model_a(random_input) + out_b = model_b(random_input) + else: + out_a = model_a(ones_input) + out_b = model_b(ones_input) + out_a.sum().backward() + out_b.sum().backward() + + def check_acc(self, grad, acc_grad): + grad = grad.numpy() if grad is not None else None + acc_grad = acc_grad.numpy() if acc_grad is not None else None + return np.testing.assert_allclose(grad, acc_grad, rtol=1e-6) + + def print_trainer_0(self, *args): + if self.trainer_id == 0: + print(*args) + + def broadcast_param(self, param, root): + paddle.distributed.broadcast(param, root) + return param + + def check_gradient(self, params): + other_param = [] + for param in params: + if param.trainable and (param._grad_ivar() is not None): + grad = param._grad_ivar() + other_grad = self.broadcast_param(grad.clone(), root=1) + if self.trainer_id == 0: + np.testing.assert_allclose(other_grad.numpy(), grad.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py 
b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py index d45e0be5c29fb..a12d3fb043ddd 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py @@ -90,10 +90,12 @@ def run_trainer(self, args): type(self).__name__, "begin to prepare context in dygraph with nccl2") if not args.find_unused_parameters: - model = paddle.DataParallel(model, find_unused_parameters=False) + model = paddle.DataParallel( + model, find_unused_parameters=False) else: - model = paddle.DataParallel(model, find_unused_parameters=True) - print_to_err(type(self).__name__, "model built in dygraph") + model = paddle.DataParallel( + model, find_unused_parameters=True) + print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): @@ -130,9 +132,9 @@ def run_trainer_with_spawn(self, args): args.trainer_id = dist.get_rank() if args.update_method == "nccl2": - dist.init_parallel_env() + dist.init_parallel_env() model, train_reader, opt = self.get_model() - if args.update_method == "nccl2": + if args.update_method == "nccl2": if args.find_unused_parameters: model = paddle.DataParallel(model, find_unused_parameters=True) else: diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py new file mode 100644 index 0000000000000..0fd64de5e2288 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestModelParallelLayer(TestMultipleGpus): + def test_parallel_dygraph_dataparallel_no_sync(self): + self.run_mnist_2gpu('parallel_dygraph_dataparallel_no_sync.py') + + +if __name__ == "__main__": + unittest.main() From 60c4b42962c5faa518288db1c03b7326a9743db0 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 10 Aug 2021 12:28:50 +0800 Subject: [PATCH 07/10] modify UT --- .../unittests/test_parallel_dygraph_no_sync_gradient_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py index 0fd64de5e2288..f3fc13f3eea1b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py @@ -22,7 +22,7 @@ class TestModelParallelLayer(TestMultipleGpus): def test_parallel_dygraph_dataparallel_no_sync(self): - self.run_mnist_2gpu('parallel_dygraph_dataparallel_no_sync.py') + self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py') if __name__ == "__main__": From 10a08402862e3262e1c47ee5da8201eecbddabcd Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 10 Aug 2021 16:52:55 +0800 Subject: [PATCH 08/10] add specific descriptions and notes for no_sync --- paddle/fluid/imperative/reducer.cc | 4 ++ paddle/fluid/imperative/reducer.h | 7 ++- python/paddle/fluid/dygraph/parallel.py | 46 +++++++++++++++++-- .../unittests/parallel_dygraph_no_sync.py | 2 +- .../parallel_dygraph_no_sync_control_flow.py | 2 +- .../parallel_dygraph_no_sync_unused_params.py | 2 +- .../test_parallel_dygraph_no_sync.py | 2 +- 7 files changed, 55 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 56f92cc31e445..053d3f0a1f281 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -600,6 +600,7 @@ void Reducer::AddDistHook(size_t var_index) { "than %d, but it is %d", variable_locators_.size(), var_index)); + // gradient synchronization is not required when grad_need_hooks_ is false. if (!grad_need_hooks_) { return; } @@ -912,6 +913,9 @@ void Reducer::ProcessUnusedDenseVars() { // 3. create grad var base or get grad var base auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); + // NOTE(haohongxiang): Calling SetIsEmpty here is to make sure that + // gradient accumulation can continue normally after clear_gradients() + // especiall in cases including complex control flow. grad_var_base_tmp->SharedVar()->SetIsEmpty(false); // 4. set grad tensor diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index f86be6bbdff23..89519f1b8d6f1 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -197,7 +197,6 @@ class Reducer { std::unordered_map node_deps_; std::unordered_map var_index_map_; std::vector unused_vars_; - bool grad_need_hooks_{false}; bool has_marked_unused_vars_{false}; bool find_unused_vars_each_step_{false}; bool find_unused_vars_once_{true}; @@ -210,6 +209,12 @@ class Reducer { std::condition_variable cv_; #endif + // grad_need_hooks_ is used to mark whether gradient synchronization is + // required across process. The default value is false. 
When backward() + // is called, grad_need_hooks_ will be assigned to true during preparation + // of backward and revert to false while finalizing backward. + bool grad_need_hooks_{false}; + // it just for checking hook, each parameter can only trigger one hook std::vector vars_marked_ready_; diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index d494a810ca91c..15f4eece4487c 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -484,7 +484,7 @@ def __init__(self, self._layers = layers self.find_unused_parameters = find_unused_parameters - self.require_backward_grad_sync = True + self.grad_need_sync = True # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by @@ -580,17 +580,53 @@ def _find_varbase(self, obj): @contextmanager def no_sync(self): - old_require_backward_grad_sync = self.require_backward_grad_sync - self.require_backward_grad_sync = False + """ + A context manager to stop gradient synchronization. Within no_sync(), + gradients of parameters will only be accumulated on model and not + synchronized util the first forward-backward out of this context. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.nn as nn + import paddle.distributed as dist + + class SimpleNet(nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self._linear = nn.Linear(10, 1) + + def forward(self, x): + return self._linear(x) + + dist.init_parallel_env() + model = SimpleNet() + dp_model = paddle.DataParallel(model) + + inputs_1 = paddle.randn([10, 10], 'float32') + inputs_2 = paddle.ones([10, 10], 'float32') + + with dp_model.no_sync(): + # gradients will not be synchronized + dp_model(inputs_1).backward() + + # synchronization happens here + dp_model(inputs_2).backward() + + """ + tmp_grad_need_sync = self.grad_need_sync + self.grad_need_sync = False try: yield finally: - self.require_backward_grad_sync = old_require_backward_grad_sync + self.grad_need_sync = tmp_grad_need_sync def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) if self._strategy.nranks > 1 and framework._dygraph_tracer( - )._has_grad and self.require_backward_grad_sync: + )._has_grad and self.grad_need_sync: self._reducer.prepare_for_backward( list(self._find_varbase(outputs))) return outputs diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py index f9d6c697b2bb3..0e7e1a32cfa05 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py index 392f75b35df5d..ebc0cd7d6f3de 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_control_flow.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py index a12d3fb043ddd..a5ab327b7788a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_no_sync_unused_params.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py index 362b5219372ad..a1a8ae52eb787 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From bca069be27ae0ab6830e3b876157c252bba32724 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Thu, 12 Aug 2021 14:09:24 +0800 Subject: [PATCH 09/10] check code style --- paddle/fluid/imperative/reducer.cc | 2 +- paddle/fluid/imperative/reducer.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 053d3f0a1f281..4166edf32573c 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -913,7 +913,7 @@ void Reducer::ProcessUnusedDenseVars() { // 3. create grad var base or get grad var base auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); - // NOTE(haohongxiang): Calling SetIsEmpty here is to make sure that + // NOTE(haohongxiang): Calling SetIsEmpty here is to make sure that // gradient accumulation can continue normally after clear_gradients() // especiall in cases including complex control flow. grad_var_base_tmp->SharedVar()->SetIsEmpty(false); diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 89519f1b8d6f1..3cc40f7b1306a 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -209,12 +209,12 @@ class Reducer { std::condition_variable cv_; #endif - // grad_need_hooks_ is used to mark whether gradient synchronization is + // grad_need_hooks_ is used to mark whether gradient synchronization is // required across process. The default value is false. When backward() // is called, grad_need_hooks_ will be assigned to true during preparation // of backward and revert to false while finalizing backward. 
bool grad_need_hooks_{false}; - + // it just for checking hook, each parameter can only trigger one hook std::vector vars_marked_ready_; From a700d7d795d5945e1e2691f04ebf39796019c5c5 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Wed, 18 Aug 2021 14:57:27 +0800 Subject: [PATCH 10/10] modify UT's TIMEOUT in CMakeLists.txt --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d6d6992ed91b9..eba1a970d0cfe 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -906,8 +906,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) - set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
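
The sketch below illustrates the accumulate-then-sync pattern that the new no_sync tests exercise: gradients are accumulated locally for several micro-batches under no_sync() and only all-reduced on the last backward before the optimizer update. This is a minimal usage sketch, not part of the patch series; the toy model, layer sizes, SGD optimizer, and accumulation interval are illustrative assumptions.

    # Minimal usage sketch (not part of this patch): accumulate gradients
    # locally under no_sync() and synchronize only on the last micro-batch.
    # The model, sizes, optimizer and interval below are illustrative.
    import paddle
    import paddle.nn as nn
    import paddle.distributed as dist

    dist.init_parallel_env()

    model = nn.Linear(10, 1)                  # toy model (assumption)
    dp_model = paddle.DataParallel(model)     # wrap for data parallel
    opt = paddle.optimizer.SGD(learning_rate=1e-3,
                               parameters=dp_model.parameters())

    accumulate_steps = 4                      # sync every 4th micro-batch
    for step in range(20):
        x = paddle.randn([8, 10], 'float32')
        if (step + 1) % accumulate_steps != 0:
            # inside no_sync(): gradients accumulate on each rank,
            # no all-reduce is issued during backward
            with dp_model.no_sync():
                dp_model(x).sum().backward()
        else:
            # outside no_sync(): this backward triggers synchronization
            # of the accumulated gradients across ranks
            dp_model(x).sum().backward()
            opt.step()
            opt.clear_grad()

As in the unit tests added by this series, models that skip parameters through data-dependent control flow would additionally pass find_unused_parameters=True when constructing paddle.DataParallel.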