From 8e7cf77d2509856bd8843cc329f1d6cc966ee71d Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Fri, 25 Nov 2022 19:04:16 +0800 Subject: [PATCH 1/3] refactor: move wait --- python/paddle/distributed/__init__.py | 2 +- python/paddle/distributed/collective.py | 70 ------------------- .../distributed/communication/__init__.py | 24 +------ .../paddle/distributed/communication/group.py | 67 ++++++++++++++++++ .../sharding_optimizer_stage2.py | 5 +- 5 files changed, 71 insertions(+), 97 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 35d95c305778e..1add4a9172359 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -30,7 +30,6 @@ from .collective import barrier # noqa: F401 from .collective import split # noqa: F401 from .collective import new_group # noqa: F401 -from .collective import wait # noqa: F401 from .communication import ( stream, @@ -53,6 +52,7 @@ is_initialized, destroy_process_group, get_group, + wait, ) # noqa: F401 from .auto_parallel import shard_op # noqa: F401 diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index ef4def05c239c..85b860eee8bda 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -361,73 +361,3 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): paddle.distributed.all_reduce(tmp, sync_op=True) paddle.distributed.wait(tmp) return gp - - -def wait(tensor, group=None, use_calc_stream=True): - """ - - wait to sync stream for group. - - Args: - tensor (Tensor): The Tensor used before sync. - group (Group): The Group instance to perform sync. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). - Default to True. - - Returns: - None. - - Examples: - .. 
code-block:: python - - import paddle - - paddle.distributed.init_parallel_env() - tindata = paddle.randn(shape=[2, 3]) - paddle.distributed.all_reduce(tindata, sync_op=True) - paddle.distributed.wait(tindata) - - """ - - if group is not None and not group.is_member(): - return - - ring_id = 0 if group is None else group.id - - if use_calc_stream: - _sync_calc_stream(tensor) - else: - _sync_comm_stream(tensor, ring_id) - - -def _sync_calc_stream(tensor): - - if _non_static_mode(): - return _legacy_C_ops.c_sync_calc_stream(tensor, tensor) - - op_type = 'c_sync_calc_stream' - - helper = LayerHelper(op_type, **locals()) - helper.append_op( - type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - ) - - -def _sync_comm_stream(tensor, ring_id=0): - - if _non_static_mode(): - return _legacy_C_ops.c_sync_comm_stream( - [tensor], [tensor], 'ring_id', ring_id - ) - - op_type = 'c_sync_comm_stream' - - helper = LayerHelper(op_type, **locals()) - helper.append_op( - type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={'ring_id': ring_id}, - ) diff --git a/python/paddle/distributed/communication/__init__.py b/python/paddle/distributed/communication/__init__.py index 3b5872ba2c8ff..e640a3df0bff3 100644 --- a/python/paddle/distributed/communication/__init__.py +++ b/python/paddle/distributed/communication/__init__.py @@ -21,26 +21,4 @@ from .batch_isend_irecv import batch_isend_irecv, P2POp from .reduce_scatter import reduce_scatter from .all_to_all import alltoall, alltoall_single -from .group import is_initialized, destroy_process_group, get_group - -__all__ = [ - "ReduceOp", - "all_gather", - "all_gather_object", - "all_reduce", - "alltoall", - "alltoall_single", - "broadcast", - "reduce", - "send", - "scatter", - "isend", - "recv", - "irecv", - "batch_isend_irecv", - "P2POp", - "reduce_scatter", - "is_initialized", - "destroy_process_group", - "get_group", -] +from .group import is_initialized, destroy_process_group, get_group, wait diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py index 8a2c9304aa96e..ce6de3153a2ca 100644 --- a/python/paddle/distributed/communication/group.py +++ b/python/paddle/distributed/communication/group.py @@ -13,7 +13,10 @@ # limitations under the License. import warnings +import paddle import paddle.distributed as dist +import paddle.fluid.framework as framework +import paddle.fluid.layer_helper as layer_helper class Group: @@ -227,3 +230,67 @@ def get_group(id=0): return _GroupManager.group_map_by_id[id] warnings.warn("Group {} is not initialized.".format(id)) return None + + +def _sync_calc_stream(tensor): + if framework._non_static_mode(): + return paddle._legacy_C_ops.c_sync_calc_stream(tensor, tensor) + + op_type = 'c_sync_calc_stream' + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + ) + + +def _sync_comm_stream(tensor, ring_id=0): + if framework._non_static_mode(): + return paddle._legacy_C_ops.c_sync_comm_stream( + [tensor], [tensor], 'ring_id', ring_id + ) + + op_type = 'c_sync_comm_stream' + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={'ring_id': ring_id}, + ) + + +def wait(tensor, group=None, use_calc_stream=True): + """ + + wait to sync stream for group. + + Args: + tensor (Tensor): The Tensor used before sync. 
+ group (Group): The Group instance to perform sync. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle + + paddle.distributed.init_parallel_env() + tindata = paddle.randn(shape=[2, 3]) + paddle.distributed.all_reduce(tindata, sync_op=True) + paddle.distributed.wait(tindata) + + """ + if group is not None and not group.is_member(): + return + + if use_calc_stream: + _sync_calc_stream(tensor) + else: + ring_id = 0 if group is None else group.id + _sync_comm_stream(tensor, ring_id) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index 615980ab5230f..129ba772a058b 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -34,7 +34,6 @@ from paddle.distributed.collective import ( _get_global_group, new_group, - wait, ) from ...utils.internal_storage import ParamStorage, GradStorage @@ -174,7 +173,7 @@ def _sync_params_and_buffers(self): ) # Multi stream operation will be supported later - wait(tensor=p, group=self.group, use_calc_stream=True) + dist.wait(tensor=p, group=self.group, use_calc_stream=True) def _generate_master_params(self, trainable_params): if self.offload: @@ -464,7 +463,7 @@ def _broadcast_params(self): ) # Multi stream operation will be supported later - wait( + dist.wait( tensor=internal_storage.buffer, group=self.group, use_calc_stream=True, From 42548463496f3f493979f4dc247d1aa3e8258337 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Fri, 25 Nov 2022 20:33:01 +0800 Subject: [PATCH 2/3] refactor: move barrier --- python/paddle/distributed/__init__.py | 2 +- python/paddle/distributed/collective.py | 59 +------------------ .../distributed/communication/__init__.py | 8 ++- .../paddle/distributed/communication/group.py | 56 ++++++++++++++++++ 4 files changed, 65 insertions(+), 60 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 1add4a9172359..e7832758a8013 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -27,7 +27,6 @@ from paddle.distributed.fleet.dataset import QueueDataset # noqa: F401 from paddle.distributed.fleet.base.topology import ParallelMode # noqa: F401 -from .collective import barrier # noqa: F401 from .collective import split # noqa: F401 from .collective import new_group # noqa: F401 @@ -53,6 +52,7 @@ destroy_process_group, get_group, wait, + barrier, ) # noqa: F401 from .auto_parallel import shard_op # noqa: F401 diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 85b860eee8bda..eeb45959ee630 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -13,13 +13,10 @@ # limitations under the License. 
import datetime -from ..fluid.layer_helper import LayerHelper from ..fluid.framework import in_dygraph_mode from ..fluid.framework import _non_static_mode -from ..fluid.layers.tensor import fill_constant import paddle import paddle.fluid.core as core -from paddle import _legacy_C_ops from .fleet.layers.mpu.mp_ops import split # noqa: F401 from .fleet.layers.mpu.mp_ops import _c_identity # noqa: F401 from .fleet.layers.mpu.mp_ops import _c_concat # noqa: F401 @@ -160,60 +157,6 @@ def _new_process_group_impl( return pg -def barrier(group=None): - """ - - Barrier among all participators in the group. - - Args: - group (Group): The group instance return by new_group or None for global default group. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle - from paddle.distributed import init_parallel_env - - paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) - init_parallel_env() - paddle.distributed.barrier() - """ - if group is not None and not group.is_member(): - return - - if in_dygraph_mode(): - group = _get_default_group() if group is None else group - place = paddle.fluid.framework._current_expected_place() - if isinstance(place, paddle.fluid.core.CPUPlace): - task = group.process_group.barrier() - else: - device_id = place.get_device_id() - task = group.process_group.barrier(device_id) - task.wait() - return - - ring_id = 0 if group is None else group.id - - temp = fill_constant([1], dtype="int32", value="1") - if _non_static_mode(): - return _legacy_C_ops.barrier(temp, temp, 'ring_id', ring_id) - - op_type = 'barrier' - - if not isinstance(ring_id, int): - raise ValueError("The type of 'group' for barrier must be int.") - helper = LayerHelper(op_type, **locals()) - helper.append_op( - type=op_type, - inputs={'X': [temp]}, - outputs={'Out': [temp]}, - attrs={'ring_id': ring_id}, - ) - - # _custom_gid provides a way for users to # set the group id, which is usually useful # to be compatible with the static mode. 
@@ -356,7 +299,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): tmp = ( paddle.to_tensor([1], dtype="int32") if _non_static_mode() - else fill_constant([0], dtype="int32", value="1") + else paddle.full([0], 1, dtype="int32") ) paddle.distributed.all_reduce(tmp, sync_op=True) paddle.distributed.wait(tmp) diff --git a/python/paddle/distributed/communication/__init__.py b/python/paddle/distributed/communication/__init__.py index e640a3df0bff3..fb3408020d624 100644 --- a/python/paddle/distributed/communication/__init__.py +++ b/python/paddle/distributed/communication/__init__.py @@ -21,4 +21,10 @@ from .batch_isend_irecv import batch_isend_irecv, P2POp from .reduce_scatter import reduce_scatter from .all_to_all import alltoall, alltoall_single -from .group import is_initialized, destroy_process_group, get_group, wait +from .group import ( + is_initialized, + destroy_process_group, + get_group, + wait, + barrier, +) diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py index ce6de3153a2ca..ea0fd6c3aaa91 100644 --- a/python/paddle/distributed/communication/group.py +++ b/python/paddle/distributed/communication/group.py @@ -15,6 +15,7 @@ import warnings import paddle import paddle.distributed as dist +import paddle.fluid.core as core import paddle.fluid.framework as framework import paddle.fluid.layer_helper as layer_helper @@ -294,3 +295,58 @@ def wait(tensor, group=None, use_calc_stream=True): else: ring_id = 0 if group is None else group.id _sync_comm_stream(tensor, ring_id) + + +def barrier(group=None): + """ + + Barrier among all participators in the group. + + Args: + group (Group): The group instance return by new_group or None for global default group. + + Returns: + None. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.distributed import init_parallel_env + + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + paddle.distributed.barrier() + """ + if group is not None and not group.is_member(): + return + + if framework.in_dygraph_mode(): + group = _get_global_group() if group is None else group + place = framework._current_expected_place() + if isinstance(place, core.CPUPlace): + task = group.process_group.barrier() + else: + device_id = place.get_device_id() + task = group.process_group.barrier(device_id) + task.wait() + return + + ring_id = 0 if group is None else group.id + + barrier_tensor = paddle.full([1], 1, dtype="int32") + if framework._non_static_mode(): + return paddle._legacy_C_ops.barrier( + barrier_tensor, barrier_tensor, 'ring_id', ring_id + ) + + op_type = 'barrier' + if not isinstance(ring_id, int): + raise ValueError("The type of 'group' for barrier must be int.") + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [barrier_tensor]}, + outputs={'Out': [barrier_tensor]}, + attrs={'ring_id': ring_id}, + ) From 16480feeda3bedbdc094543254fe10ed3a94ec73 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Sat, 26 Nov 2022 00:02:28 +0800 Subject: [PATCH 3/3] fix: fix incorrect import --- .../fleet/meta_parallel/sharding/sharding_stage2.py | 6 +++--- .../fleet/meta_parallel/sharding/sharding_stage3.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index a6ac8edc063db..e19b08a7d9c33 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -318,7 +318,7 @@ def __sync_buffers(self): buffer, self._global_root_rank, self._group, sync_op=True ) # Multi stream operation will be supported later - collective.wait(tensor=buffer, group=self._group, use_calc_stream=True) + dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) def __getattr__(self, name): """Forward missing attributes to wrapped layer.""" @@ -382,7 +382,7 @@ def cleanup(): ) # Multi stream operation will be supported later - collective.wait( + dist.wait( tensor=param.grad, group=self._group, use_calc_stream=True, @@ -448,7 +448,7 @@ def cleanup(): ) # Multi stream operation will be supported later - collective.wait( + dist.wait( tensor=grad_storage.buffer, group=self._group, use_calc_stream=True, diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index c9a14fd17bede..7da9762f8cb26 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -184,7 +184,7 @@ def _sync_params_and_buffers(self): ) # Multi stream operation will be supported later - collective.wait(tensor=p, group=self._group, use_calc_stream=True) + dist.wait(tensor=p, group=self._group, use_calc_stream=True) def _clear_gradients(self): assert len(self._trainable_params.keys()) > 0 @@ -485,7 +485,7 @@ def _sync_buffers(self): buffer, self._global_root_rank, self._group, sync_op=True ) # Multi stream operation will be supported later - collective.wait(tensor=buffer, group=self._group, use_calc_stream=True) + 
dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) def __getattr__(self, name): """Forward missing attributes to wrapped layer.""" @@ -529,7 +529,7 @@ def _update_params(self): dist.all_reduce( tensor=grad_storage.buffer, group=self._group, sync_op=True ) - collective.wait( + dist.wait( tensor=grad_storage.buffer, group=self._group, use_calc_stream=True, @@ -601,7 +601,7 @@ def allreduce_(*_): dist.all_reduce( tensor=full_grad, group=self._group, sync_op=True ) - collective.wait( + dist.wait( tensor=full_grad, group=self._group, use_calc_stream=True ) @@ -946,7 +946,7 @@ def _allgather_buffer( # Allgather current layer in the 1st step synchronously if sync_wait: with paddle.amp.auto_cast(enable=False): - collective.wait( + dist.wait( tensor=full_param, group=group, use_calc_stream=use_calc_stream,
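
For reference, the refactor keeps the public entry points stable: `wait` and `barrier` are still exposed as `paddle.distributed.wait` and `paddle.distributed.barrier`; only their implementation moves into `python/paddle/distributed/communication/group.py`. Below is a minimal usage sketch of the relocated APIs, mirroring the docstring examples carried in the moved code; the device setup and the [2, 3] tensor shape are illustrative assumptions, and the script is assumed to run under a multi-process launcher such as `paddle.distributed.launch`.

    import paddle
    import paddle.distributed as dist

    # Assumes each process was started by a distributed launcher so that
    # init_parallel_env() can bind one device per rank.
    dist.init_parallel_env()

    # barrier: block until every rank in the (default) group reaches this point.
    dist.barrier()

    # The shape [2, 3] is arbitrary; any tensor participating in a collective works.
    tindata = paddle.randn(shape=[2, 3])
    dist.all_reduce(tindata, sync_op=True)

    # wait: synchronize the calculation stream (use_calc_stream=True) or the
    # communication stream of the group (use_calc_stream=False) after the collective.
    dist.wait(tindata, group=None, use_calc_stream=True)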