From 8e7cf77d2509856bd8843cc329f1d6cc966ee71d Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Fri, 25 Nov 2022 19:04:16 +0800 Subject: [PATCH 1/3] refactor: move wait --- python/paddle/distributed/__init__.py | 2 +- python/paddle/distributed/collective.py | 70 ------------------- .../distributed/communication/__init__.py | 24 +------ .../paddle/distributed/communication/group.py | 67 ++++++++++++++++++ .../sharding_optimizer_stage2.py | 5 +- 5 files changed, 71 insertions(+), 97 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 35d95c305778e..1add4a9172359 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -30,7 +30,6 @@ from .collective import barrier # noqa: F401 from .collective import split # noqa: F401 from .collective import new_group # noqa: F401 -from .collective import wait # noqa: F401 from .communication import ( stream, @@ -53,6 +52,7 @@ is_initialized, destroy_process_group, get_group, + wait, ) # noqa: F401 from .auto_parallel import shard_op # noqa: F401 diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index ef4def05c239c..85b860eee8bda 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -361,73 +361,3 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): paddle.distributed.all_reduce(tmp, sync_op=True) paddle.distributed.wait(tmp) return gp - - -def wait(tensor, group=None, use_calc_stream=True): - """ - - wait to sync stream for group. - - Args: - tensor (Tensor): The Tensor used before sync. - group (Group): The Group instance to perform sync. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). - Default to True. - - Returns: - None. - - Examples: - .. 
code-block:: python - - import paddle - - paddle.distributed.init_parallel_env() - tindata = paddle.randn(shape=[2, 3]) - paddle.distributed.all_reduce(tindata, sync_op=True) - paddle.distributed.wait(tindata) - - """ - - if group is not None and not group.is_member(): - return - - ring_id = 0 if group is None else group.id - - if use_calc_stream: - _sync_calc_stream(tensor) - else: - _sync_comm_stream(tensor, ring_id) - - -def _sync_calc_stream(tensor): - - if _non_static_mode(): - return _legacy_C_ops.c_sync_calc_stream(tensor, tensor) - - op_type = 'c_sync_calc_stream' - - helper = LayerHelper(op_type, **locals()) - helper.append_op( - type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - ) - - -def _sync_comm_stream(tensor, ring_id=0): - - if _non_static_mode(): - return _legacy_C_ops.c_sync_comm_stream( - [tensor], [tensor], 'ring_id', ring_id - ) - - op_type = 'c_sync_comm_stream' - - helper = LayerHelper(op_type, **locals()) - helper.append_op( - type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={'ring_id': ring_id}, - ) diff --git a/python/paddle/distributed/communication/__init__.py b/python/paddle/distributed/communication/__init__.py index 3b5872ba2c8ff..e640a3df0bff3 100644 --- a/python/paddle/distributed/communication/__init__.py +++ b/python/paddle/distributed/communication/__init__.py @@ -21,26 +21,4 @@ from .batch_isend_irecv import batch_isend_irecv, P2POp from .reduce_scatter import reduce_scatter from .all_to_all import alltoall, alltoall_single -from .group import is_initialized, destroy_process_group, get_group - -__all__ = [ - "ReduceOp", - "all_gather", - "all_gather_object", - "all_reduce", - "alltoall", - "alltoall_single", - "broadcast", - "reduce", - "send", - "scatter", - "isend", - "recv", - "irecv", - "batch_isend_irecv", - "P2POp", - "reduce_scatter", - "is_initialized", - "destroy_process_group", - "get_group", -] +from .group import is_initialized, destroy_process_group, get_group, wait diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py index 8a2c9304aa96e..ce6de3153a2ca 100644 --- a/python/paddle/distributed/communication/group.py +++ b/python/paddle/distributed/communication/group.py @@ -13,7 +13,10 @@ # limitations under the License. import warnings +import paddle import paddle.distributed as dist +import paddle.fluid.framework as framework +import paddle.fluid.layer_helper as layer_helper class Group: @@ -227,3 +230,67 @@ def get_group(id=0): return _GroupManager.group_map_by_id[id] warnings.warn("Group {} is not initialized.".format(id)) return None + + +def _sync_calc_stream(tensor): + if framework._non_static_mode(): + return paddle._legacy_C_ops.c_sync_calc_stream(tensor, tensor) + + op_type = 'c_sync_calc_stream' + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + ) + + +def _sync_comm_stream(tensor, ring_id=0): + if framework._non_static_mode(): + return paddle._legacy_C_ops.c_sync_comm_stream( + [tensor], [tensor], 'ring_id', ring_id + ) + + op_type = 'c_sync_comm_stream' + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={'ring_id': ring_id}, + ) + + +def wait(tensor, group=None, use_calc_stream=True): + """ + + wait to sync stream for group. + + Args: + tensor (Tensor): The Tensor used before sync. 
+ group (Group): The Group instance to perform sync. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle + + paddle.distributed.init_parallel_env() + tindata = paddle.randn(shape=[2, 3]) + paddle.distributed.all_reduce(tindata, sync_op=True) + paddle.distributed.wait(tindata) + + """ + if group is not None and not group.is_member(): + return + + if use_calc_stream: + _sync_calc_stream(tensor) + else: + ring_id = 0 if group is None else group.id + _sync_comm_stream(tensor, ring_id) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index 615980ab5230f..129ba772a058b 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -34,7 +34,6 @@ from paddle.distributed.collective import ( _get_global_group, new_group, - wait, ) from ...utils.internal_storage import ParamStorage, GradStorage @@ -174,7 +173,7 @@ def _sync_params_and_buffers(self): ) # Multi stream operation will be supported later - wait(tensor=p, group=self.group, use_calc_stream=True) + dist.wait(tensor=p, group=self.group, use_calc_stream=True) def _generate_master_params(self, trainable_params): if self.offload: @@ -464,7 +463,7 @@ def _broadcast_params(self): ) # Multi stream operation will be supported later - wait( + dist.wait( tensor=internal_storage.buffer, group=self.group, use_calc_stream=True, From 42548463496f3f493979f4dc247d1aa3e8258337 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Fri, 25 Nov 2022 20:33:01 +0800 Subject: [PATCH 2/3] refactor: move barrier --- python/paddle/distributed/__init__.py | 2 +- python/paddle/distributed/collective.py | 59 +------------------ .../distributed/communication/__init__.py | 8 ++- .../paddle/distributed/communication/group.py | 56 ++++++++++++++++++ 4 files changed, 65 insertions(+), 60 deletions(-) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 1add4a9172359..e7832758a8013 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -27,7 +27,6 @@ from paddle.distributed.fleet.dataset import QueueDataset # noqa: F401 from paddle.distributed.fleet.base.topology import ParallelMode # noqa: F401 -from .collective import barrier # noqa: F401 from .collective import split # noqa: F401 from .collective import new_group # noqa: F401 @@ -53,6 +52,7 @@ destroy_process_group, get_group, wait, + barrier, ) # noqa: F401 from .auto_parallel import shard_op # noqa: F401 diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 85b860eee8bda..eeb45959ee630 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -13,13 +13,10 @@ # limitations under the License. 
import datetime -from ..fluid.layer_helper import LayerHelper from ..fluid.framework import in_dygraph_mode from ..fluid.framework import _non_static_mode -from ..fluid.layers.tensor import fill_constant import paddle import paddle.fluid.core as core -from paddle import _legacy_C_ops from .fleet.layers.mpu.mp_ops import split # noqa: F401 from .fleet.layers.mpu.mp_ops import _c_identity # noqa: F401 from .fleet.layers.mpu.mp_ops import _c_concat # noqa: F401 @@ -160,60 +157,6 @@ def _new_process_group_impl( return pg -def barrier(group=None): - """ - - Barrier among all participators in the group. - - Args: - group (Group): The group instance return by new_group or None for global default group. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle - from paddle.distributed import init_parallel_env - - paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) - init_parallel_env() - paddle.distributed.barrier() - """ - if group is not None and not group.is_member(): - return - - if in_dygraph_mode(): - group = _get_default_group() if group is None else group - place = paddle.fluid.framework._current_expected_place() - if isinstance(place, paddle.fluid.core.CPUPlace): - task = group.process_group.barrier() - else: - device_id = place.get_device_id() - task = group.process_group.barrier(device_id) - task.wait() - return - - ring_id = 0 if group is None else group.id - - temp = fill_constant([1], dtype="int32", value="1") - if _non_static_mode(): - return _legacy_C_ops.barrier(temp, temp, 'ring_id', ring_id) - - op_type = 'barrier' - - if not isinstance(ring_id, int): - raise ValueError("The type of 'group' for barrier must be int.") - helper = LayerHelper(op_type, **locals()) - helper.append_op( - type=op_type, - inputs={'X': [temp]}, - outputs={'Out': [temp]}, - attrs={'ring_id': ring_id}, - ) - - # _custom_gid provides a way for users to # set the group id, which is usually useful # to be compatible with the static mode. 
@@ -356,7 +299,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): tmp = ( paddle.to_tensor([1], dtype="int32") if _non_static_mode() - else fill_constant([0], dtype="int32", value="1") + else paddle.full([0], 1, dtype="int32") ) paddle.distributed.all_reduce(tmp, sync_op=True) paddle.distributed.wait(tmp) diff --git a/python/paddle/distributed/communication/__init__.py b/python/paddle/distributed/communication/__init__.py index e640a3df0bff3..fb3408020d624 100644 --- a/python/paddle/distributed/communication/__init__.py +++ b/python/paddle/distributed/communication/__init__.py @@ -21,4 +21,10 @@ from .batch_isend_irecv import batch_isend_irecv, P2POp from .reduce_scatter import reduce_scatter from .all_to_all import alltoall, alltoall_single -from .group import is_initialized, destroy_process_group, get_group, wait +from .group import ( + is_initialized, + destroy_process_group, + get_group, + wait, + barrier, +) diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py index ce6de3153a2ca..ea0fd6c3aaa91 100644 --- a/python/paddle/distributed/communication/group.py +++ b/python/paddle/distributed/communication/group.py @@ -15,6 +15,7 @@ import warnings import paddle import paddle.distributed as dist +import paddle.fluid.core as core import paddle.fluid.framework as framework import paddle.fluid.layer_helper as layer_helper @@ -294,3 +295,58 @@ def wait(tensor, group=None, use_calc_stream=True): else: ring_id = 0 if group is None else group.id _sync_comm_stream(tensor, ring_id) + + +def barrier(group=None): + """ + + Barrier among all participators in the group. + + Args: + group (Group): The group instance return by new_group or None for global default group. + + Returns: + None. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.distributed import init_parallel_env + + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + paddle.distributed.barrier() + """ + if group is not None and not group.is_member(): + return + + if framework.in_dygraph_mode(): + group = _get_global_group() if group is None else group + place = framework._current_expected_place() + if isinstance(place, core.CPUPlace): + task = group.process_group.barrier() + else: + device_id = place.get_device_id() + task = group.process_group.barrier(device_id) + task.wait() + return + + ring_id = 0 if group is None else group.id + + barrier_tensor = paddle.full([1], 1, dtype="int32") + if framework._non_static_mode(): + return paddle._legacy_C_ops.barrier( + barrier_tensor, barrier_tensor, 'ring_id', ring_id + ) + + op_type = 'barrier' + if not isinstance(ring_id, int): + raise ValueError("The type of 'group' for barrier must be int.") + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [barrier_tensor]}, + outputs={'Out': [barrier_tensor]}, + attrs={'ring_id': ring_id}, + ) From 16480feeda3bedbdc094543254fe10ed3a94ec73 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Sat, 26 Nov 2022 00:02:28 +0800 Subject: [PATCH 3/3] fix: fix incorrect import --- .../fleet/meta_parallel/sharding/sharding_stage2.py | 6 +++--- .../fleet/meta_parallel/sharding/sharding_stage3.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index a6ac8edc063db..e19b08a7d9c33 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -318,7 +318,7 @@ def __sync_buffers(self): buffer, self._global_root_rank, self._group, sync_op=True ) # Multi stream operation will be supported later - collective.wait(tensor=buffer, group=self._group, use_calc_stream=True) + dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) def __getattr__(self, name): """Forward missing attributes to wrapped layer.""" @@ -382,7 +382,7 @@ def cleanup(): ) # Multi stream operation will be supported later - collective.wait( + dist.wait( tensor=param.grad, group=self._group, use_calc_stream=True, @@ -448,7 +448,7 @@ def cleanup(): ) # Multi stream operation will be supported later - collective.wait( + dist.wait( tensor=grad_storage.buffer, group=self._group, use_calc_stream=True, diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index c9a14fd17bede..7da9762f8cb26 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -184,7 +184,7 @@ def _sync_params_and_buffers(self): ) # Multi stream operation will be supported later - collective.wait(tensor=p, group=self._group, use_calc_stream=True) + dist.wait(tensor=p, group=self._group, use_calc_stream=True) def _clear_gradients(self): assert len(self._trainable_params.keys()) > 0 @@ -485,7 +485,7 @@ def _sync_buffers(self): buffer, self._global_root_rank, self._group, sync_op=True ) # Multi stream operation will be supported later - collective.wait(tensor=buffer, group=self._group, use_calc_stream=True) + 
dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) def __getattr__(self, name): """Forward missing attributes to wrapped layer.""" @@ -529,7 +529,7 @@ def _update_params(self): dist.all_reduce( tensor=grad_storage.buffer, group=self._group, sync_op=True ) - collective.wait( + dist.wait( tensor=grad_storage.buffer, group=self._group, use_calc_stream=True, @@ -601,7 +601,7 @@ def allreduce_(*_): dist.all_reduce( tensor=full_grad, group=self._group, sync_op=True ) - collective.wait( + dist.wait( tensor=full_grad, group=self._group, use_calc_stream=True ) @@ -946,7 +946,7 @@ def _allgather_buffer( # Allgather current layer in the 1st step synchronously if sync_wait: with paddle.amp.auto_cast(enable=False): - collective.wait( + dist.wait( tensor=full_param, group=group, use_calc_stream=use_calc_stream,
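
For reference, the refactor keeps the public entry points stable: `wait` and `barrier` are still exposed as `paddle.distributed.wait` and `paddle.distributed.barrier`; only their implementation moves into `python/paddle/distributed/communication/group.py`. Below is a minimal usage sketch of the relocated APIs, mirroring the docstring examples carried in the moved code; the device setup and the [2, 3] tensor shape are illustrative assumptions, and the script is assumed to run under a multi-process launcher such as `paddle.distributed.launch`.

    import paddle
    import paddle.distributed as dist

    # Assumes each process was started by a distributed launcher so that
    # init_parallel_env() can bind one device per rank.
    dist.init_parallel_env()

    # barrier: block until every rank in the (default) group reaches this point.
    dist.barrier()

    # The shape [2, 3] is arbitrary; any tensor participating in a collective works.
    tindata = paddle.randn(shape=[2, 3])
    dist.all_reduce(tindata, sync_op=True)

    # wait: synchronize the calculation stream (use_calc_stream=True) or the
    # communication stream of the group (use_calc_stream=False) after the collective.
    dist.wait(tindata, group=None, use_calc_stream=True)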