From ca83fbce596cd0a76b44d64972d005fa88cefedf Mon Sep 17 00:00:00 2001
From: Wen Sun
Date: Wed, 7 Sep 2022 16:23:01 +0800
Subject: [PATCH 1/7] feat(python/distributed/collective): add bfloat16 support for collective ops

---
 .../collective/ProcessGroupGloo.cc            |   3 +
 .../collective/ProcessGroupNCCL.cc            |   5 +
 .../fluid/platform/device/gpu/nccl_helper.h   |   4 +-
 python/paddle/distributed/collective.py       |  32 +--
 .../collective_allgather_api_dygraph.py       |  15 +-
 .../collective_allreduce_api_dygraph.py       |  13 +-
 .../collective_alltoall_api_dygraph.py        |  22 +-
 .../collective_alltoall_single_api_dygraph.py |  16 +-
 .../collective_broadcast_api_dygraph.py       |  13 +-
 .../collective_isend_irecv_api_dygraph.py     |  23 +-
 .../collective_reduce_api_dygraph.py          |  13 +-
 .../collective_reduce_scatter_api_dygraph.py  |  16 +-
 .../collective_scatter_api_dygraph.py         |  29 ++-
 .../collective_sendrecv_api_dygraph.py        |  26 +-
 .../test_collective_allgather_api.py          | 242 +++---------------
 .../test_collective_allreduce_api.py          |  10 +-
 .../test_collective_alltoall_api.py           |   6 +-
 .../test_collective_alltoall_single_api.py    |   6 +-
 .../test_collective_broadcast_api.py          |  10 +-
 .../test_collective_isend_irecv_api.py        |   6 +-
 .../collective/test_collective_reduce_api.py  |  10 +-
 .../test_collective_reduce_scatter_api.py     |   6 +-
 .../collective/test_collective_scatter_api.py |  10 +-
 .../test_collective_sendrecv_api.py           |   6 +-
 .../unittests/test_collective_api_base.py     |  38 ++-
 25 files changed, 270 insertions(+), 310 deletions(-)

diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index 097c9799b70f2..07065ac908e4e 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -88,6 +88,9 @@ namespace distributed {
     case experimental::DataType::BOOL:       \
       func<bool>(args);                      \
       break;                                 \
+    case experimental::DataType::BFLOAT16:   \
+      func<bfloat16>(args);                  \
+      break;                                 \
     default:                                 \
       VLOG(0) << "Error: Unknown DataType."; \
       exit(-1);                              \
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index b406f596401ef..90917229f3cc2 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -996,6 +996,11 @@ void* GetPointerByOffset(void* raw_pointer,
   } else if (type == experimental::DataType::BOOL) {
     return reinterpret_cast<void*>(reinterpret_cast<bool*>(raw_pointer) +
                                    offset);
+#if NCCL_VERSION_CODE >= 21000
+  } else if (type == experimental::DataType::BFLOAT16) {
+    return reinterpret_cast<void*>(reinterpret_cast<uint16_t*>(raw_pointer) +
+                                   offset);
+#endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "This datatype in nccl is not supported."));
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index a5d89f6001fa1..5d89da86efa6c 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
     return ncclUint8;
   } else if (type == framework::proto::VarType::BOOL) {
     return ncclUint8;
-#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000
   } else if (type == framework::proto::VarType::BF16) {
     return ncclBfloat16;
 #endif
@@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) {
     return ncclInt8;
   } else if (type == experimental::DataType::BOOL) {
     return ncclUint8;
-#if 
CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == experimental::DataType::BFLOAT16) { return ncclBfloat16; #endif diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 41cb3256c8f5d..4c14638130af8 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -626,7 +626,7 @@ def broadcast(tensor, src, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send if current rank is the source, or the Tensor to receive otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -709,7 +709,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. @@ -817,7 +817,7 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True): Args: tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. + should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128. tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. group (Group, optional): The group instance return by new_group or None for global default group. @@ -999,9 +999,9 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. Default value is None. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None. src (int): The source rank id. Default value is 0. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1096,7 +1096,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor_list (list): A list of output Tensors. 
The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1197,7 +1197,7 @@ def alltoall_single(in_tensor, ``alltoall_single`` is only supported in eager mode. Args: - in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. @@ -1286,7 +1286,7 @@ def send(tensor, dst=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1352,7 +1352,7 @@ def recv(tensor, src=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1435,7 +1435,7 @@ def isend(tensor, dst, group=None): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1485,7 +1485,7 @@ def irecv(tensor, src=None, group=None): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1668,9 +1668,9 @@ def reduce_scatter(tensor, Reduces, then scatters a list of tensors to all processes in a group Args: - tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. 
Default: ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1736,9 +1736,9 @@ def _reduce_scatter_base(output, Reduces, then scatters a flattened tensor to all processes in a group. Args: - output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. - input (Tensor): Input tensor that is of size output tensor size times world size. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. + input (Tensor): Input tensor that is of size output tensor size times world size. Its data type + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index 11fe3e4c0259a..4d5f82e288220 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 44446bd84a164..9bdbaa18177e1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index e0589072ab2ad..eb19cadb11426 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -13,23 +13,31 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveAllToAllAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": - runtime_main(TestCollectiveAllToAllAPI, "alltoall") + test_base.runtime_main(TestCollectiveAllToAllAPI, "alltoall") diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 8a1492b779b62..f66b3a74bfd21 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index acb1b4a5866c8..9004d27d56183 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 5434706234535..37a38b218c5dc 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,13 +25,23 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 5525bd8fa4aab..5e9dfc8265ea1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index 19777260b6e89..c9df2459a78e0 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index fa65928967bdf..8f27f84a32d52 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,15 +25,27 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index ac8ffde7a48b3..b4bf24ffbfaa9 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -13,24 +13,34 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveSendRecvAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": - runtime_main(TestCollectiveSendRecvAPI, "sendrecv") + test_base.runtime_main(TestCollectiveSendRecvAPI, "sendrecv") diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index af4e6c10baaf9..eb51453387bb4 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -26,213 +26,53 @@ def _setup_config(self): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - 
dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - 
"allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index c0bd54a6fad7a..8e1febf121374 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -41,8 +41,8 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -53,8 +53,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -65,5 +65,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index a042507ede1d4..511d66f4567f6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -30,8 +30,8 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", @@ -41,5 +41,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index 2f18903068edb..26bc4a777f0f5 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -23,8 +23,8 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", @@ -34,5 +34,5 
@@ def test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index f0c7682805247..5c25693d1f29f 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -35,8 +35,8 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -47,8 +47,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -59,5 +59,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 333da7e6807aa..4cbbc88bc3bf2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -23,8 +23,8 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", @@ -34,5 +34,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index ccaf61472fe8a..579a23e4d3e49 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -38,8 +38,8 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -50,8 +50,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -62,5 +62,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index d490a8bbce5df..e6a16234e4280 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -23,8 +23,8 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", @@ -34,5 +34,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index d5e8e7cc62e16..b693df152f6b8 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -34,8 +34,8 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -46,8 +46,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -58,5 +58,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index ee8ada3d22be6..fe2f94e8cd6ff 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -32,8 +32,8 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", @@ -43,5 +43,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index b05481191533e..30a95af6cf8ff 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -28,6 +28,7 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core +from paddle_bfloat 
import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -81,6 +82,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -311,6 +315,10 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) + # cast bfloat16 to float32 for numeric comparison + if dtype == "bfloat16": + input1 = input1.astype("float32") + input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -327,7 +335,13 @@ def check_with_place(self, np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) + # bfloat16 precision loss comes from truncating the last 16 bits of float32, + # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -338,18 +352,28 @@ def check_with_place(self, need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) elif col_type == "allreduce": need_result = input1 + input2 + if dtype == "bfloat16": + rtol = 8e-03 + atol = 8e-03 + else: + rtol = 1e-05 + atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) np.testing.assert_allclose(tr1_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) From 68865ed8e3e55688c5b88ec66d39b9e323df073f Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Wed, 7 Sep 2022 16:27:52 +0800 Subject: [PATCH 2/7] chore(python/distributed/collective): update tests timeout --- .../tests/unittests/collective/CMakeLists.txt | 18 +++++++++--------- .../tests/unittests/collective/testslist.csv | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 19d6f848792a3..6631b7f46e0d0 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -71,14 +71,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS 
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -98,7 +98,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_single_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -125,7 +125,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -154,7 +154,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_isend_irecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -187,7 +187,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_api MODULES test_collective_reduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -207,7 +207,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -221,7 +221,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_scatter_api MODULES test_collective_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -235,7 +235,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_sendrecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( diff --git 
a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 08c7c394ab788..883cf7941e368 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -7,27 +7,27 @@ test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,2,,PYTHONPATH=..;http_proxy= test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_collective_allgather_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_allgather_object_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_allreduce_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allreduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_alltoall_single,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_single_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_single_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_barrier_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_batch_isend_irecv,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_broadcast_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_broadcast_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_cpu_barrier_with_gloo,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_gather,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_scatter,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_isend_irecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_isend_irecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_optimizer,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_process_group,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce_scatter,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_scatter_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., 
test_collective_scatter,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_sendrecv,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_sendrecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., From 4c2aac1fcb2cc1fb727f9053e2af0300ff23f023 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Wed, 7 Sep 2022 22:27:31 +0800 Subject: [PATCH 3/7] fix(python/distributed/collective): add nccl version hack --- .../collective/test_collective_allgather_api.py | 4 +++- .../collective/test_collective_allreduce_api.py | 4 +++- .../unittests/collective/test_collective_alltoall_api.py | 4 +++- .../collective/test_collective_alltoall_single_api.py | 4 +++- .../collective/test_collective_broadcast_api.py | 4 +++- .../collective/test_collective_isend_irecv_api.py | 4 +++- .../unittests/collective/test_collective_reduce_api.py | 4 +++- .../collective/test_collective_reduce_scatter_api.py | 4 +++- .../unittests/collective/test_collective_scatter_api.py | 4 +++- .../unittests/collective/test_collective_sendrecv_api.py | 4 +++- .../fluid/tests/unittests/test_collective_api_base.py | 9 +++++++++ 11 files changed, 39 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index eb51453387bb4..9040564ce1206 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -51,8 +51,10 @@ def test_allgather_gloo(self): def test_allgatther_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16", "complex64", "complex128" + "bool", "complex64", "complex128" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allgather_api_dygraph.py", "allgather", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index 8e1febf121374..a5080f78bcee2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -42,8 +42,10 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", 
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index 511d66f4567f6..1edb06ae512d6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -31,8 +31,10 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index 26bc4a777f0f5..e3ef3f302f33e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -24,8 +24,10 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index 5c25693d1f29f..8f4e747b622eb 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -36,8 +36,10 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 4cbbc88bc3bf2..2b0727cae0c8e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -24,8 +24,10 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 579a23e4d3e49..35bff97f91619 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -39,8 +39,10 @@ def test_reduce_gloo(self): def 
test_reduce_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index e6a16234e4280..669478f58a37d 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -24,8 +24,10 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index b693df152f6b8..ab7de7975feed 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -35,8 +35,10 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index fe2f94e8cd6ff..3db6df5d46e19 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -33,8 +33,10 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 30a95af6cf8ff..2251081e8310e 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -177,6 +177,15 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() From c4c6260ea669f0fc0ee29a372240758abe7fbb47 Mon Sep 17 00:00:00 2001 
From: Wen Sun Date: Thu, 8 Sep 2022 14:05:03 +0800 Subject: [PATCH 4/7] revert(python/distributed/collective): remove bfloat16 tests temporarily --- .../collective_allgather_api_dygraph.py | 16 +- .../collective_allreduce_api_dygraph.py | 14 +- .../collective_alltoall_api_dygraph.py | 17 +- .../collective_alltoall_single_api_dygraph.py | 16 +- .../collective_broadcast_api_dygraph.py | 14 +- .../collective_isend_irecv_api_dygraph.py | 24 +- .../collective_reduce_api_dygraph.py | 14 +- .../collective_reduce_scatter_api_dygraph.py | 17 +- .../collective_scatter_api_dygraph.py | 30 +-- .../collective_sendrecv_api_dygraph.py | 21 +- .../test_collective_allgather_api.py | 244 +++++++++++++++--- .../test_collective_allreduce_api.py | 12 +- .../test_collective_alltoall_api.py | 8 +- .../test_collective_alltoall_single_api.py | 8 +- .../test_collective_broadcast_api.py | 12 +- .../test_collective_isend_irecv_api.py | 8 +- .../collective/test_collective_reduce_api.py | 12 +- .../test_collective_reduce_scatter_api.py | 8 +- .../collective/test_collective_scatter_api.py | 12 +- .../test_collective_sendrecv_api.py | 8 +- .../unittests/test_collective_api_base.py | 47 +--- 21 files changed, 295 insertions(+), 267 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index 4d5f82e288220..2491297a7e1c3 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,18 +25,10 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): + tindata = paddle.to_tensor(indata) tensor_list = [] - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.all_gather(tensor_list, tindata) - return [ - tensor.cast("float32").numpy() for tensor in tensor_list - ] - else: - tindata = paddle.to_tensor(indata) - dist.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + paddle.distributed.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 9bdbaa18177e1..933e9e9838ed4 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
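The branch removed above is the core trick of this series and is worth spelling out: dygraph `to_tensor` has no direct bfloat16 entry point, so the tests smuggle bfloat16 through uint16 (the same 16-bit width), then cast back to float32 before comparing results. A rough sketch of that round trip, assuming a CUDA build of paddle and eliding the collective call itself:

    # Sketch of the uint16 stand-in for bfloat16 (assumes paddle with GPU).
    # Per the NOTE in these tests, dygraph stores bfloat16 as uint16, so
    # float32 -> uint16 here is effectively a float32 -> bfloat16 cast.
    import numpy as np
    import paddle

    indata = np.random.random((10, 1000)).astype("float32")
    tindata = paddle.to_tensor(indata, "float32").cast("uint16")
    # ... run the collective op on tindata here ...
    out = tindata.cast("float32").numpy()  # back to float32 to compare
    print(np.abs(out - indata).max())      # bfloat16 truncation error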
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,15 +25,9 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.all_reduce(tindata) - return [tindata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - dist.all_reduce(tindata) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + paddle.distributed.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index eb19cadb11426..4515f12b35a1b 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,18 +25,11 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - tindata = paddle.split(tindata, 2, axis=0) - dist.alltoall(tindata, toutdata) - return [data.cast("float32").numpy() for data in toutdata] - else: - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) - dist.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + paddle.distributed.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index f66b3a74bfd21..8a1492b779b62 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -13,7 +13,6 @@ # limitations under the License. 
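For the alltoall hunks just above: each rank splits its input into one chunk per rank, and the collective swaps chunk i with rank i, so rank r ends up holding chunk r from every rank. A hypothetical two-rank sketch of that flow (run under `paddle.distributed.launch`):

    # Hypothetical two-rank sketch of the alltoall exchange pattern.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    rank = dist.get_rank()
    data = paddle.arange(8, dtype="float32").reshape([4, 2]) + 100 * rank
    in_chunks = paddle.split(data, 2, axis=0)  # one chunk per rank
    out_chunks = []                            # filled by alltoall
    dist.alltoall(in_chunks, out_chunks)
    # rank r now holds [chunk r of rank 0, chunk r of rank 1]
    print(rank, [t.numpy() for t in out_chunks])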
import paddle -import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -25,17 +24,10 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") - dist.alltoall_single(tindata, toutdata) - return [toutdata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - dist.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + paddle.distributed.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index 9004d27d56183..7357af6693549 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,15 +25,9 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.broadcast(tindata, src=1) - return [tindata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - dist.broadcast(tindata, src=1) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + paddle.distributed.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 37a38b218c5dc..0a034b6e629d0 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
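alltoall_single, reverted just above, is the single-buffer variant of the same exchange: one input tensor and one preallocated output tensor of identical shape and dtype, no tensor lists. A hypothetical two-rank sketch; `zeros_like` is used here only as a convenient way to allocate the receive buffer:

    # Hypothetical two-rank sketch of alltoall_single; assumes an
    # initialized parallel env with exactly two ranks.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    tindata = paddle.rand([4, 2], dtype="float32")
    toutdata = paddle.zeros_like(tindata)  # preallocated receive buffer
    dist.alltoall_single(tindata, toutdata)
    print(toutdata.numpy())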
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,23 +25,13 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - if rank == 0: - task = dist.isend(tindata, dst=1) - else: - task = dist.irecv(tindata, src=0) - task.wait() - return [tindata.cast("float32").numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = paddle.distributed.isend(tindata, dst=1) else: - tindata = paddle.to_tensor(indata) - if rank == 0: - task = dist.isend(tindata, dst=1) - else: - task = dist.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + task = paddle.distributed.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 5e9dfc8265ea1..c2489bbcfcfa6 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,15 +25,9 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.reduce(tindata, dst=0) - return [tindata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - dist.reduce(tindata, dst=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + paddle.distributed.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index c9df2459a78e0..c5c07fe307a75 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
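Unlike plain send/recv, the isend/irecv pair above returns a task handle instead of blocking, and the buffer is only safe to read after task.wait(); that ordering is exactly what both branches of the test preserve. A hypothetical two-rank sketch of the pattern:

    # Hypothetical two-rank sketch of nonblocking point-to-point transfer.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    tindata = paddle.rand([10, 1000], dtype="float32")
    if dist.get_rank() == 0:
        task = dist.isend(tindata, dst=1)  # returns immediately
    else:
        task = dist.irecv(tindata, src=0)  # fills tindata in place
    task.wait()  # tindata is only valid once the task completes
    print(tindata.numpy().sum())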
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,17 +25,10 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - dist.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - dist.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index 8f27f84a32d52..5647a4c5b9255 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,27 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - dist.scatter(subdata1, src=1) - else: - dist.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.cast("float32").numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + paddle.distributed.scatter(subdata1, src=1) else: - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - dist.scatter(subdata1, src=1) - else: - dist.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + paddle.distributed.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index b4bf24ffbfaa9..4b3e8221f0797 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
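reduce_scatter, reverted just above, fuses its two namesake steps: the input lists are summed elementwise across ranks, and each rank keeps only the slice matching its rank id; this is why the checker compares rank 0 against the first half of input1 + input2 and rank 1 against the second half. A hypothetical two-rank sketch:

    # Hypothetical two-rank sketch of reduce_scatter: sum across ranks,
    # then rank r keeps the r-th chunk of the reduced result in subdata1.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    tindata = paddle.rand([10, 1000], dtype="float32")
    subdata1, subdata2 = paddle.split(tindata, 2, axis=0)
    dist.reduce_scatter(subdata1, [subdata1, subdata2])
    print(subdata1.numpy().shape)  # (5, 1000) on every rank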
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,21 +25,12 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - if rank == 0: - dist.send(tindata, dst=1) - else: - dist.recv(tindata, src=0) - return [tindata.cast("float32").numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + paddle.distributed.send(tindata, dst=1) else: - tindata = paddle.to_tensor(indata) - if rank == 0: - dist.send(tindata, dst=1) - else: - dist.recv(tindata, src=0) - return [tindata.numpy()] + paddle.distributed.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index 9040564ce1206..af4e6c10baaf9 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -26,55 +26,213 @@ def _setup_config(self): pass def test_allgather_nccl(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "complex64", "complex128" - ] - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype=dtype) + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="float16") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="float32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="float64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="bool") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="uint8") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="int8") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="int32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="int64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="complex64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="complex128") def test_allgather_gloo(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "complex64", "complex128" - ] - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype=dtype) + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="float16") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="float32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="float64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="bool") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="uint8") + 
self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="int8") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="int32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="int64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="complex64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="complex128") def test_allgatther_nccl_dygraph(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "complex64", "complex128" - ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype=dtype) + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="float16") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="float32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="float64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="bool") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="uint8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="int8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="int32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="int64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="complex64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="complex128") def test_allgather_gloo_dygraph(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16", "complex64", "complex128" - ] - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype=dtype) + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="float16") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="float32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="float64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="bool") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="uint8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="int8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="int32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + 
dtype="int64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="complex64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="complex128") -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index a5080f78bcee2..c0bd54a6fad7a 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -41,11 +41,9 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -55,8 +53,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -67,5 +65,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index 1edb06ae512d6..a042507ede1d4 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -30,11 +30,9 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -43,5 +41,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index e3ef3f302f33e..2f18903068edb 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -23,11 +23,9 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -36,5 +34,5 @@ def 
test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index 8f4e747b622eb..f0c7682805247 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -35,11 +35,9 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -49,8 +47,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -61,5 +59,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 2b0727cae0c8e..333da7e6807aa 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -23,11 +23,9 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -36,5 +34,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 35bff97f91619..ccaf61472fe8a 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -38,11 +38,9 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -52,8 +50,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in 
dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -64,5 +62,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index 669478f58a37d..d490a8bbce5df 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -23,11 +23,9 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -36,5 +34,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index ab7de7975feed..d5e8e7cc62e16 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -34,11 +34,9 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -48,8 +46,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -60,5 +58,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index 3db6df5d46e19..ee8ada3d22be6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -32,11 +32,9 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -45,5 +43,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 2251081e8310e..b05481191533e 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -28,7 +28,6 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core -from paddle_bfloat import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -82,9 +81,6 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) - elif dtype == "bfloat16": - # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace - return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -177,15 +173,6 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() - # NOTE: this is a hack to get int format nccl version, like 2134 - # if current platform is not linux, version number will be 0 - nccl_version_str = subprocess.check_output( - r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", - stderr=subprocess.DEVNULL, - shell=True).decode('utf-8') - self._nccl_version = int("".join( - nccl_version_str.split("."))) if nccl_version_str else 0 - def tearDown(self): self.temp_dir.cleanup() @@ -324,10 +311,6 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) - # cast bfloat16 to float32 for numeric comparison - if dtype == "bfloat16": - input1 = input1.astype("float32") - input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -344,13 +327,7 @@ def check_with_place(self, np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - # bfloat16 precision loss comes from truncating the last 16 bits of float32, - # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 - if dtype == "bfloat16": - rtol = 8e-03 - else: - rtol = 1e-05 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) + np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -361,28 +338,18 @@ def check_with_place(self, need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - if dtype == "bfloat16": - rtol = 8e-03 - else: - rtol = 1e-05 - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) elif col_type == "allreduce": need_result = input1 + input2 - if dtype == "bfloat16": - rtol = 8e-03 - atol = 8e-03 - else: - rtol = 1e-05 - atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=rtol, - atol=atol) + rtol=1e-05, + atol=1e-05) np.testing.assert_allclose(tr1_out[0], need_result, - 
rtol=rtol, - atol=atol) + rtol=1e-05, + atol=1e-05) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) From 35eec2bb4104b878a11877da156df28f71dd85e2 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Fri, 9 Sep 2022 10:15:05 +0800 Subject: [PATCH 5/7] refactor(python/distributed/collective): remove useless version macro --- paddle/fluid/distributed/collective/ProcessGroupNCCL.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 90917229f3cc2..75f061f693b9b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -996,11 +996,9 @@ void* GetPointerByOffset(void* raw_pointer, } else if (type == experimental::DataType::BOOL) { return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); -#if NCCL_VERSION_CODE >= 21000 } else if (type == experimental::DataType::BFLOAT16) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + + return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); -#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); From 74d862a38c71804dce366caf1256b8705a1d3f71 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Sun, 9 Oct 2022 19:00:24 +0800 Subject: [PATCH 6/7] revert(python/distributed/collective): recover temporary bfloat16 tests --- .../collective_allgather_api_dygraph.py | 16 +- .../collective_allreduce_api_dygraph.py | 14 +- .../collective_alltoall_api_dygraph.py | 17 +- .../collective_alltoall_single_api_dygraph.py | 16 +- .../collective_broadcast_api_dygraph.py | 14 +- .../collective_isend_irecv_api_dygraph.py | 24 +- .../collective_reduce_api_dygraph.py | 14 +- .../collective_reduce_scatter_api_dygraph.py | 17 +- .../collective_scatter_api_dygraph.py | 30 ++- .../collective_sendrecv_api_dygraph.py | 21 +- .../test_collective_allgather_api.py | 244 +++--------------- .../test_collective_allreduce_api.py | 12 +- .../test_collective_alltoall_api.py | 8 +- .../test_collective_alltoall_single_api.py | 8 +- .../test_collective_broadcast_api.py | 12 +- .../test_collective_isend_irecv_api.py | 8 +- .../collective/test_collective_reduce_api.py | 12 +- .../test_collective_reduce_scatter_api.py | 8 +- .../collective/test_collective_scatter_api.py | 12 +- .../test_collective_sendrecv_api.py | 8 +- .../unittests/test_collective_api_base.py | 47 +++- 21 files changed, 267 insertions(+), 295 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index 2491297a7e1c3..4d5f82e288220 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
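One detail from the tolerance logic that the hunks above strip out (and that this revert brings back along with the rest of the bfloat16 machinery): casting float32 to bfloat16 truncates the low 16 mantissa bits, and the removed comment bounds the resulting relative error by \sum_{i=-23}^{-8}2^{i}, about 0.0078, hence the relaxed rtol of 8e-03 for bfloat16 reduce/allreduce comparisons. A quick check of that arithmetic:

    # Worked check of the 8e-03 bfloat16 tolerance: sum the weights of the
    # 16 truncated mantissa bits, 2^-23 up through 2^-8.
    bound = sum(2.0**i for i in range(-23, -7))  # i = -23, ..., -8
    print(bound)  # ~0.0078, rounded up to rtol = 8e-03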
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,10 +25,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 933e9e9838ed4..9bdbaa18177e1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index 4515f12b35a1b..eb19cadb11426 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,11 +25,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 8a1492b779b62..f66b3a74bfd21 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index 7357af6693549..9004d27d56183 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 0a034b6e629d0..37a38b218c5dc 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,13 +25,23 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index c2489bbcfcfa6..5e9dfc8265ea1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index c5c07fe307a75..c9df2459a78e0 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index 5647a4c5b9255..8f27f84a32d52 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,15 +25,27 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index 4b3e8221f0797..b4bf24ffbfaa9 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,12 +25,21 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index af4e6c10baaf9..9040564ce1206 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -26,213 +26,55 @@ def _setup_config(self): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - 
dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", 
"float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index c0bd54a6fad7a..a5080f78bcee2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -41,9 +41,11 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -53,8 +55,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -65,5 +67,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py 
b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index a042507ede1d4..1edb06ae512d6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -30,9 +30,11 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -41,5 +43,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index 2f18903068edb..e3ef3f302f33e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -23,9 +23,11 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -34,5 +36,5 @@ def test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index f0c7682805247..8f4e747b622eb 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -35,9 +35,11 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -47,8 +49,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -59,5 +61,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 333da7e6807aa..2b0727cae0c8e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ 
b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -23,9 +23,11 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -34,5 +36,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index ccaf61472fe8a..35bff97f91619 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -38,9 +38,11 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -50,8 +52,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -62,5 +64,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index d490a8bbce5df..669478f58a37d 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -23,9 +23,11 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -34,5 +36,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index d5e8e7cc62e16..ab7de7975feed 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -34,9 +34,11 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + 
"float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -46,8 +48,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -58,5 +60,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index ee8ada3d22be6..3db6df5d46e19 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -32,9 +32,11 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -43,5 +45,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index b05481191533e..2251081e8310e 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -28,6 +28,7 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core +from paddle_bfloat import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -81,6 +82,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -173,6 +177,15 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() @@ -311,6 +324,10 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), 
@@ -311,6 +324,10 @@ def check_with_place(self,
                                   model_file,
                                   required_envs)
         input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0)
         input2 = create_test_data(shape=(10, 1000),
                                   dtype=dtype,
                                   seed=pid1)
+        # cast bfloat16 to float32 for numeric comparison
+        if dtype == "bfloat16":
+            input1 = input1.astype("float32")
+            input2 = input2.astype("float32")
         if col_type == "allgather":
             need_result = np.vstack((input1, input2))
             tr_out0 = np.vstack((tr0_out[0], tr0_out[1]))
@@ -327,7 +344,13 @@ def check_with_place(self,
             np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05)
         elif col_type == "reduce":
             need_result = input1 + input2
-            np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05)
+            # bfloat16 precision loss comes from truncating the low 16 bits of
+            # the float32 mantissa; the discarded bits sum to at most
+            # \sum_{i=-23}^{-8} 2^{i} = 2^{-7} - 2^{-23}, about 0.0078
+            if dtype == "bfloat16":
+                rtol = 8e-03
+            else:
+                rtol = 1e-05
+            np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol)
         elif col_type == "scatter":
             need_result = input2
             need_result1 = need_result[0:need_result.shape[0] // 2]
@@ -338,18 +361,28 @@ def check_with_place(self,
             need_result = input1 + input2
             need_result1 = need_result[0:need_result.shape[0] // 2]
             need_result2 = need_result[need_result.shape[0] // 2:]
-            np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05)
-            np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05)
+            if dtype == "bfloat16":
+                rtol = 8e-03
+            else:
+                rtol = 1e-05
+            np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol)
+            np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol)
         elif col_type == "allreduce":
             need_result = input1 + input2
+            if dtype == "bfloat16":
+                rtol = 8e-03
+                atol = 8e-03
+            else:
+                rtol = 1e-05
+                atol = 1e-05
             np.testing.assert_allclose(tr0_out[0],
                                        need_result,
-                                       rtol=1e-05,
-                                       atol=1e-05)
+                                       rtol=rtol,
+                                       atol=atol)
             np.testing.assert_allclose(tr1_out[0],
                                        need_result,
-                                       rtol=1e-05,
-                                       atol=1e-05)
+                                       rtol=rtol,
+                                       atol=atol)
         elif col_type == "parallel_embedding":
             result_data = tr0_out[0]
             np.random.seed(2020)
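The 8e-03 tolerances above can be sanity-checked directly: the worst case for a float32-to-bfloat16 truncation is every discarded mantissa bit being set, so the relative error is bounded by 2^{-7} - 2^{-23}, just under 0.0078. A one-liner to confirm:

    bound = sum(2.0**i for i in range(-23, -7))  # discarded bits 2^-23 ... 2^-8
    print(bound)  # 0.007812380790710449, comfortably below rtol = 8e-03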
Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None,