From ca83fbce596cd0a76b44d64972d005fa88cefedf Mon Sep 17 00:00:00 2001
From: Wen Sun
Date: Wed, 7 Sep 2022 16:23:01 +0800
Subject: [PATCH 1/7] feat(python/distributed/collective): add bfloat16 support for collective ops

---
 .../collective/ProcessGroupGloo.cc            |   3 +
 .../collective/ProcessGroupNCCL.cc            |   5 +
 .../fluid/platform/device/gpu/nccl_helper.h   |   4 +-
 python/paddle/distributed/collective.py       |  32 +--
 .../collective_allgather_api_dygraph.py       |  15 +-
 .../collective_allreduce_api_dygraph.py       |  13 +-
 .../collective_alltoall_api_dygraph.py        |  22 +-
 .../collective_alltoall_single_api_dygraph.py |  16 +-
 .../collective_broadcast_api_dygraph.py       |  13 +-
 .../collective_isend_irecv_api_dygraph.py     |  23 +-
 .../collective_reduce_api_dygraph.py          |  13 +-
 .../collective_reduce_scatter_api_dygraph.py  |  16 +-
 .../collective_scatter_api_dygraph.py         |  29 ++-
 .../collective_sendrecv_api_dygraph.py        |  26 +-
 .../test_collective_allgather_api.py          | 242 +++---------------
 .../test_collective_allreduce_api.py          |  10 +-
 .../test_collective_alltoall_api.py           |   6 +-
 .../test_collective_alltoall_single_api.py    |   6 +-
 .../test_collective_broadcast_api.py          |  10 +-
 .../test_collective_isend_irecv_api.py        |   6 +-
 .../collective/test_collective_reduce_api.py  |  10 +-
 .../test_collective_reduce_scatter_api.py     |   6 +-
 .../collective/test_collective_scatter_api.py |  10 +-
 .../test_collective_sendrecv_api.py           |   6 +-
 .../unittests/test_collective_api_base.py     |  38 ++-
 25 files changed, 270 insertions(+), 310 deletions(-)

diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index 097c9799b70f2..07065ac908e4e 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -88,6 +88,9 @@ namespace distributed {
     case experimental::DataType::BOOL:       \
       func<bool>(args);                      \
       break;                                 \
+    case experimental::DataType::BFLOAT16:   \
+      func<bfloat16>(args);                  \
+      break;                                 \
     default:                                 \
       VLOG(0) << "Error: Unknown DataType."; \
       exit(-1);                              \
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index b406f596401ef..90917229f3cc2 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -996,6 +996,11 @@ void* GetPointerByOffset(void* raw_pointer,
   } else if (type == experimental::DataType::BOOL) {
     return reinterpret_cast<void*>(reinterpret_cast<bool*>(raw_pointer) +
                                    offset);
+#if NCCL_VERSION_CODE >= 21000
+  } else if (type == experimental::DataType::BFLOAT16) {
+    return reinterpret_cast<void*>(reinterpret_cast<uint16_t*>(raw_pointer) +
+                                   offset);
+#endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "This datatype in nccl is not supported."));
diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h
index a5d89f6001fa1..5d89da86efa6c 100644
--- a/paddle/fluid/platform/device/gpu/nccl_helper.h
+++ b/paddle/fluid/platform/device/gpu/nccl_helper.h
@@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
     return ncclUint8;
   } else if (type == framework::proto::VarType::BOOL) {
     return ncclUint8;
-#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000
   } else if (type == framework::proto::VarType::BF16) {
     return ncclBfloat16;
 #endif
@@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) {
     return ncclInt8;
   } else if (type == experimental::DataType::BOOL) {
     return ncclUint8;
-#if 
CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == experimental::DataType::BFLOAT16) { return ncclBfloat16; #endif diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 41cb3256c8f5d..4c14638130af8 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -626,7 +626,7 @@ def broadcast(tensor, src, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send if current rank is the source, or the Tensor to receive otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -709,7 +709,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. @@ -817,7 +817,7 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True): Args: tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. + should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128. tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. group (Group, optional): The group instance return by new_group or None for global default group. @@ -999,9 +999,9 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True): Args: tensor (Tensor): The output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. Default value is None. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None. src (int): The source rank id. Default value is 0. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1096,7 +1096,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor_list (list): A list of output Tensors. 
The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1197,7 +1197,7 @@ def alltoall_single(in_tensor, ``alltoall_single`` is only supported in eager mode. Args: - in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. @@ -1286,7 +1286,7 @@ def send(tensor, dst=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1352,7 +1352,7 @@ def recv(tensor, src=0, group=None, sync_op=True): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. sync_op (bool, optional): Whether this op is a sync op. The default value is True. @@ -1435,7 +1435,7 @@ def isend(tensor, dst, group=None): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1485,7 +1485,7 @@ def irecv(tensor, src=None, group=None): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1668,9 +1668,9 @@ def reduce_scatter(tensor, Reduces, then scatters a list of tensors to all processes in a group Args: - tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. 
Default: ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. @@ -1736,9 +1736,9 @@ def _reduce_scatter_base(output, Reduces, then scatters a flattened tensor to all processes in a group. Args: - output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. - input (Tensor): Input tensor that is of size output tensor size times world size. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. + input (Tensor): Input tensor that is of size output tensor size times world size. Its data type + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index 11fe3e4c0259a..4d5f82e288220 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 44446bd84a164..9bdbaa18177e1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index e0589072ab2ad..eb19cadb11426 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -13,23 +13,31 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveAllToAllAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": - runtime_main(TestCollectiveAllToAllAPI, "alltoall") + test_base.runtime_main(TestCollectiveAllToAllAPI, "alltoall") diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 8a1492b779b62..f66b3a74bfd21 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index acb1b4a5866c8..9004d27d56183 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 5434706234535..37a38b218c5dc 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,13 +25,23 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 5525bd8fa4aab..5e9dfc8265ea1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index 19777260b6e89..c9df2459a78e0 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index fa65928967bdf..8f27f84a32d52 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,15 +25,27 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index ac8ffde7a48b3..b4bf24ffbfaa9 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -13,24 +13,34 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveSendRecvAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": - runtime_main(TestCollectiveSendRecvAPI, "sendrecv") + test_base.runtime_main(TestCollectiveSendRecvAPI, "sendrecv") diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index af4e6c10baaf9..eb51453387bb4 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -26,213 +26,53 @@ def _setup_config(self): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - 
dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - 
"allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index c0bd54a6fad7a..8e1febf121374 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -41,8 +41,8 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -53,8 +53,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -65,5 +65,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index a042507ede1d4..511d66f4567f6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -30,8 +30,8 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", @@ -41,5 +41,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index 2f18903068edb..26bc4a777f0f5 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -23,8 +23,8 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", @@ -34,5 +34,5 
@@ def test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index f0c7682805247..5c25693d1f29f 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -35,8 +35,8 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -47,8 +47,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -59,5 +59,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 333da7e6807aa..4cbbc88bc3bf2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -23,8 +23,8 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", @@ -34,5 +34,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index ccaf61472fe8a..579a23e4d3e49 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -38,8 +38,8 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -50,8 +50,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -62,5 +62,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index d490a8bbce5df..e6a16234e4280 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -23,8 +23,8 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", @@ -34,5 +34,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index d5e8e7cc62e16..b693df152f6b8 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -34,8 +34,8 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -46,8 +46,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -58,5 +58,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index ee8ada3d22be6..fe2f94e8cd6ff 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -32,8 +32,8 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", @@ -43,5 +43,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index b05481191533e..30a95af6cf8ff 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -28,6 +28,7 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core +from paddle_bfloat 
import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -81,6 +82,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -311,6 +315,10 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) + # cast bfloat16 to float32 for numeric comparison + if dtype == "bfloat16": + input1 = input1.astype("float32") + input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -327,7 +335,13 @@ def check_with_place(self, np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) + # bfloat16 precision loss comes from truncating the last 16 bits of float32, + # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -338,18 +352,28 @@ def check_with_place(self, need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) elif col_type == "allreduce": need_result = input1 + input2 + if dtype == "bfloat16": + rtol = 8e-03 + atol = 8e-03 + else: + rtol = 1e-05 + atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) np.testing.assert_allclose(tr1_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) From 68865ed8e3e55688c5b88ec66d39b9e323df073f Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Wed, 7 Sep 2022 16:27:52 +0800 Subject: [PATCH 2/7] chore(python/distributed/collective): update tests timeout --- .../tests/unittests/collective/CMakeLists.txt | 18 +++++++++--------- .../tests/unittests/collective/testslist.csv | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 19d6f848792a3..6631b7f46e0d0 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -71,14 +71,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS 
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -98,7 +98,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_single_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -125,7 +125,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -154,7 +154,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_isend_irecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -187,7 +187,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_api MODULES test_collective_reduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -207,7 +207,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -221,7 +221,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_scatter_api MODULES test_collective_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -235,7 +235,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_sendrecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( diff --git 
a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 08c7c394ab788..883cf7941e368 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -7,27 +7,27 @@ test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,2,,PYTHONPATH=..;http_proxy= test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_collective_allgather_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_allgather_object_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_allreduce_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allreduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_alltoall_single,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_single_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_single_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_barrier_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_batch_isend_irecv,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_broadcast_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_broadcast_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_cpu_barrier_with_gloo,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_gather,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_scatter,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_isend_irecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_isend_irecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_optimizer,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_process_group,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce_scatter,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_scatter_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., 
test_collective_scatter,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_sendrecv,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_sendrecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., From 4c2aac1fcb2cc1fb727f9053e2af0300ff23f023 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Wed, 7 Sep 2022 22:27:31 +0800 Subject: [PATCH 3/7] fix(python/distributed/collective): add nccl version hack --- .../collective/test_collective_allgather_api.py | 4 +++- .../collective/test_collective_allreduce_api.py | 4 +++- .../unittests/collective/test_collective_alltoall_api.py | 4 +++- .../collective/test_collective_alltoall_single_api.py | 4 +++- .../collective/test_collective_broadcast_api.py | 4 +++- .../collective/test_collective_isend_irecv_api.py | 4 +++- .../unittests/collective/test_collective_reduce_api.py | 4 +++- .../collective/test_collective_reduce_scatter_api.py | 4 +++- .../unittests/collective/test_collective_scatter_api.py | 4 +++- .../unittests/collective/test_collective_sendrecv_api.py | 4 +++- .../fluid/tests/unittests/test_collective_api_base.py | 9 +++++++++ 11 files changed, 39 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index eb51453387bb4..9040564ce1206 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -51,8 +51,10 @@ def test_allgather_gloo(self): def test_allgatther_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16", "complex64", "complex128" + "bool", "complex64", "complex128" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allgather_api_dygraph.py", "allgather", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index 8e1febf121374..a5080f78bcee2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -42,8 +42,10 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", 
diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index 511d66f4567f6..1edb06ae512d6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -31,8 +31,10 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index 26bc4a777f0f5..e3ef3f302f33e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -24,8 +24,10 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index 5c25693d1f29f..8f4e747b622eb 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -36,8 +36,10 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 4cbbc88bc3bf2..2b0727cae0c8e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -24,8 +24,10 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 579a23e4d3e49..35bff97f91619 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -39,8 +39,10 @@ def test_reduce_gloo(self): def 
test_reduce_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index e6a16234e4280..669478f58a37d 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -24,8 +24,10 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index b693df152f6b8..ab7de7975feed 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -35,8 +35,10 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index fe2f94e8cd6ff..3db6df5d46e19 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -33,8 +33,10 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 30a95af6cf8ff..2251081e8310e 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -177,6 +177,15 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() From c4c6260ea669f0fc0ee29a372240758abe7fbb47 Mon Sep 17 00:00:00 2001 
From: Wen Sun Date: Thu, 8 Sep 2022 14:05:03 +0800 Subject: [PATCH 4/7] revert(python/distributed/collective): remove bfloat16 tests temporarily --- .../collective_allgather_api_dygraph.py | 16 +- .../collective_allreduce_api_dygraph.py | 14 +- .../collective_alltoall_api_dygraph.py | 17 +- .../collective_alltoall_single_api_dygraph.py | 16 +- .../collective_broadcast_api_dygraph.py | 14 +- .../collective_isend_irecv_api_dygraph.py | 24 +- .../collective_reduce_api_dygraph.py | 14 +- .../collective_reduce_scatter_api_dygraph.py | 17 +- .../collective_scatter_api_dygraph.py | 30 +-- .../collective_sendrecv_api_dygraph.py | 21 +- .../test_collective_allgather_api.py | 244 +++++++++++++++--- .../test_collective_allreduce_api.py | 12 +- .../test_collective_alltoall_api.py | 8 +- .../test_collective_alltoall_single_api.py | 8 +- .../test_collective_broadcast_api.py | 12 +- .../test_collective_isend_irecv_api.py | 8 +- .../collective/test_collective_reduce_api.py | 12 +- .../test_collective_reduce_scatter_api.py | 8 +- .../collective/test_collective_scatter_api.py | 12 +- .../test_collective_sendrecv_api.py | 8 +- .../unittests/test_collective_api_base.py | 47 +--- 21 files changed, 295 insertions(+), 267 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index 4d5f82e288220..2491297a7e1c3 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,18 +25,10 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): + tindata = paddle.to_tensor(indata) tensor_list = [] - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.all_gather(tensor_list, tindata) - return [ - tensor.cast("float32").numpy() for tensor in tensor_list - ] - else: - tindata = paddle.to_tensor(indata) - dist.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + paddle.distributed.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 9bdbaa18177e1..933e9e9838ed4 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
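The branch removed above is the core trick of this series and is worth spelling out: dygraph `to_tensor` has no direct bfloat16 entry point, so the tests smuggle bfloat16 through uint16 (the same 16-bit width), then cast back to float32 before comparing results. A rough sketch of that round trip, assuming a CUDA build of paddle and eliding the collective call itself:

    # Sketch of the uint16 stand-in for bfloat16 (assumes paddle with GPU).
    # Per the NOTE in these tests, dygraph stores bfloat16 as uint16, so
    # float32 -> uint16 here is effectively a float32 -> bfloat16 cast.
    import numpy as np
    import paddle

    indata = np.random.random((10, 1000)).astype("float32")
    tindata = paddle.to_tensor(indata, "float32").cast("uint16")
    # ... run the collective op on tindata here ...
    out = tindata.cast("float32").numpy()  # back to float32 to compare
    print(np.abs(out - indata).max())      # bfloat16 truncation error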
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,15 +25,9 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.all_reduce(tindata) - return [tindata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - dist.all_reduce(tindata) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + paddle.distributed.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index eb19cadb11426..4515f12b35a1b 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,18 +25,11 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - tindata = paddle.split(tindata, 2, axis=0) - dist.alltoall(tindata, toutdata) - return [data.cast("float32").numpy() for data in toutdata] - else: - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) - dist.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + paddle.distributed.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index f66b3a74bfd21..8a1492b779b62 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -13,7 +13,6 @@ # limitations under the License. 
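For the alltoall hunks just above: each rank splits its input into one chunk per rank, and the collective swaps chunk i with rank i, so rank r ends up holding chunk r from every rank. A hypothetical two-rank sketch of that flow (run under `paddle.distributed.launch`):

    # Hypothetical two-rank sketch of the alltoall exchange pattern.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    rank = dist.get_rank()
    data = paddle.arange(8, dtype="float32").reshape([4, 2]) + 100 * rank
    in_chunks = paddle.split(data, 2, axis=0)  # one chunk per rank
    out_chunks = []                            # filled by alltoall
    dist.alltoall(in_chunks, out_chunks)
    # rank r now holds [chunk r of rank 0, chunk r of rank 1]
    print(rank, [t.numpy() for t in out_chunks])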
import paddle -import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -25,17 +24,10 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") - dist.alltoall_single(tindata, toutdata) - return [toutdata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - dist.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + paddle.distributed.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index 9004d27d56183..7357af6693549 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,15 +25,9 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.broadcast(tindata, src=1) - return [tindata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - dist.broadcast(tindata, src=1) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + paddle.distributed.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 37a38b218c5dc..0a034b6e629d0 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
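alltoall_single, reverted just above, is the single-buffer variant of the same exchange: one input tensor and one preallocated output tensor of identical shape and dtype, no tensor lists. A hypothetical two-rank sketch; `zeros_like` is used here only as a convenient way to allocate the receive buffer:

    # Hypothetical two-rank sketch of alltoall_single; assumes an
    # initialized parallel env with exactly two ranks.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    tindata = paddle.rand([4, 2], dtype="float32")
    toutdata = paddle.zeros_like(tindata)  # preallocated receive buffer
    dist.alltoall_single(tindata, toutdata)
    print(toutdata.numpy())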
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,23 +25,13 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - if rank == 0: - task = dist.isend(tindata, dst=1) - else: - task = dist.irecv(tindata, src=0) - task.wait() - return [tindata.cast("float32").numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = paddle.distributed.isend(tindata, dst=1) else: - tindata = paddle.to_tensor(indata) - if rank == 0: - task = dist.isend(tindata, dst=1) - else: - task = dist.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + task = paddle.distributed.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 5e9dfc8265ea1..c2489bbcfcfa6 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,15 +25,9 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - dist.reduce(tindata, dst=0) - return [tindata.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - dist.reduce(tindata, dst=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + paddle.distributed.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index c9df2459a78e0..c5c07fe307a75 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
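Unlike plain send/recv, the isend/irecv pair above returns a task handle instead of blocking, and the buffer is only safe to read after task.wait(); that ordering is exactly what both branches of the test preserve. A hypothetical two-rank sketch of the pattern:

    # Hypothetical two-rank sketch of nonblocking point-to-point transfer.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    tindata = paddle.rand([10, 1000], dtype="float32")
    if dist.get_rank() == 0:
        task = dist.isend(tindata, dst=1)  # returns immediately
    else:
        task = dist.irecv(tindata, src=0)  # fills tindata in place
    task.wait()  # tindata is only valid once the task completes
    print(tindata.numpy().sum())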
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,17 +25,10 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - dist.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.cast("float32").numpy()] - else: - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - dist.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index 8f27f84a32d52..5647a4c5b9255 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,27 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - dist.scatter(subdata1, src=1) - else: - dist.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.cast("float32").numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + paddle.distributed.scatter(subdata1, src=1) else: - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - dist.scatter(subdata1, src=1) - else: - dist.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + paddle.distributed.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index b4bf24ffbfaa9..4b3e8221f0797 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
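reduce_scatter, reverted just above, fuses its two namesake steps: the input lists are summed elementwise across ranks, and each rank keeps only the slice matching its rank id; this is why the checker compares rank 0 against the first half of input1 + input2 and rank 1 against the second half. A hypothetical two-rank sketch:

    # Hypothetical two-rank sketch of reduce_scatter: sum across ranks,
    # then rank r keeps the r-th chunk of the reduced result in subdata1.
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    tindata = paddle.rand([10, 1000], dtype="float32")
    subdata1, subdata2 = paddle.split(tindata, 2, axis=0)
    dist.reduce_scatter(subdata1, [subdata1, subdata2])
    print(subdata1.numpy().shape)  # (5, 1000) on every rank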
import paddle -import paddle.distributed as dist import paddle.fluid as fluid +import unittest import test_collective_api_base as test_base @@ -25,21 +25,12 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 - if indata.dtype == "bfloat16": - tindata = paddle.to_tensor(indata, "float32").cast("uint16") - if rank == 0: - dist.send(tindata, dst=1) - else: - dist.recv(tindata, src=0) - return [tindata.cast("float32").numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + paddle.distributed.send(tindata, dst=1) else: - tindata = paddle.to_tensor(indata) - if rank == 0: - dist.send(tindata, dst=1) - else: - dist.recv(tindata, src=0) - return [tindata.numpy()] + paddle.distributed.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index 9040564ce1206..af4e6c10baaf9 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -26,55 +26,213 @@ def _setup_config(self): pass def test_allgather_nccl(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "complex64", "complex128" - ] - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype=dtype) + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="float16") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="float32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="float64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="bool") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="uint8") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="int8") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="int32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="int64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="complex64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype="complex128") def test_allgather_gloo(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "complex64", "complex128" - ] - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype=dtype) + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="float16") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="float32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="float64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="bool") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="uint8") + 
self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="int8") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="int32") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="int64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="complex64") + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype="complex128") def test_allgatther_nccl_dygraph(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "complex64", "complex128" - ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype=dtype) + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="float16") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="float32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="float64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="bool") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="uint8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="int8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="int32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="int64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="complex64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype="complex128") def test_allgather_gloo_dygraph(self): - dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16", "complex64", "complex128" - ] - for dtype in dtypes_to_test: - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype=dtype) + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="float16") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="float32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="float64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="bool") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="uint8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="int8") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="int32") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + 
dtype="int64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="complex64") + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype="complex128") -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index a5080f78bcee2..c0bd54a6fad7a 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -41,11 +41,9 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -55,8 +53,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -67,5 +65,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index 1edb06ae512d6..a042507ede1d4 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -30,11 +30,9 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -43,5 +41,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index e3ef3f302f33e..2f18903068edb 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -23,11 +23,9 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -36,5 +34,5 @@ def 
test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index 8f4e747b622eb..f0c7682805247 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -35,11 +35,9 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -49,8 +47,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -61,5 +59,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 2b0727cae0c8e..333da7e6807aa 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -23,11 +23,9 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -36,5 +34,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 35bff97f91619..ccaf61472fe8a 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -38,11 +38,9 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -52,8 +50,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in 
dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -64,5 +62,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index 669478f58a37d..d490a8bbce5df 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -23,11 +23,9 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -36,5 +34,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index ab7de7975feed..d5e8e7cc62e16 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -34,11 +34,9 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -48,8 +46,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool", "bfloat16" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -60,5 +58,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index 3db6df5d46e19..ee8ada3d22be6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -32,11 +32,9 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - "float16", "float32", "float64", "int32", "int64", "int8", "uint8", - "bool" + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' ] - if self._nccl_version >= 2100: - dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -45,5 +43,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 2251081e8310e..b05481191533e 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -28,7 +28,6 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core -from paddle_bfloat import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -82,9 +81,6 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) - elif dtype == "bfloat16": - # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace - return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -177,15 +173,6 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() - # NOTE: this is a hack to get int format nccl version, like 2134 - # if current platform is not linux, version number will be 0 - nccl_version_str = subprocess.check_output( - r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", - stderr=subprocess.DEVNULL, - shell=True).decode('utf-8') - self._nccl_version = int("".join( - nccl_version_str.split("."))) if nccl_version_str else 0 - def tearDown(self): self.temp_dir.cleanup() @@ -324,10 +311,6 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) - # cast bfloat16 to float32 for numeric comparison - if dtype == "bfloat16": - input1 = input1.astype("float32") - input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -344,13 +327,7 @@ def check_with_place(self, np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - # bfloat16 precision loss comes from truncating the last 16 bits of float32, - # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 - if dtype == "bfloat16": - rtol = 8e-03 - else: - rtol = 1e-05 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) + np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -361,28 +338,18 @@ def check_with_place(self, need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - if dtype == "bfloat16": - rtol = 8e-03 - else: - rtol = 1e-05 - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) elif col_type == "allreduce": need_result = input1 + input2 - if dtype == "bfloat16": - rtol = 8e-03 - atol = 8e-03 - else: - rtol = 1e-05 - atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=rtol, - atol=atol) + rtol=1e-05, + atol=1e-05) np.testing.assert_allclose(tr1_out[0], need_result, - 
rtol=rtol, - atol=atol) + rtol=1e-05, + atol=1e-05) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) From 35eec2bb4104b878a11877da156df28f71dd85e2 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Fri, 9 Sep 2022 10:15:05 +0800 Subject: [PATCH 5/7] refactor(python/distributed/collective): remove useless version macro --- paddle/fluid/distributed/collective/ProcessGroupNCCL.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 90917229f3cc2..75f061f693b9b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -996,11 +996,9 @@ void* GetPointerByOffset(void* raw_pointer, } else if (type == experimental::DataType::BOOL) { return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); -#if NCCL_VERSION_CODE >= 21000 } else if (type == experimental::DataType::BFLOAT16) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + + return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); -#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); From 74d862a38c71804dce366caf1256b8705a1d3f71 Mon Sep 17 00:00:00 2001 From: Wen Sun Date: Sun, 9 Oct 2022 19:00:24 +0800 Subject: [PATCH 6/7] revert(python/distributed/collective): recover temporary bfloat16 tests --- .../collective_allgather_api_dygraph.py | 16 +- .../collective_allreduce_api_dygraph.py | 14 +- .../collective_alltoall_api_dygraph.py | 17 +- .../collective_alltoall_single_api_dygraph.py | 16 +- .../collective_broadcast_api_dygraph.py | 14 +- .../collective_isend_irecv_api_dygraph.py | 24 +- .../collective_reduce_api_dygraph.py | 14 +- .../collective_reduce_scatter_api_dygraph.py | 17 +- .../collective_scatter_api_dygraph.py | 30 ++- .../collective_sendrecv_api_dygraph.py | 21 +- .../test_collective_allgather_api.py | 244 +++--------------- .../test_collective_allreduce_api.py | 12 +- .../test_collective_alltoall_api.py | 8 +- .../test_collective_alltoall_single_api.py | 8 +- .../test_collective_broadcast_api.py | 12 +- .../test_collective_isend_irecv_api.py | 8 +- .../collective/test_collective_reduce_api.py | 12 +- .../test_collective_reduce_scatter_api.py | 8 +- .../collective/test_collective_scatter_api.py | 12 +- .../test_collective_sendrecv_api.py | 8 +- .../unittests/test_collective_api_base.py | 47 +++- 21 files changed, 267 insertions(+), 295 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index 2491297a7e1c3..4d5f82e288220 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
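One detail from the tolerance logic that the hunks above strip out (and that this revert brings back along with the rest of the bfloat16 machinery): casting float32 to bfloat16 truncates the low 16 mantissa bits, and the removed comment bounds the resulting relative error by \sum_{i=-23}^{-8}2^{i}, about 0.0078, hence the relaxed rtol of 8e-03 for bfloat16 reduce/allreduce comparisons. A quick check of that arithmetic:

    # Worked check of the 8e-03 bfloat16 tolerance: sum the weights of the
    # 16 truncated mantissa bits, 2^-23 up through 2^-8.
    bound = sum(2.0**i for i in range(-23, -7))  # i = -23, ..., -8
    print(bound)  # ~0.0078, rounded up to rtol = 8e-03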
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,10 +25,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 933e9e9838ed4..9bdbaa18177e1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index 4515f12b35a1b..eb19cadb11426 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,11 +25,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 8a1492b779b62..f66b3a74bfd21 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -24,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index 7357af6693549..9004d27d56183 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 0a034b6e629d0..37a38b218c5dc 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,13 +25,23 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index c2489bbcfcfa6..5e9dfc8265ea1 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,9 +25,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index c5c07fe307a75..c9df2459a78e0 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,10 +25,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index 5647a4c5b9255..8f27f84a32d52 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. 
import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,15 +25,27 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index 4b3e8221f0797..b4bf24ffbfaa9 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import unittest import test_collective_api_base as test_base @@ -25,12 +25,21 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index af4e6c10baaf9..9040564ce1206 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -26,213 +26,55 @@ def _setup_config(self): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - 
dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", 
"float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index c0bd54a6fad7a..a5080f78bcee2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -41,9 +41,11 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -53,8 +55,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -65,5 +67,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py 
b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index a042507ede1d4..1edb06ae512d6 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -30,9 +30,11 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -41,5 +43,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index 2f18903068edb..e3ef3f302f33e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -23,9 +23,11 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -34,5 +36,5 @@ def test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index f0c7682805247..8f4e747b622eb 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -35,9 +35,11 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -47,8 +49,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -59,5 +61,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index 333da7e6807aa..2b0727cae0c8e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ 
b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -23,9 +23,11 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -34,5 +36,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index ccaf61472fe8a..35bff97f91619 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -38,9 +38,11 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -50,8 +52,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -62,5 +64,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index d490a8bbce5df..669478f58a37d 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -23,9 +23,11 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -34,5 +36,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index d5e8e7cc62e16..ab7de7975feed 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -34,9 +34,11 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + 
"float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -46,8 +48,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -58,5 +60,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index ee8ada3d22be6..3db6df5d46e19 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -32,9 +32,11 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -43,5 +45,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index b05481191533e..2251081e8310e 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -28,6 +28,7 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core +from paddle_bfloat import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -81,6 +82,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -173,6 +177,15 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() @@ -311,6 +324,10 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), 
@@ -311,6 +324,10 @@ def check_with_place(self,
                                   model_file,
                                   required_envs)
         input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0)
         input2 = create_test_data(shape=(10, 1000),
                                   dtype=dtype,
                                   seed=pid1)
+        # cast bfloat16 to float32 for numeric comparison
+        if dtype == "bfloat16":
+            input1 = input1.astype("float32")
+            input2 = input2.astype("float32")
         if col_type == "allgather":
             need_result = np.vstack((input1, input2))
             tr_out0 = np.vstack((tr0_out[0], tr0_out[1]))
@@ -327,7 +344,13 @@ def check_with_place(self,
             np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05)
         elif col_type == "reduce":
             need_result = input1 + input2
-            np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05)
+            # bfloat16 precision loss comes from truncating the low 16 bits of
+            # the float32 mantissa; the discarded bits sum to at most
+            # \sum_{i=-23}^{-8} 2^{i} = 2^{-7} - 2^{-23}, about 0.0078
+            if dtype == "bfloat16":
+                rtol = 8e-03
+            else:
+                rtol = 1e-05
+            np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol)
         elif col_type == "scatter":
             need_result = input2
             need_result1 = need_result[0:need_result.shape[0] // 2]
@@ -338,18 +361,28 @@ def check_with_place(self,
             need_result = input1 + input2
             need_result1 = need_result[0:need_result.shape[0] // 2]
             need_result2 = need_result[need_result.shape[0] // 2:]
-            np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05)
-            np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05)
+            if dtype == "bfloat16":
+                rtol = 8e-03
+            else:
+                rtol = 1e-05
+            np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol)
+            np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol)
         elif col_type == "allreduce":
             need_result = input1 + input2
+            if dtype == "bfloat16":
+                rtol = 8e-03
+                atol = 8e-03
+            else:
+                rtol = 1e-05
+                atol = 1e-05
             np.testing.assert_allclose(tr0_out[0],
                                        need_result,
-                                       rtol=1e-05,
-                                       atol=1e-05)
+                                       rtol=rtol,
+                                       atol=atol)
             np.testing.assert_allclose(tr1_out[0],
                                        need_result,
-                                       rtol=1e-05,
-                                       atol=1e-05)
+                                       rtol=rtol,
+                                       atol=atol)
         elif col_type == "parallel_embedding":
             result_data = tr0_out[0]
             np.random.seed(2020)
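The 8e-03 tolerances above can be sanity-checked directly: the worst case for a float32-to-bfloat16 truncation is every discarded mantissa bit being set, so the relative error is bounded by 2^{-7} - 2^{-23}, just under 0.0078. A one-liner to confirm:

    bound = sum(2.0**i for i in range(-23, -7))  # discarded bits 2^-23 ... 2^-8
    print(bound)  # 0.007812380790710449, comfortably below rtol = 8e-03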
Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None,