From 9c751be61cf3f566b3cf3e6d588a841d5c422aab Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Tue, 10 Sep 2024 23:59:03 -0700 Subject: [PATCH] [aDAG] Fix ranks ordering for custom NCCL group (#47594) The ranks should be in the order of the actors. Signed-off-by: ujjawal-khare --- .../experimental/channel/torch_tensor_nccl_channel.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/ray/experimental/channel/torch_tensor_nccl_channel.py b/python/ray/experimental/channel/torch_tensor_nccl_channel.py index da7448137c8a..c97f9913838e 100644 --- a/python/ray/experimental/channel/torch_tensor_nccl_channel.py +++ b/python/ray/experimental/channel/torch_tensor_nccl_channel.py @@ -480,8 +480,9 @@ def _get_ranks( actors: List[ray.actor.ActorHandle], custom_nccl_group: Optional[GPUCommunicator] ) -> List[int]: """ - Get sorted ranks for the NCCL group to use. If custom_nccl_group is specified, - return all ranks from it, otherwise, return list(range(len(actors))). + Get ranks for the NCCL group to use. If custom_nccl_group is specified, + return the ranks of the actors in the custom NCCL group, in the same + order of the actors; otherwise, return list(range(len(actors))). Args: actors: A list of actors that participate in the NCCL group. @@ -494,18 +495,18 @@ def _get_ranks( "The world size of the custom NCCL group does not match the number " "of actors." ) - ranks = set() + ranks = [] for actor in actors: rank = custom_nccl_group.get_rank(actor) assert rank not in ranks, "Duplicate rank in custom NCCL group" - ranks.add(rank) + ranks.append(rank) assert custom_nccl_group.get_world_size() == len(actors), ( "The world size of the custom NCCL group " f"({custom_nccl_group.get_world_size()}) " "does not match the number of actors " f"({len(actors)})." ) - return sorted(ranks) + return ranks def _init_nccl_group(