Skip to content

Commit

Permalink
[aDAG] Allow custom NCCL group for aDAG (ray-project#47141)
Browse files Browse the repository at this point in the history
Allow custom NCCL group for aDAG so that we can reuse what the user already created.

Marking NcclGroupInterface as DeveloperAPI for now. After validation by using it in vLLM we can change to alpha stability.

vLLM prototype: vllm-project/vllm#7568

Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
  • Loading branch information
ruisearch42 authored and ujjawal-khare committed Oct 15, 2024
1 parent 1e4e4d0 commit 4792e1d
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 6 deletions.
4 changes: 4 additions & 0 deletions python/ray/dag/compiled_dag_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import traceback

from ray.experimental.channel.cached_channel import CachedChannel
from ray.experimental.channel.gpu_communicator import GPUCommunicator
import ray
from ray.exceptions import RayTaskError, RayChannelError
from ray.experimental.compiled_dag_ref import (
Expand Down Expand Up @@ -711,6 +712,9 @@ def __init__(
# Mapping from the actor handle to the node ID that the actor is on.
self.actor_to_node_id: Dict["ray.actor.ActorHandle", str] = {}

# Type hints specified by the user for DAG (intermediate) outputs.
self._type_hints = []

# This is set to true when type hint of `transport="nccl"`` is used
self._use_default_nccl_group = False
# This is set to the specified custom nccl group
Expand Down
11 changes: 5 additions & 6 deletions python/ray/experimental/channel/torch_tensor_nccl_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,8 @@ def _get_ranks(
actors: List[ray.actor.ActorHandle], custom_nccl_group: Optional[GPUCommunicator]
) -> List[int]:
"""
Get ranks for the NCCL group to use. If custom_nccl_group is specified,
return the ranks of the actors in the custom NCCL group, in the same
order of the actors; otherwise, return list(range(len(actors))).
Get sorted ranks for the NCCL group to use. If custom_nccl_group is specified,
return all ranks from it, otherwise, return list(range(len(actors))).
Args:
actors: A list of actors that participate in the NCCL group.
Expand All @@ -495,18 +494,18 @@ def _get_ranks(
"The world size of the custom NCCL group does not match the number "
"of actors."
)
ranks = []
ranks = set()
for actor in actors:
rank = custom_nccl_group.get_rank(actor)
assert rank not in ranks, "Duplicate rank in custom NCCL group"
ranks.append(rank)
ranks.add(rank)
assert custom_nccl_group.get_world_size() == len(actors), (
"The world size of the custom NCCL group "
f"({custom_nccl_group.get_world_size()}) "
"does not match the number of actors "
f"({len(actors)})."
)
return ranks
return sorted(ranks)


def _init_nccl_group(
Expand Down

0 comments on commit 4792e1d

Please sign in to comment.