Skip to content

Commit 26eaecc

Browse files
committed
[DP][ray] Support different VLLM_RAY_DP_PACK_STRATEGY
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
1 parent b5ee1e3 commit 26eaecc

File tree

2 files changed

+82
-39
lines changed

2 files changed

+82
-39
lines changed

vllm/envs.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@
123123
VLLM_DP_MASTER_PORT: int = 0
124124
VLLM_MOE_DP_CHUNK_SIZE: int = 256
125125
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
126+
VLLM_RAY_DP_PACK_STRATEGY: str = None
126127
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
127128
VLLM_MXFP4_USE_MARLIN: Optional[bool] = None
128129
VLLM_V0_USE_OUTLINES_CACHE: bool = False
@@ -913,6 +914,17 @@ def get_vllm_port() -> Optional[int]:
913914
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
914915
lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
915916

917+
# Strategy used to pack data-parallel ranks onto nodes when using the Ray backend.
918+
# Available options:
919+
# - "fill":
920+
# for DP master node, allocate exactly data-parallel-size-local DP ranks,
921+
# for non-master nodes, allocate as many DP ranks as can fit;
922+
# - "strict":
923+
# allocate exactly data-parallel-size-local DP ranks to each picked node;
924+
# This environment variable is ignored if data-parallel-backend is not Ray.
925+
"VLLM_RAY_DP_PACK_STRATEGY":
926+
lambda: os.getenv("VLLM_RAY_DP_PACK_STRATEGY", "fill"),
927+
916928
# Whether to use S3 path for model loading in CI via RunAI Streamer
917929
"VLLM_CI_USE_S3":
918930
lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",

vllm/v1/engine/utils.py

Lines changed: 70 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import msgspec
1616
import zmq
1717

18+
from vllm import envs
1819
from vllm.config import CacheConfig, ParallelConfig, VllmConfig
1920
from vllm.logger import init_logger
2021
from vllm.platforms import current_platform
@@ -319,8 +320,8 @@ def create_dp_placement_groups(
319320
logger.info("Creating placement groups for data parallel")
320321
dp_master_ip = \
321322
vllm_config.parallel_config.data_parallel_master_ip
322-
num_pg_to_create = vllm_config.parallel_config.data_parallel_size
323-
local_engine_count = \
323+
dp_size = vllm_config.parallel_config.data_parallel_size
324+
dp_size_local = \
324325
vllm_config.parallel_config.data_parallel_size_local
325326

326327
available_resources = available_resources_per_node()
@@ -334,50 +335,80 @@ def create_dp_placement_groups(
334335
"No nodes with resources found in Ray cluster.")
335336
assert dp_master_ip_key in nodes[0], (
336337
"The DP master node (ip: %s) is missing or dead", dp_master_ip)
338+
339+
if envs.VLLM_RAY_DP_PACK_STRATEGY == "strict":
340+
logger.info(
341+
"Using strict local size packing strategy based "
342+
"on VLLM_RAY_DP_PACK_STRATEGY (%s)",
343+
envs.VLLM_RAY_DP_PACK_STRATEGY)
344+
strict_local_size = True
345+
elif (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
346+
or envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency"):
347+
logger.info(
348+
"Using strict local size packing strategy based "
349+
"on VLLM_ALL2ALL_BACKEND (%s)", envs.VLLM_ALL2ALL_BACKEND)
350+
strict_local_size = True
351+
else:
352+
logger.info(
353+
"Using fill packing strategy based "
354+
"on VLLM_RAY_DP_PACK_STRATEGY (%s)",
355+
envs.VLLM_RAY_DP_PACK_STRATEGY)
356+
strict_local_size = False
357+
337358
for node_resources in nodes:
338359
if "GPU" not in node_resources:
339360
continue
340361
# For now, each DP rank can only be assigned to one node
341362
# TODO(rui): support allocating a single DP rank
342363
# to multiple nodes
343-
available_engine_count = int(node_resources["GPU"]) // world_size
344-
if dp_master_ip_key in node_resources:
345-
assert available_engine_count >= local_engine_count, (
346-
"Not enough resources to allocate DP ranks "
347-
f"on DP master node {dp_master_ip}")
348-
for i in range(local_engine_count):
349-
bundles = [{
350-
"GPU": 1.0,
351-
"node:" + dp_master_ip: 0.001
352-
}] * world_size + [{
353-
"CPU": 1.0
354-
}]
355-
pg = ray.util.placement_group(
356-
name=f"dp_rank_{len(placement_groups)}",
357-
strategy="STRICT_PACK",
358-
bundles=bundles,
359-
)
360-
placement_groups.append(pg)
361-
local_dp_ranks.append(i)
364+
node_ip_keys = [
365+
key for key in node_resources if key.startswith('node:')
366+
]
367+
assert len(node_ip_keys) == 1, (
368+
"Zero or multiple node IP keys found in node resources: %s",
369+
node_ip_keys)
370+
node_ip_key = node_ip_keys[0]
371+
node_ip = node_ip_key.split(":")[1]
372+
dp_size_available = int(node_resources["GPU"]) // world_size
373+
if strict_local_size:
374+
if dp_size_available < dp_size_local:
375+
if node_ip == dp_master_ip:
376+
raise ValueError(
377+
"Not enough resources to allocate DP ranks "
378+
f"on DP master node {dp_master_ip}")
379+
else:
380+
logger.info(
381+
"Skipping node %s as %s DP ranks could not fit, "
382+
"possible to fit %s DP ranks", node_ip,
383+
dp_size_local, dp_size_available)
384+
continue
385+
dp_size_to_allocate = dp_size_local
386+
elif node_ip == dp_master_ip:
387+
dp_size_to_allocate = dp_size_local
362388
else:
363-
for i in range(available_engine_count):
364-
if len(placement_groups) == num_pg_to_create:
365-
break
366-
bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}]
367-
pg = ray.util.placement_group(
368-
name=f"dp_rank_{len(placement_groups)}",
369-
strategy="STRICT_PACK",
370-
bundles=bundles,
371-
)
372-
placement_groups.append(pg)
373-
local_dp_ranks.append(i)
374-
if len(placement_groups) < num_pg_to_create:
375-
raise ValueError(
376-
f"Not enough resources to allocate {num_pg_to_create} "
377-
"placement groups, only created "
378-
f"{len(placement_groups)} placement groups. "
379-
"Available resources: "
380-
f"{available_resources}")
389+
dp_size_to_allocate = dp_size_available
390+
391+
for i in range(dp_size_to_allocate):
392+
bundles = [{
393+
"GPU": 1.0,
394+
"node:" + node_ip: 0.001
395+
}] * world_size + [{
396+
"CPU": 1.0
397+
}]
398+
pg = ray.util.placement_group(
399+
name=f"dp_rank_{len(placement_groups)}",
400+
strategy="STRICT_PACK",
401+
bundles=bundles,
402+
)
403+
placement_groups.append(pg)
404+
local_dp_ranks.append(i)
405+
406+
if len(placement_groups) < dp_size:
407+
raise ValueError(f"Not enough resources to allocate {dp_size} "
408+
"placement groups, only created "
409+
f"{len(placement_groups)} placement groups. "
410+
"Available resources: "
411+
f"{available_resources}")
381412
return placement_groups, local_dp_ranks
382413

383414
@staticmethod

0 commit comments

Comments (0)