
Commit bb9d305

LucasWilkinson, SageMoore, yewentao256, and tlrmchlsmth authored and committed
[Core/DBO][2/N] Dual-Batch Overlap add DeepEP High Throughput support and Prefill support (vllm-project#24845)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
1 parent c89a532 commit bb9d305
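
For orientation, here is a minimal sketch of how the pieces added in this commit fit together from the user side. It is illustrative only: the model name is a placeholder, DBO still requires a DeepEP install and expert parallelism, and the threshold values simply restate the new defaults.

import os

# The DBO check in vllm/config/__init__.py now also accepts the
# high-throughput DeepEP backend in addition to deepep_low_latency.
os.environ["VLLM_ALL2ALL_BACKEND"] = "deepep_high_throughput"

from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="org/some-moe-model",        # placeholder model name
    enable_expert_parallel=True,
    enable_dbo=True,
    dbo_decode_token_threshold=32,     # decode-only batches
    dbo_prefill_token_threshold=512,   # batches containing prefills (new here)
)
config = args.create_engine_config()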

File tree: 19 files changed (+604, -238 lines)


tests/v1/attention/test_attention_splitting.py

Lines changed: 82 additions & 1 deletion
@@ -5,11 +5,12 @@
 import torch
 
 from tests.v1.attention.test_attention_backends import BATCH_SPECS
-from tests.v1.attention.utils import create_common_attn_metadata
+from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm.v1.attention.backends.utils import (UBatchSlice,
                                               _make_metadata_with_slice,
                                               slice_query_start_locs,
                                               split_attn_metadata)
+from vllm.v1.worker.ubatch_utils import create_ubatch_slices
 
 
 @pytest.fixture
@@ -155,3 +156,83 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata):
     assert results[1].num_reqs == mid_point
     assert results[1].num_actual_tokens == mid_point
     assert torch.equal(results[1].seq_lens, torch.tensor([2048] * mid_point))
+
+
+@pytest.mark.parametrize(
+    "seq_lens,query_lens,split_point,expected_first_reqs,expected_second_reqs",
+    [
+        # Split in the middle of request 1
+        ([32, 40], [8, 8], 12, 2, 1),
+        # Split inside the first request
+        ([32, 40], [8, 8], 4, 1, 2),
+    ],
+)
+def test_prefill_split_across_ubatches(seq_lens, query_lens, split_point,
+                                       expected_first_reqs,
+                                       expected_second_reqs):
+    """Test splitting a prefill across ubatches"""
+    import numpy as np
+
+    device = torch.device("cpu")
+    batch_spec = BatchSpec(seq_lens=seq_lens, query_lens=query_lens)
+    common = create_common_attn_metadata(batch_spec,
+                                         block_size=16,
+                                         device=device)
+
+    num_scheduled_tokens = np.array(query_lens, dtype=np.int32)
+    qsl_np = common.query_start_loc_cpu.numpy()
+    num_tokens = common.num_actual_tokens
+
+    ubatch_slices = create_ubatch_slices(num_scheduled_tokens, split_point)
+    assert len(ubatch_slices) == 2
+
+    first_meta = _make_metadata_with_slice(ubatch_slices[0], common)
+    second_meta = _make_metadata_with_slice(ubatch_slices[1], common)
+
+    # Token counts match the split
+    assert first_meta.num_actual_tokens == split_point
+    assert second_meta.num_actual_tokens == num_tokens - split_point
+
+    # Number of requests per ubatch
+    assert first_meta.num_reqs == expected_first_reqs
+    assert second_meta.num_reqs == expected_second_reqs
+
+    # Identify which request is split and how many tokens are in the first chunk
+    split_req_idx = int(np.searchsorted(qsl_np, split_point, side="right") - 1)
+    tokens_in_first_chunk = split_point - int(qsl_np[split_req_idx])
+    orig_q_lens = (common.query_start_loc_cpu[1:] -
+                   common.query_start_loc_cpu[:-1])
+
+    # Check query length continuity: first-chunk + second-chunk == original qlen
+    # First ubatch last request query length
+    qlen_first_last = int(first_meta.query_start_loc_cpu[-1] -
+                          first_meta.query_start_loc_cpu[-2])
+    # Second ubatch first request query length
+    qlen_second_first = int(second_meta.query_start_loc_cpu[1] -
+                            second_meta.query_start_loc_cpu[0])
+    assert qlen_first_last == tokens_in_first_chunk
+    assert qlen_first_last + qlen_second_first == int(
+        orig_q_lens[split_req_idx])
+
+    # Check seq_lens adjustments
+    # Context lengths per original request
+    context_lens = [s - q for s, q in zip(seq_lens, query_lens)]
+
+    # First ubatch: last request's seq_len should be
+    # context + tokens_in_first_chunk
+    expected_seqlen = context_lens[split_req_idx] + tokens_in_first_chunk
+    assert int(first_meta.seq_lens[-1]) == expected_seqlen
+
+    # For full preceding requests in first ubatch, seq_lens should match
+    # originals
+    for i in range(first_meta.num_reqs - 1):
+        assert int(first_meta.seq_lens[i]) == seq_lens[i]
+
+    # Second ubatch: first request (continuation) seq_len should be full
+    # original
+    assert int(second_meta.seq_lens[0]) == seq_lens[split_req_idx]
+    # Any following full requests in second ubatch should match originals
+    for j in range(1, second_meta.num_reqs):
+        # Map to original request index
+        orig_idx = split_req_idx + j
+        assert int(second_meta.seq_lens[j]) == seq_lens[orig_idx]
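
For intuition, the arithmetic behind the first parametrization ([32, 40], [8, 8], 12, 2, 1), worked by hand with plain numpy (values taken from the test above):

import numpy as np

seq_lens, query_lens, split_point = [32, 40], [8, 8], 12
query_start_loc = np.concatenate(([0], np.cumsum(query_lens)))  # [0, 8, 16]

# Request 1 straddles the split: 4 of its 8 query tokens land in ubatch 0.
split_req_idx = int(np.searchsorted(query_start_loc, split_point, side="right") - 1)
tokens_in_first_chunk = split_point - int(query_start_loc[split_req_idx])
assert (split_req_idx, tokens_in_first_chunk) == (1, 4)

# Ubatch 0 holds request 0 in full plus the 4-token chunk (2 requests, 12
# tokens); ubatch 1 holds only the 4-token continuation of request 1.
context_lens = [s - q for s, q in zip(seq_lens, query_lens)]      # [24, 32]
assert context_lens[split_req_idx] + tokens_in_first_chunk == 36  # ubatch 0 last seq_len
assert seq_lens[split_req_idx] == 40                              # ubatch 1 sees the full seq_len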

tests/v1/spec_decode/test_eagle.py

Lines changed: 2 additions & 4 deletions
@@ -532,9 +532,8 @@ def create_deterministic_logits(token_ids):
     # Mock runner for attention metadata building
     proposer.runner = mock.MagicMock()
     proposer.runner.attn_groups.append([mock.MagicMock()])
-    proposer.runner.attn_groups[0][0].metadata_builders = [
+    proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
         attn_metadata_builder
-    ]
 
     result = proposer.propose(target_token_ids=target_token_ids,
                               target_positions=target_positions,
@@ -659,9 +658,8 @@ def create_deterministic_logits(token_ids, k: int):
     # Mock runner for attention metadata building.
     proposer.runner = mock.MagicMock()
     proposer.runner.attn_groups.append([mock.MagicMock()])
-    proposer.runner.attn_groups[0][0].metadata_builders = [
+    proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
         attn_metadata_builder
-    ]
 
     # Setup inputs for the proposer.
     target_token_ids = torch.randint(0,

vllm/config/__init__.py

Lines changed: 7 additions & 5 deletions
@@ -638,11 +638,13 @@ def __post_init__(self):
 
         if self.parallel_config.enable_dbo:
             a2a_backend = envs.VLLM_ALL2ALL_BACKEND
-            assert a2a_backend == "deepep_low_latency", \
-                "Microbatching currently only supports the deepep_low_latency "\
-                f"all2all backend. {a2a_backend} is not supported. To fix set "\
-                "the VLLM_ALL2ALL_BACKEND environment variable to "\
-                "deepep_low_latency and install the DeepEP kerenls."
+            assert a2a_backend in \
+                ["deepep_low_latency", "deepep_high_throughput"], \
+                "Microbatching currently only supports the deepep_low_latency and "\
+                f"deepep_high_throughput all2all backend. {a2a_backend} is not "\
+                "supported. To fix set the VLLM_ALL2ALL_BACKEND environment "\
+                "variable to deepep_low_latency or deepep_high_throughput and "\
+                "install the DeepEP kernels."
 
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]

vllm/config/parallel.py

Lines changed: 10 additions & 4 deletions
@@ -139,12 +139,18 @@ class ParallelConfig:
     """Disable the custom all-reduce kernel and fall back to NCCL."""
 
     enable_dbo: bool = False
-    """Enable microbatching for the model executor."""
+    """Enable dual batch overlap for the model executor."""
 
     dbo_decode_token_threshold: int = 32
-    """The threshold for microbatching. If the number of tokens in the
-    request is greater than this threshold, microbatching will be used.
-    Otherwise, the request will be processed in a single batch."""
+    """The threshold for dual batch overlap for batches only containing decodes.
+    If the number of tokens in the request is greater than this threshold,
+    microbatching will be used. Otherwise, the request will be processed in a
+    single batch."""
+    dbo_prefill_token_threshold: int = 512  # TODO(lucas): tune
+    """The threshold for dual batch overlap for batches that contain one or more
+    prefills. If the number of tokens in the request is greater than this
+    threshold, microbatching will be used. Otherwise, the request will be
+    processed in a single batch."""
 
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""

vllm/distributed/device_communicators/all2all.py

Lines changed: 19 additions & 9 deletions
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any
+from typing import Any, Optional
 
 import torch
 import torch.distributed as dist
 
+import vllm.envs as envs
 from vllm.distributed import get_dp_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
@@ -200,12 +201,12 @@ def __init__(self, cpu_group):
 
     def _make_all2all_kwargs(self) -> dict[Any, Any]:
         # Defaults for internode and intranode are taken from DeepEP tests.
-        num_nvl_bytes = 1024 * 1024 * 1024
+        num_nvl_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
         num_rdma_bytes = None
         num_qps_per_rank = None
 
         if self.internode:
-            num_rdma_bytes = 1024 * 1024 * 1024
+            num_rdma_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
             num_qps_per_rank = self.num_sms // 2
         else:
             num_rdma_bytes = 0
@@ -230,13 +231,18 @@ def get_handle(self, kwargs):
         logger.debug("DeepEP all2all args %s", buffer_kwargs)
         handle: deep_ep.Buffer = self.handle_cache.get_or_create(
             buffer_kwargs, deep_ep.Buffer)
-        # It is dangerous to set num sms outside this function. num_sms is not
-        # a part of the hash-key that identifies this object. If we are in a
-        # situation where we make objects with different num_sms, the hash key
-        # in get_or_create must be updated.
-        handle.set_num_sms(self.num_sms)
         return handle
 
+    def set_num_sms(self, num_sms: int):
+        import deep_ep
+
+        # Right now the buffers are sized for only what the kernels were
+        # created with. So we can only reduce the number of SMS used
+        # but not increase it.
+        if num_sms > self.num_sms:
+            num_sms = self.num_sms
+        deep_ep.Buffer.set_num_sms(num_sms)
+
 
 class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
     """
@@ -265,7 +271,7 @@ def _make_all2all_kwargs(
         import deep_ep
 
         # Defaults for internode and intranode are taken from DeepEP tests.
-        num_nvl_bytes = 1024 * 1024 * 1024
+        num_nvl_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
         num_qps_per_rank = num_local_experts
         num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
             num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
@@ -291,3 +297,7 @@ def get_handle(self, kwargs):
         handle: deep_ep.Buffer = self.handle_cache.get_or_create(
             buffer_kwargs, deep_ep.Buffer)
         return handle
+
+    # DeepEP LL uses RDMA so no SMs are used for communication
+    def max_sms_used(self) -> Optional[int]:
+        return 0
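
A standalone sketch of the two sizing rules introduced above; the helper names are invented for illustration and deep_ep itself is not touched:

DEFAULT_DEEPEP_BUFFER_SIZE_MB = 1024  # default of envs.VLLM_DEEPEP_BUFFER_SIZE_MB

def deepep_buffer_bytes(buffer_size_mb: int = DEFAULT_DEEPEP_BUFFER_SIZE_MB) -> int:
    # Same arithmetic as _make_all2all_kwargs: MB -> bytes. The default keeps
    # the previously hard-coded 1 GiB (1024 * 1024 * 1024).
    return buffer_size_mb * 1024 * 1024

def clamp_num_sms(requested: int, created_with: int) -> int:
    # Mirrors the set_num_sms clamp above: buffers were sized for the SM count
    # the kernels were created with, so a request can only shrink it.
    return min(requested, created_with)

assert deepep_buffer_bytes() == 1024 * 1024 * 1024
assert clamp_num_sms(8, 20) == 8    # reducing the SM budget is allowed
assert clamp_num_sms(64, 20) == 20  # requests above the creation-time count are clamped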

vllm/distributed/device_communicators/base_device_communicator.py

Lines changed: 6 additions & 0 deletions
@@ -60,6 +60,12 @@ def get_handle(self, kwargs):
         # and reuse it for the same config.
         raise NotImplementedError
 
+    def set_num_sms(self, num_sms: int):
+        pass
+
+    def max_sms_used(self) -> Optional[int]:
+        return None  # None means it could use the whole GPU
+
     def dispatch(self, hidden_states: torch.Tensor,
                  router_logits: torch.Tensor):
         raise NotImplementedError
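
One plausible reading of the new hooks' contract, sketched standalone (how the runner actually combines them is outside this diff, so the helper below is hypothetical):

from typing import Optional

def effective_comm_sms(requested: int, backend_cap: Optional[int]) -> int:
    # max_sms_used() returning None means the backend may use the whole GPU,
    # so only the requested budget applies; otherwise the backend cap wins
    # (0 for the RDMA-only DeepEP low-latency path above).
    return requested if backend_cap is None else min(requested, backend_cap)

assert effective_comm_sms(20, None) == 20  # base-class default
assert effective_comm_sms(20, 0) == 0      # DeepEP LL: no SMs spent on communication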

vllm/engine/arg_utils.py

Lines changed: 6 additions & 0 deletions
@@ -330,6 +330,8 @@ class EngineArgs:
     enable_dbo: bool = ParallelConfig.enable_dbo
     dbo_decode_token_threshold: int = \
         ParallelConfig.dbo_decode_token_threshold
+    dbo_prefill_token_threshold: int = \
+        ParallelConfig.dbo_prefill_token_threshold
     eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
     enable_eplb: bool = ParallelConfig.enable_eplb
     expert_placement_strategy: ExpertPlacementStrategy = \
@@ -698,6 +700,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parallel_group.add_argument(
             "--dbo-decode-token-threshold",
             **parallel_kwargs["dbo_decode_token_threshold"])
+        parallel_group.add_argument(
+            "--dbo-prefill-token-threshold",
+            **parallel_kwargs["dbo_prefill_token_threshold"])
         parallel_group.add_argument("--enable-eplb",
                                     **parallel_kwargs["enable_eplb"])
         parallel_group.add_argument("--eplb-config",
@@ -1316,6 +1321,7 @@ def create_engine_config(
             enable_expert_parallel=self.enable_expert_parallel,
             enable_dbo=self.enable_dbo,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
+            dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
             enable_eplb=self.enable_eplb,
             eplb_config=self.eplb_config,
             expert_placement_strategy=self.expert_placement_strategy,

vllm/envs.py

Lines changed: 11 additions & 0 deletions
@@ -189,6 +189,8 @@
     VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
     VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES: bool = True
     VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER"
+    VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024
+    VLLM_DBO_COMM_SMS: int = 20
     GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = []
     VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None
 
@@ -1392,6 +1394,15 @@ def get_vllm_port() -> Optional[int]:
     lambda: os.getenv("VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME",
                       "VLLM_OBJECT_STORAGE_SHM_BUFFER"),
 
+    # The size in MB of the buffers (NVL and RDMA) used by DeepEP
+    "VLLM_DEEPEP_BUFFER_SIZE_MB":
+    lambda: int(os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")),
+
+    # The number of SMs to allocate for communication kernels when running DBO
+    # the rest of the SMs on the device will be allocated to compute
+    "VLLM_DBO_COMM_SMS":
+    lambda: int(os.getenv("VLLM_DBO_COMM_SMS", "20")),
+
     # Valid values are container,code_interpreter,web_search_preview
     # ex GPT_OSS_SYSTEM_TOOL_MCP_LABELS=container,code_interpreter
     "GPT_OSS_SYSTEM_TOOL_MCP_LABELS":

0 commit comments
