[CB] refactoring spyre model runner (#172)

wallashss · web-flow · commit d2c305bfaec2 · 2025-06-27T14:14:57.000-06:00
This PR does some refactoring, primarily on spyre_model_runner. This change tries to reduce code duplication between static batching and continuous batching. However, the intention of this work will not be complete until a follow-up PR removes the kv cache manager from the spyre model runner. Summary of changes: - Reduce code duplication in the spyre model runner: some methods are common in the `SpyreModelRunner` class, while `StaticBatchingSpyreModelRunner` and `ContinuousBatchingSpyreModelRunner` override a few of them to do their specific logic - Changed the `ContinuousBatchingFmsModel` class to get the attention metadata via the forward context, and changed the model runner to use `with set_forward_context` to pass the attention metadata. This is the way vLLM supports multiple attention backends [[REF](vllm-project/vllm#10558)] - Moved the left pads to the CachedRequestState. - Bugfix: The `execute_model` in the CB model runner was inconsistent with the data of the input batch when it outputs the result in `CBSpyreModelRunnerOutput`. Changed it with prepare_prompt to use the data of the input batch. - Misc: a few renamed variables, more comments, and TODOs --------- Signed-off-by: Wallas Santos <wallashss@ibm.com>
diff --git a/tests/e2e/test_spyre_basic.py b/tests/e2e/test_spyre_basic.py
@@ -128,14 +128,11 @@ def test_output_sendnn_decoder(
 @pytest.mark.parametrize("cb",
                          [pytest.param(1, marks=pytest.mark.cb, id="cb"), 0])
 def test_batch_handling(model: str, backend: str, cb: int,
-                        monkeypatch: pytest.MonkeyPatch, runtime_xfail):
+                        monkeypatch: pytest.MonkeyPatch):
     """Test that the spyre worker correctly handles
     continuous batches of requests that
     finish after different numbers of forward passes"""
 
-    if cb == 1:
-        runtime_xfail("Batch handling bug with continuous batching")
-
     prompts = get_chicken_soup_prompts(4)
 
     sampling_params1 = SamplingParams(max_tokens=5,
diff --git a/tests/v1/worker/test_spyre_input_batch.py b/tests/v1/worker/test_spyre_input_batch.py
@@ -163,6 +163,7 @@ def _construct_cached_request_state(req_id_suffix: int):
         sampling_params=_create_sampling_params(),
         generator=None,
         output_token_ids=output_token_ids,
+        left_padding=0,
     )
 
 
diff --git a/vllm_spyre/model_executor/model_loader/spyre.py b/vllm_spyre/model_executor/model_loader/spyre.py
@@ -1,6 +1,7 @@
 """Utilities for selecting and loading Spyre models."""
 import os
-from typing import Any, Optional
+from dataclasses import dataclass
+from typing import Any, Optional, cast
 
 import torch
 import torch._inductor.config
@@ -9,6 +10,7 @@
 from fms.models import get_model
 from transformers import PretrainedConfig
 from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig
+from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
@@ -29,6 +31,14 @@
 logger = init_logger(__name__)
 
 
+@dataclass
+class SpyreAttentionMetadata:
+    slot_mapping: torch.Tensor = None
+    current_tkv_mask: torch.Tensor = None
+    left_padded_prompt_mask: torch.Tensor = None
+    block_table: torch.Tensor = None
+
+
 class SpyreCausalLM(nn.Module):
 
     def __init__(
@@ -73,10 +83,6 @@ def forward(
         positions: torch.Tensor,
         masks: torch.Tensor,
         is_prompt: bool,
-        current_tkv_mask: Optional[torch.Tensor] = None,
-        left_padded_prompt_mask: Optional[torch.Tensor] = None,
-        block_table: Optional[torch.Tensor] = None,
-        slot_mapping: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
         if is_prompt and not envs_spyre.VLLM_SPYRE_USE_CB:
@@ -88,12 +94,6 @@ def forward(
             # cpu impl when padding too much
             extra_kwargs["attn_algorithm"] = "math"
 
-        if envs_spyre.VLLM_SPYRE_USE_CB:
-            extra_kwargs["current_tkv_mask"] = current_tkv_mask
-            extra_kwargs["left_padded_prompt_mask"] = left_padded_prompt_mask
-            extra_kwargs["block_table"] = block_table
-            extra_kwargs["slot_mapping"] = slot_mapping
-
         # normal prefill or decoding step
         logits = self.model(
             input_ids,
@@ -353,32 +353,30 @@ def forward(
         mask: torch.Tensor,
         use_cache: bool,
         only_last_token: bool,
-        current_tkv_mask: torch.Tensor,
-        left_padded_prompt_mask: torch.Tensor,
-        block_table: torch.Tensor,
-        slot_mapping: torch.Tensor,
         **extra_kwargs,
     ) -> torch.Tensor:
 
+        forward_context = get_forward_context()
+
+        attn_metadata = cast(SpyreAttentionMetadata,
+                             forward_context.attn_metadata)
         # import will be not be needed/ handled by FMS soon
         import fms.utils.spyre.paged  # noqa # pylint: disable=unused-import
 
         # specify attention type for continuous batching
         extra_kwargs['attn_name'] = "spyre_paged_attn"
 
-        # additional (paged) attention arguments
-        extra_kwargs['current_tkv_mask'] = current_tkv_mask
-        extra_kwargs['left_padded_prompt_mask'] = left_padded_prompt_mask
-        extra_kwargs['block_table'] = block_table
-        extra_kwargs['slot_mapping'] = slot_mapping
-
         output = self.model(
             input_ids,
             position_ids=position_ids,
             mask=mask,
             past_key_value_states=self.past_key_value_states,
             use_cache=use_cache,
             only_last_token=only_last_token,
+            current_tkv_mask=attn_metadata.current_tkv_mask,
+            left_padded_prompt_mask=attn_metadata.left_padded_prompt_mask,
+            block_table=attn_metadata.block_table,
+            slot_mapping=attn_metadata.slot_mapping,
             **extra_kwargs,
         )
 
diff --git a/vllm_spyre/v1/worker/spyre_input_batch.py b/vllm_spyre/v1/worker/spyre_input_batch.py
@@ -23,6 +23,7 @@ class CachedRequestState:
     generator: Optional[torch.Generator]
 
     output_token_ids: list[int]
+    left_padding: int = 0  # Defaults to 0, i. e. not padding
 
     @property
     def num_tokens(self) -> int:
@@ -565,3 +566,8 @@ def no_allowed_token_ids(self) -> bool:
     @property
     def requests_ids(self) -> list[str]:
         return list(self.req_id_to_index.keys())
+
+    @property
+    def sorted_requests_ids(self) -> list[str]:
+        return sorted(self.req_id_to_index,
+                      key=self.req_id_to_index.get)  # type: ignore
diff --git a/vllm_spyre/v1/worker/spyre_model_runner.py b/vllm_spyre/v1/worker/spyre_model_runner.py
diff --git a/vllm_spyre/v1/worker/spyre_worker.py b/vllm_spyre/v1/worker/spyre_worker.py

Original file line number	Diff line number	Diff line change
`@@ -163,6 +163,7 @@ def _construct_cached_request_state(req_id_suffix: int):`
`163`	`163`	`sampling_params=_create_sampling_params(),`
`164`	`164`	`generator=None,`
`165`	`165`	`output_token_ids=output_token_ids,`
	`166`	`+ left_padding=0,`
`166`	`167`	`)`
`167`	`168`
`168`	`169`