190 | 190 | import functools |
191 | 191 | from abc import abstractmethod |
192 | 192 | from dataclasses import dataclass, field |
| 193 | +from enum import Enum |
193 | 194 | from typing import ClassVar, Generic, TypeVar |
194 | 195 |
|
195 | 196 | import torch |
|
224 | 225 | from vllm.v1.attention.backends.utils import ( |
225 | 226 | AttentionMetadataBuilder, |
226 | 227 | CommonAttentionMetadata, |
227 | | - QueryLenSupport, |
228 | | - ReorderSpec, |
229 | 228 | get_per_layer_parameters, |
230 | 229 | infer_global_hyperparameters, |
231 | 230 | split_decodes_and_prefills, |
232 | 231 | ) |
233 | 232 | from vllm.v1.kv_cache_interface import AttentionSpec |
234 | 233 |
|
| 234 | + |
| 235 | +class QueryLenSupport(Enum): |
| 236 | + """Defines the level of query length support for an attention backend's |
| 237 | + decode pipeline. |
| 238 | +
|
| 239 | + - SINGLE_ONLY: Decode pipeline only supports single-token queries |
| 240 | + (query_len=1) |
| 241 | + - UNIFORM: Decode pipeline supports uniform multi-token queries |
| 242 | + (all requests must have same query_len > 1) |
| 243 | + - VARLEN: Decode pipeline supports variable-length queries |
| 244 | + (mixed query lengths in same batch) |
| 245 | + """ |
| 246 | + |
| 247 | + SINGLE_ONLY = "single_only" |
| 248 | + UNIFORM = "uniform" |
| 249 | + VARLEN = "varlen" |
| 250 | + |
| 251 | + |
235 | 252 | try: |
236 | 253 | from vllm.vllm_flash_attn import flash_attn_varlen_func |
237 | 254 |
|
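For context on how this enum is consumed: when speculative decoding drafts `k` tokens, each decode step verifies `k + 1` tokens per request, so a backend whose decode kernel can batch multi-token queries wants its decode threshold widened to `1 + k`. A minimal, self-contained sketch of that gating (the helper `effective_decode_threshold` and its arguments are illustrative, not part of this diff):

```python
from enum import Enum


class QueryLenSupport(Enum):
    # Mirrors the enum added in this diff.
    SINGLE_ONLY = "single_only"
    UNIFORM = "uniform"
    VARLEN = "varlen"


def effective_decode_threshold(
    support: QueryLenSupport, base_threshold: int, num_spec_tokens: int
) -> int:
    # SINGLE_ONLY decode kernels cannot batch multi-token queries, so
    # speculative queries must fall through to the prefill pipeline.
    if support is QueryLenSupport.SINGLE_ONLY:
        return base_threshold
    # UNIFORM/VARLEN kernels can keep spec-decode queries on the decode
    # path: widen the threshold to cover 1 bonus token + k draft tokens.
    return max(base_threshold, 1 + num_spec_tokens)


assert effective_decode_threshold(QueryLenSupport.SINGLE_ONLY, 1, 4) == 1
assert effective_decode_threshold(QueryLenSupport.UNIFORM, 1, 4) == 5
```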
@@ -465,14 +482,20 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): |
465 | 482 | understand this class |
466 | 483 | """ |
467 | 484 |
|
| 485 | + # Defines the level of query length support for this backend. |
| 486 | + # - SINGLE_ONLY: Only single-token queries (no spec decode support) |
| 487 | + # - UNIFORM: Supports uniform multi-token queries (spec decode with uniform lengths) |
| 488 | + # - VARLEN: Supports variable-length queries (spec decode with mixed lengths) |
| 489 | + # If set to UNIFORM or VARLEN, this will increase `reorder_batch_threshold` when |
| 490 | + # speculative decoding is enabled. |
| 491 | + query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.SINGLE_ONLY |
| 492 | + |
468 | 493 | # The threshold for reordering the batch into decode and prefill requests. |
469 | 494 | # If > 1, the batch will be reordered such that requests with |
470 | 495 | # query length <= threshold are classified as decode requests. |
471 | | - # Use `decode_query_len_support` (above) to set this automatically |
| 496 | + # Use `query_len_support` (above) to set this automatically |
472 | 497 | # when speculative decoding is enabled. |
473 | | - reorder_spec: ClassVar[ReorderSpec] = ReorderSpec( |
474 | | - 1, decode_query_len_support=QueryLenSupport.SINGLE_ONLY |
475 | | - ) |
| 498 | + reorder_batch_threshold: int = 1 |
476 | 499 |
|
477 | 500 | @staticmethod |
478 | 501 | def determine_chunked_prefill_workspace_size(vllm_config: VllmConfig) -> int: |
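A backend opts into spec-decode-as-decode by overriding the class attribute. A hypothetical subclass, using stub class names rather than the real vLLM builders (reuses the `QueryLenSupport` sketch above):

```python
from typing import ClassVar


class DemoBuilderBase:
    """Stub standing in for MLACommonMetadataBuilder."""

    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.SINGLE_ONLY
    reorder_batch_threshold: int = 1


class DemoUniformDecodeBuilder(DemoBuilderBase):
    # This decode pipeline batches uniform multi-token queries, so
    # speculative decoding may raise reorder_batch_threshold above 1
    # at init time.
    query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
```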
@@ -597,19 +620,16 @@ def __init__( |
597 | 620 | device=device, |
598 | 621 | ) |
599 | 622 |
|
600 | | - assert self.reorder_spec.decode_threshold is not None |
601 | | - supports_spec_decode = ( |
602 | | - self.reorder_spec.decode_query_len_support != QueryLenSupport.SINGLE_ONLY |
603 | | - ) |
604 | | - self._init_decode_threshold( |
605 | | - self.reorder_spec.decode_threshold, supports_spec_decode |
| 623 | + supports_spec_decode = self.query_len_support != QueryLenSupport.SINGLE_ONLY |
| 624 | + self._init_reorder_batch_threshold( |
| 625 | + self.reorder_batch_threshold, supports_spec_decode |
606 | 626 | ) |
607 | 627 |
|
608 | | - # Validate consistency between decode_query_len_support and decode_threshold |
609 | | - if self.reorder_spec.decode_query_len_support == QueryLenSupport.SINGLE_ONLY: |
610 | | - assert self.reorder_spec.decode_threshold == 1, ( |
611 | | - f"decode_threshold must be 1 when decode_query_len_support is " |
612 | | - f"SINGLE_ONLY, got {self.reorder_spec.decode_threshold}" |
| 628 | + # Validate consistency between query_len_support and reorder_batch_threshold |
| 629 | + if self.query_len_support == QueryLenSupport.SINGLE_ONLY: |
| 630 | + assert self.reorder_batch_threshold == 1, ( |
| 631 | + f"reorder_batch_threshold must be 1 when query_len_support is " |
| 632 | + f"SINGLE_ONLY, got {self.reorder_batch_threshold}" |
613 | 633 | ) |
614 | 634 |
|
615 | 635 | def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata): |
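`_init_reorder_batch_threshold` lives in the shared builder base in `vllm/v1/attention/backends/utils.py` and is not shown in this diff; the sketch below is an approximation of the widening it plausibly performs when a speculative config is present, not the actual implementation:

```python
def init_reorder_batch_threshold_sketch(
    base_threshold: int,
    supports_spec_decode: bool,
    num_speculative_tokens: int | None,
) -> int:
    """Hedged approximation of the helper's effect on the threshold."""
    # Each spec-decode step carries 1 bonus token plus the drafted tokens.
    # Backends whose decode kernels accept multi-token queries widen the
    # threshold so those requests stay on the decode path.
    if supports_spec_decode and num_speculative_tokens is not None:
        return max(base_threshold, 1 + num_speculative_tokens)
    return base_threshold
```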
@@ -712,14 +732,12 @@ def build_for_cudagraph_capture( |
712 | 732 | Currently, only decode is supported for full cudagraphs with MLA. |
713 | 733 | """ |
714 | 734 | m = common_attn_metadata |
715 | | - assert self.reorder_spec.decode_threshold is not None |
716 | | - assert m.num_reqs <= ( |
717 | | - m.num_actual_tokens * self.reorder_spec.decode_threshold |
718 | | - ), ( |
| 735 | + assert m.num_reqs <= (m.num_actual_tokens * self.reorder_batch_threshold), ( |
719 | 736 | "MLA only supports decode-only full CUDAGraph capture. " |
720 | 737 | "Make sure all cudagraph capture sizes <= max_num_seq." |
721 | 738 | ) |
722 | | - assert m.max_query_len <= self.reorder_spec.decode_threshold # decode only |
| 739 | + |
| 740 | + assert m.max_query_len <= self.reorder_batch_threshold # decode only |
723 | 741 |
|
724 | 742 | return self.build(0, m) |
725 | 743 |
|
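For intuition on the capture-time asserts: with `reorder_batch_threshold = 3` (one bonus token plus two drafted tokens), a decode-only capture of 8 requests carries up to 24 tokens, and it is the `max_query_len` check that actually enforces decode-only. Toy numbers, not taken from this diff:

```python
# Toy values mirroring the decode-only full-CUDAGraph capture asserts.
reorder_batch_threshold = 3  # 1 bonus + 2 speculative tokens
num_reqs = 8
num_actual_tokens = 24       # 8 requests x 3 tokens each
max_query_len = 3

assert num_reqs <= num_actual_tokens * reorder_batch_threshold
assert max_query_len <= reorder_batch_threshold  # decode only
```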
@@ -751,14 +769,11 @@ def build( |
751 | 769 |
|
752 | 770 | num_computed_tokens_cpu = common_attn_metadata.seq_lens_cpu - query_seq_lens_cpu |
753 | 771 |
|
754 | | - assert self.reorder_spec.decode_threshold is not None |
755 | 772 | num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( |
756 | 773 | split_decodes_and_prefills( |
757 | 774 | common_attn_metadata, |
758 | | - decode_threshold=self.reorder_spec.decode_threshold, |
759 | | - require_uniform=( |
760 | | - self.reorder_spec.decode_query_len_support != QueryLenSupport.VARLEN |
761 | | - ), |
| 775 | + decode_threshold=self.reorder_batch_threshold, |
| 776 | + require_uniform=(self.query_len_support != QueryLenSupport.VARLEN), |
762 | 777 | ) |
763 | 778 | ) |
764 | 779 |
|
|
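`split_decodes_and_prefills` (from `vllm/v1/attention/backends/utils.py`) partitions the already-reordered batch by query length. The real helper operates on `CommonAttentionMetadata`; the list-based function below is only a sketch of its contract, assuming decode requests come first after reordering:

```python
def split_decodes_and_prefills_sketch(
    query_lens: list[int], decode_threshold: int, require_uniform: bool
) -> tuple[int, int, int, int]:
    """Return (num_decodes, num_prefills, num_decode_tokens,
    num_prefill_tokens) for a batch already reordered decodes-first."""
    num_decodes = 0
    for qlen in query_lens:
        if qlen > decode_threshold:
            break
        # Non-VARLEN backends require every decode query in the batch to
        # share one length; the first mismatch ends the decode run.
        if require_uniform and num_decodes > 0 and qlen != query_lens[0]:
            break
        num_decodes += 1
    num_decode_tokens = sum(query_lens[:num_decodes])
    num_prefills = len(query_lens) - num_decodes
    num_prefill_tokens = sum(query_lens) - num_decode_tokens
    return num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens


# With threshold 2 and uniform decodes required: [2, 2, 5, 7] splits into
# 2 decodes (4 tokens) and 2 prefills (12 tokens).
assert split_decodes_and_prefills_sketch([2, 2, 5, 7], 2, True) == (2, 2, 4, 12)
```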