vllm/v1/attention/backends (2 files changed: 5 additions, 7 deletions)

First file (AiterFlashAttention backend):

@@ -3,7 +3,6 @@
 """Attention layer with AiterFlashAttention."""
 
 from dataclasses import dataclass
-from typing import ClassVar
 
 import torch
 
@@ -23,7 +22,6 @@
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
-    ReorderSpec,
     split_decodes_prefills_and_extends,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
@@ -254,7 +252,7 @@ class AiterFlashAttentionMetadataBuilder(
     AttentionMetadataBuilder[AiterFlashAttentionMetadata]
 ):
     cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
-    reorder_spec: ClassVar[ReorderSpec] = ReorderSpec(1, split_extend=True)
+    reorder_batch_threshold: int = 1
 
     def __init__(
         self,
@@ -303,10 +301,9 @@ def build(
         common_attn_metadata: CommonAttentionMetadata,
         fast_build: bool = False,
     ) -> "AiterFlashAttentionMetadata":
-        assert self.reorder_spec.decode_threshold is not None
         split_ret = split_decodes_prefills_and_extends(
             common_attn_metadata,
-            decode_threshold=self.reorder_spec.decode_threshold,
+            decode_threshold=self.reorder_batch_threshold,
         )
 
         (
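Review note: build() now passes the plain reorder_batch_threshold attribute straight to split_decodes_prefills_and_extends as decode_threshold. Below is a minimal sketch of what a threshold-based three-way split amounts to; the split_counts name, the array layout, and the classification rule are assumptions for illustration, not the real CommonAttentionMetadata or helper internals.

# Illustrative sketch only: a threshold-based decode/extend/prefill split over
# per-request token counts. Names and array layout are assumptions, not the
# actual split_decodes_prefills_and_extends implementation.
import numpy as np

def split_counts(num_scheduled: np.ndarray,
                 num_computed: np.ndarray,
                 decode_threshold: int = 1) -> tuple[int, int, int]:
    """Return (num_decodes, num_extends, num_prefills) for one batch."""
    is_decode = num_scheduled <= decode_threshold
    is_extend = (~is_decode) & (num_computed > 0)    # resuming a partially cached request
    is_prefill = (~is_decode) & (num_computed == 0)  # no cached tokens yet
    return int(is_decode.sum()), int(is_extend.sum()), int(is_prefill.sum())

# Single-token steps count as decodes; longer steps split by prior progress.
print(split_counts(np.array([1, 8, 16, 1]), np.array([40, 12, 0, 7])))
# -> (2, 1, 1)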
Second file (batch-reorder helper):

@@ -885,8 +885,8 @@ def reorder_batch_to_split_decodes_and_prefills(
     num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs]
 
     is_decode = num_scheduled_tokens_np <= decode_threshold
-    is_extend = (~is_decode) & (num_computed_tokens_np > num_scheduled_tokens_np)
-    is_prefill = (~is_decode) & (num_computed_tokens_np == num_scheduled_tokens_np)
+    is_extend = (~is_decode) & (num_computed_tokens_np > 0)
+    is_prefill = (~is_decode) & (num_computed_tokens_np == 0)
 
     # Desired order: decode → extend → prefill
     order_key = np.zeros(is_decode.shape, dtype=np.int32)  # 0 = decode by default
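The new masks classify non-decode requests by whether any tokens are already computed, instead of comparing against the scheduled token count. A toy repro of the ordering this hunk sets up, assuming order_key is later consumed by a stable argsort (that code is outside this hunk):

# Toy reordering sketch; the stable argsort step is an assumption about how
# order_key is consumed downstream.
import numpy as np

num_scheduled_tokens_np = np.array([16, 1, 8, 1])
num_computed_tokens_np = np.array([0, 40, 12, 7])
decode_threshold = 1

is_decode = num_scheduled_tokens_np <= decode_threshold
is_extend = (~is_decode) & (num_computed_tokens_np > 0)
is_prefill = (~is_decode) & (num_computed_tokens_np == 0)

order_key = np.zeros(is_decode.shape, dtype=np.int32)  # 0 = decode by default
order_key[is_extend] = 1
order_key[is_prefill] = 2

# Stable sort keeps the original relative order within each class.
print(np.argsort(order_key, kind="stable"))
# -> [1 3 2 0]: decodes first, then the extend, then the prefill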
@@ -907,6 +907,7 @@ def reorder_batch_to_split_decodes_and_prefills(
             input_batch.swap_states(i, j)
             dest[i], dest[j] = dest[j], dest[i]
             modified_batch = True
+
     return modified_batch
 
 
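Review note on the swap loop: only the blank line before the return changed here; the loop itself applies a destination permutation in place via pairwise swaps. A standalone sketch under the assumption that dest[i] is the target slot of the request currently at index i, with a plain list standing in for InputBatch.swap_states:

# Standalone sketch: apply a destination permutation via pairwise swaps.
# `requests` and the tuple swap stand in for InputBatch.swap_states(i, j);
# the dest[i]-is-target-slot convention is an assumption for illustration.
def apply_permutation(requests: list[str], dest: list[int]) -> bool:
    """Move requests[i] to slot dest[i]; return True if anything moved."""
    modified_batch = False
    dest = list(dest)  # copy: mutated while walking swap cycles
    for i in range(len(dest)):
        while dest[i] != i:
            j = dest[i]
            requests[i], requests[j] = requests[j], requests[i]  # swap_states(i, j)
            dest[i], dest[j] = dest[j], dest[i]
            modified_batch = True
    return modified_batch

reqs = ["prefill", "decode_a", "extend", "decode_b"]
print(apply_permutation(reqs, [3, 0, 2, 1]), reqs)
# -> True ['decode_a', 'decode_b', 'extend', 'prefill']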