fix reorder

ganyi1996ppo · ganyi1996ppo · commit f5cc7522f6b6 · 2025-10-29T08:22:07.000Z
Signed-off-by: ganyi <ygan@amd.com>
diff --git a/tests/v1/attention/test_batch_reordering.py b/tests/v1/attention/test_batch_reordering.py
@@ -83,6 +83,21 @@ class ReorderTestCase:
         expected_order=[3, 1, 2, 0],  # Only swap 0↔3, keep 1 and 2 in place
         expected_modified=True,
     ),
+    "complicated_mixed_interleaved": ReorderTestCase(
+        requests=[
+            (1, 20),
+            (1, 50),
+            (374, 0),
+            (300, 20),
+            (1, 20),
+            (256, 0),
+            (1, 5),
+            (27, 0),
+            (1, 4),
+        ],
+        expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5],
+        expected_modified=True,
+    ),
 }
 
 
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -17,7 +17,7 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import cdiv
+from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import abc
+import copy
 import enum
 import functools
 from abc import abstractmethod
@@ -874,6 +875,29 @@ def reorder_batch_to_split_decodes_and_prefills(
     # NOTE for now we loosely use "decode" to mean requests where attention is
     #  likely memory-bound and "prefill" to mean requests where attention is
     #  likely compute-bound,
+    # rid = dist.get_rank()
+    rid = 0
+
+    def print_order():  # debug dump; NOTE(review): no call site in visible hunks
+        if rid == 0:  # rid is hard-coded to 0 above, so this is always true
+            num_scheduled_tokens = [
+                scheduler_output.num_scheduled_tokens[id] for id in input_batch.req_ids
+            ]
+            num_scheduled_tokens_np = np.array(num_scheduled_tokens)
+            num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs]
+            print("num scheduled tokens: ", num_scheduled_tokens_np, flush=True)
+            print("num computed tokens: ", num_computed_tokens_np, flush=True)
+            is_decode = num_scheduled_tokens_np <= decode_threshold  # boolean mask
+            is_extend = (~is_decode) & (num_computed_tokens_np > 0)
+            is_prefill = (~is_decode) & (num_computed_tokens_np == 0)
+            idx = np.arange(0, is_decode.shape[0])  # one index per request
+            decodes = idx[is_decode]
+            extends = idx[is_extend]
+            prefills = idx[is_prefill]
+            print("decode: ", decodes, flush=True)
+            print("extends: ", extends, flush=True)
+            print("prefills: ", prefills, flush=True)
+
     num_reqs = len(input_batch.req_ids)
     num_scheduled_tokens = [
         scheduler_output.num_scheduled_tokens[id] for id in input_batch.req_ids
@@ -907,16 +931,29 @@ def reorder_batch_to_split_decodes_and_prefills(
     sorted_order = np.argsort(req_regions[needs_swap], kind="stable")
     dest_indices = swap_indices[sorted_order]
 
-    src_dest_map = {int(src): int(dst) for src, dst in zip(swap_indices, dest_indices)}
+    idx_mapping = {val: idx for idx, val in enumerate(swap_indices)}
+    # Snapshot the original input_batch position of each swap slot. The loop
+    # below rewrites swap_indices as it goes, so this copy is what maps a
+    # slot back to its real position in input_batch.
+    indices_positions = copy.deepcopy(swap_indices)
+
+    # Then we reorder the swap_indices to dest_indices
+    for i in range(len(swap_indices)):
+        dst = dest_indices[i]
+        src = swap_indices[i]
+        if dst != src:
+            # Get the real index position in input_batch to swap
+            dst_pos = indices_positions[idx_mapping[dst]]
+            src_pos = indices_positions[idx_mapping[src]]
+
+            input_batch.swap_states(dst_pos, src_pos)
+
+            dst_idx = idx_mapping[dst]
+            swap_indices[i] = dst
+            swap_indices[dst_idx] = src
+            idx_mapping[dst] = i
+            idx_mapping[src] = dst_idx
 
-    for src in src_dest_map:
-        dst = src_dest_map[src]
-        while src != dst:
-            input_batch.swap_states(src, dst)
-            # Mark dst as done by updating its destination to itself
-            next_dst = src_dest_map.get(dst, dst)
-            src_dest_map[dst] = dst
-            dst = next_dst
     return True