@@ -156,14 +156,13 @@ def rejection_sample(
156156 bonus_token_ids ,
157157 )
158158 else :
159- num_draft_tokens_tensor = torch .tensor (num_draft_tokens ,
160- device = device )
161159 rejection_greedy_sample_pytorch (
162160 output_token_ids ,
163- num_draft_tokens_tensor ,
161+ cu_num_draft_tokens ,
164162 draft_token_ids ,
165163 target_argmax ,
166164 bonus_token_ids ,
165+ num_draft_tokens ,
167166 max_spec_len ,
168167 is_greedy ,
169168 )
@@ -311,40 +310,72 @@ def rejection_greedy_sample_spec_len_1_pytorch(
311310
def rejection_greedy_sample_pytorch(
    output_token_ids,  # [batch_size, max_spec_len + 1], modified in place
    cu_num_draft_tokens,  # [batch_size], cumulative draft-token counts
    draft_token_ids,  # [num_tokens], flattened across requests
    target_argmax,  # [num_tokens], target model's argmax per draft position
    bonus_token_ids,  # [batch_size] or [batch_size, 1]
    draft_tokens_per_req,  # [batch_size], python list of per-request counts
    max_spec_len,  # int, max number of draft tokens over the batch
    is_greedy=None,  # [batch_size] bool or None (None => all rows greedy)
):
    """Vectorized greedy rejection sampling for speculative decoding.

    For each greedy request, draft tokens are accepted while they match the
    target model's argmax. The accepted tokens plus the first corrective
    target token are written into ``output_token_ids`` in place; when every
    draft token is accepted, the bonus token is appended after them. Rows
    whose ``is_greedy`` flag is False are left untouched, as are all slots
    past the written prefix (callers pre-fill with a placeholder, e.g. -1).
    """
    batch_size = output_token_ids.size(0)
    num_tokens = draft_token_ids.size(0)
    device = output_token_ids.device
    # as_tensor with an explicit device avoids the extra host-side copy and
    # the pointless non_blocking transfer from unpinned CPU memory that
    # torch.tensor(...).to(device, non_blocking=True) performed.
    draft_tokens_per_req = torch.as_tensor(draft_tokens_per_req,
                                           dtype=torch.long,
                                           device=device)
    if is_greedy is None:
        is_greedy = torch.ones(batch_size, dtype=torch.bool, device=device)

    # Map every flat token index to (request id, position within request).
    start_indices = cu_num_draft_tokens - draft_tokens_per_req
    req_ids = torch.arange(batch_size, device=device)
    token_req_ids = torch.repeat_interleave(req_ids, draft_tokens_per_req)
    token_positions = torch.arange(
        num_tokens, device=device) - start_indices[token_req_ids]

    # Find the first draft/target mismatch position of each request.
    mismatch_global = (draft_token_ids != target_argmax)
    if max_spec_len == 0:
        first_mismatch_pos_per_req = torch.zeros(batch_size,
                                                 dtype=torch.long,
                                                 device=device)
    else:
        # Scatter per-token data into dense [batch_size, max_spec_len]
        # matrices so the first mismatch is found with one row-wise min().
        pos_matrix = torch.full((batch_size, max_spec_len),
                                -1,
                                dtype=torch.long,
                                device=device)
        pos_matrix[token_req_ids, token_positions] = token_positions
        mismatch_matrix = torch.zeros((batch_size, max_spec_len),
                                      dtype=torch.bool,
                                      device=device)
        mismatch_matrix[token_req_ids, token_positions] = mismatch_global
        # max_spec_len * 2 acts as +inf for positions without a mismatch.
        mismatch_positions = torch.where(mismatch_matrix, pos_matrix,
                                         max_spec_len * 2)
        first_mismatch_pos_per_req, _ = torch.min(mismatch_positions, dim=1)
        # Requests with no mismatch accept all of their draft tokens.
        no_mismatch_mask = (first_mismatch_pos_per_req == max_spec_len * 2)
        first_mismatch_pos_per_req[no_mismatch_mask] = draft_tokens_per_req[
            no_mismatch_mask]

    # Copy accepted target tokens (plus the first corrective token) into the
    # output; copy_len is first_mismatch + 1 capped at the draft count.
    copy_len = torch.minimum(first_mismatch_pos_per_req + 1,
                             draft_tokens_per_req)
    copy_indices = torch.arange(max_spec_len + 1,
                                device=device).expand(batch_size, -1)
    copy_mask = copy_indices < copy_len.unsqueeze(1)
    final_copy_mask = copy_mask & is_greedy.unsqueeze(1)
    global_idx = start_indices.unsqueeze(1) + copy_indices
    output_token_ids[final_copy_mask] = target_argmax[
        global_idx[final_copy_mask]].to(output_token_ids.dtype)

    # Fill the bonus token for requests that accepted every draft token.
    needs_bonus = is_greedy & (first_mismatch_pos_per_req
                               >= draft_tokens_per_req)
    if torch.any(needs_bonus):
        bonus_rows = torch.where(needs_bonus)[0]
        bonus_cols = draft_tokens_per_req[bonus_rows]
        # Accept both [batch_size] and [batch_size, 1] bonus tensors: the
        # previous unconditional squeeze(1) raised IndexError on 1-D input
        # even though the parameter is documented as [batch_size].
        if bonus_token_ids.dim() > 1:
            bonus_token_ids = bonus_token_ids.squeeze(1)
        output_token_ids[bonus_rows, bonus_cols] = bonus_token_ids[bonus_rows]
348379
349380
350381def rejection_random_sample_pytorch (
0 commit comments