[perf] Use CPU tensor to reduce GPU->CPU sync (vllm-project#25884)

lhtin · pdasigi · commit 1116b82c3b5f · 2025-10-02T15:21:16.000-07:00
Signed-off-by: Lehua Ding <lehuading@tencent.com>
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -2478,7 +2478,7 @@ def propose_draft_token_ids(sampled_token_ids):
             effective_drafter_max_model_len = (
                 self.speculative_config.draft_model_config.max_model_len)
         input_fits_in_drafter = spec_decode_common_attn_metadata and (
-            spec_decode_common_attn_metadata.seq_lens.max() +
+            spec_decode_common_attn_metadata.max_seq_len +
             self.speculative_config.num_speculative_tokens
             <= effective_drafter_max_model_len)
         if use_padded_batch_for_eagle and input_fits_in_drafter: