@@ -31,6 +31,7 @@ class GDNAttentionMetadata:
     num_decode_tokens: int
     num_spec_decodes: int
     num_spec_decode_tokens: int
+    num_actual_tokens: int
 
     has_initial_state: Optional[torch.Tensor] = None
 
@@ -205,25 +206,22 @@ def build(  # type: ignore[override]
                 has_initial_state = has_initial_state[~spec_sequence_masks]
         else:
             has_initial_state = None
+        num_actual_tokens = num_prefill_tokens + num_decode_tokens + \
+            num_spec_decode_tokens
 
         # prepare tensors for cudagraph
         #
         # With speculative decoding, the xgrammar backend may roll back
         # tokens, causing some sequences to have fewer draft tokens than
         # self.num_spec.
         #
-        # During cudagraph capture, the GDN backends requires an assumption
-        # that num_spec_decode_tokens == num_spec_decodes * (self.num_spec + 1).
-        #
-        # More than one such sequences may break the assumption (less tokens),
-        # causing incompatible inputs for cuda graph replay.
217+         # In above cases, the max possible batch size for n tokens, can be 
218+         # min(n, cudagraph_max_bs). 
         if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0
                 and num_spec_decodes <= self.decode_cudagraph_max_bs
-                and num_spec_decode_tokens <= self.decode_cudagraph_max_bs
-                and num_spec_decode_tokens == num_spec_decodes *
-            (self.num_spec + 1)):
-            num_total_tokens = self.vllm_config.pad_for_cudagraph(
+                and num_spec_decode_tokens <= self.decode_cudagraph_max_bs):
+            num_actual_tokens = self.vllm_config.pad_for_cudagraph(
                 m.num_actual_tokens)
-            batch_size = num_total_tokens // (self.num_spec + 1)
+            batch_size = min(self.decode_cudagraph_max_bs, num_actual_tokens)
 
             self.spec_state_indices_tensor[:num_spec_decodes].copy_(
                 spec_state_indices_tensor, non_blocking=True)
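For intuition, here is a minimal, self-contained sketch of the new sizing rule. `pad_for_cudagraph` is stubbed with a hypothetical round-up to the nearest capture size; the real `vllm_config.pad_for_cudagraph` may choose sizes differently.

```python
def pad_for_cudagraph(n: int, capture_sizes=(1, 2, 4, 8, 16, 32, 64)) -> int:
    # Hypothetical stand-in: round the token count up to the nearest
    # captured cudagraph size.
    return next((s for s in capture_sizes if n <= s), capture_sizes[-1])

decode_cudagraph_max_bs = 32

# Three spec-decode sequences with num_spec = 2 would normally contribute
# 3 * (2 + 1) = 9 tokens, but an xgrammar rollback leaves only 7.
num_actual_tokens = pad_for_cudagraph(7)                      # -> 8
batch_size = min(decode_cudagraph_max_bs, num_actual_tokens)  # -> 8
print(num_actual_tokens, batch_size)
```

Unlike the removed `num_total_tokens // (self.num_spec + 1)` rule, this bound stays valid even when rollbacks make the token count no longer a multiple of `self.num_spec + 1`.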
@@ -239,7 +237,7 @@ def build(  # type: ignore[override]
             assert spec_token_masks is not None
             self.spec_token_masks[:spec_token_masks.size(0)].copy_(
                 spec_token_masks, non_blocking=True)
-            spec_token_masks = self.spec_token_masks[:m.num_actual_tokens]
+            spec_token_masks = self.spec_token_masks[:num_actual_tokens]
             spec_token_masks[spec_token_masks.size(0):].fill_(False)
 
             self.spec_query_start_loc[:num_spec_decodes + 1].copy_(
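The mask handling above follows the usual cudagraph persistent-buffer pattern: copy the live data into a preallocated buffer, take a view of the padded length, and clear the tail so a replayed graph never reads stale values left over from an earlier, larger batch. A minimal sketch, with hypothetical buffer names and sizes rather than vLLM's actual allocations:

```python
import torch

max_tokens = 16                                        # capture-time capacity
spec_token_masks_buf = torch.zeros(max_tokens, dtype=torch.bool)

live_mask = torch.tensor([True, False, True])          # this step's data
spec_token_masks_buf[:live_mask.size(0)].copy_(live_mask)

num_actual_tokens = 8                                  # padded token count
view = spec_token_masks_buf[:num_actual_tokens]        # graph-sized view
view[live_mask.size(0):].fill_(False)                  # clear the stale tail
print(view)
```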
@@ -258,9 +256,9 @@ def build(  # type: ignore[override]
         if (self.use_full_cuda_graph and num_prefills == 0
                 and num_spec_decodes == 0
                 and num_decodes <= self.decode_cudagraph_max_bs):
-            num_total_tokens = self.vllm_config.pad_for_cudagraph(
+            num_actual_tokens = self.vllm_config.pad_for_cudagraph(
                 m.num_actual_tokens)
-            batch_size = num_total_tokens
+            batch_size = num_actual_tokens
 
             self.non_spec_state_indices_tensor[:num_decodes].copy_(
                 non_spec_state_indices_tensor, non_blocking=True)
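In this pure-decode path each sequence contributes exactly one token, so after padding the token count and the batch size coincide. A small sketch, again using a hypothetical `pad_for_cudagraph`:

```python
def pad_for_cudagraph(n: int, sizes=(1, 2, 4, 8, 16, 32)) -> int:
    # Hypothetical round-up, as in the earlier sketch.
    return next((s for s in sizes if n <= s), sizes[-1])

num_decodes = 5                                    # one token per sequence
num_actual_tokens = pad_for_cudagraph(num_decodes) # -> 8
batch_size = num_actual_tokens                     # padded tokens == batch
print(num_actual_tokens, batch_size)
```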
@@ -284,6 +282,7 @@ def build(  # type: ignore[override]
             num_decode_tokens=num_decode_tokens,
             num_spec_decodes=num_spec_decodes,
             num_spec_decode_tokens=num_spec_decode_tokens,
+            num_actual_tokens=num_actual_tokens,
             has_initial_state=has_initial_state,
             spec_query_start_loc=spec_query_start_loc,
             non_spec_query_start_loc=non_spec_query_start_loc,