4 changes: 3 additions & 1 deletion vllm_ascend/ascend_forward_context.py
@@ -71,7 +71,8 @@ def set_ascend_forward_context(
batch_descriptor: Optional[BatchDescriptor] = None,
prefetch_stream: torch.npu.Stream = None,
model_instance: torch.nn.Module = None,
weight_prefetch_method: Optional[WeightPrefetchMethod] = None):
weight_prefetch_method: Optional[WeightPrefetchMethod] = None,
is_mtp_model=False):
"""A context manager that stores the current forward context,
can be attention metadata, etc.
We add some additional param into forward_context.
@@ -153,6 +154,7 @@ def set_ascend_forward_context(
forward_context.prefetch_mlp_enabled = prefetch_mlp_enabled
forward_context.model_instance = model_instance
forward_context.weight_prefetch_method = weight_prefetch_method
forward_context.is_mtp_model = is_mtp_model

# TODO(rjg-lyh): The current implementation is somewhat brute force and not elegant.
# It will be improved later by implementing operator fusion through the FX graph.
6 changes: 5 additions & 1 deletion vllm_ascend/attention/mla_v1.py
@@ -26,6 +26,7 @@
trans_rope_weight, transdata,
wait_for_kv_layer_from_connector)
from vllm_ascend.compilation.acl_graph import (get_graph_params,
get_mtp_graph_params,
update_graph_params_workspaces)
from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
from vllm_ascend.multistream.context import get_multistream_comm_context
@@ -1022,8 +1023,11 @@ def _forward_decode(
"actual_seq_lengths": actual_seq_lengths,
"actual_seq_lengths_kv": decode_meta.seq_lens_list,
}
graph_params = get_graph_params()
forward_context: ForwardContext = get_forward_context()
if forward_context.is_mtp_model:
graph_params = get_mtp_graph_params()
else:
graph_params = get_graph_params()
if forward_context.capturing:
stream = torch_npu.npu.current_stream()

48 changes: 46 additions & 2 deletions vllm_ascend/compilation/acl_graph.py
@@ -232,7 +232,10 @@ def update_attn_params(update_stream, forward_context, runtime_shape):

def update_mla_attn_params(update_stream, forward_context, runtime_shape,
speculative_config):
graph_params = get_graph_params()
if forward_context.is_mtp_model:
graph_params = get_mtp_graph_params()
else:
graph_params = get_graph_params()
# FIXME: Behold! We are using a temporary hack here to update the args
# for each layer's attention op in the graph.
for key, param, handle, event in zip(
@@ -245,7 +248,8 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
spec_attn_mask, sparse_mode, scale, block_table, block_size,
seq_lens_list, actual_seq_lengths, attn_output, softmax_lse) = param
seq_lens_list = forward_context.attn_metadata[key].decode.seq_lens_list
if speculative_config and speculative_config.method == "deepseek_mtp":
if speculative_config and speculative_config.method == "deepseek_mtp" \
and not forward_context.is_mtp_model:
actual_seq_lengths = forward_context.attn_metadata[
key].decode.actual_seq_lengths_q
spec_multiple = speculative_config.num_speculative_tokens + 1
@@ -255,6 +259,9 @@
spec_multiple * (i + 1)
for i in range(runtime_shape // spec_multiple)
]
elif forward_context.is_mtp_model:
seq_lens_list = seq_lens_list + [0] * (len(actual_seq_lengths) -
len(seq_lens_list))
else:
seq_lens_list = seq_lens_list + [0] * (runtime_shape -
len(seq_lens_list))
@@ -321,3 +328,40 @@ def update_graph_params_workspaces(num_tokens: int, workspace: Any):

def get_graph_params():
return _graph_params


@dataclass
class MTPGraphParams:
events: dict[int, list[torch.npu.ExternalEvent]]
workspaces: dict[int, torch.Tensor]
handles: dict[int, list[torch_npu._C._NPUTaskGroupHandle]]
attn_params: dict[int, list[tuple]]


_mtp_graph_params: Optional[MTPGraphParams] = None


def set_mtp_graph_params(aclgraph_capture_sizes: set[int]):
global _mtp_graph_params
if _mtp_graph_params is not None:
raise ValueError("MTPGraph parameters have already been set!")
_mtp_graph_params = MTPGraphParams(
{size: []
for size in aclgraph_capture_sizes},
{size: None
for size in aclgraph_capture_sizes},
{size: []
for size in aclgraph_capture_sizes},
{size: []
for size in aclgraph_capture_sizes},
)


def update_mtp_graph_params_workspaces(num_tokens: int, workspace: Any):
global _mtp_graph_params
if _mtp_graph_params is not None:
_mtp_graph_params.workspaces[num_tokens] = workspace


def get_mtp_graph_params():
return _mtp_graph_params
Comment on lines +333 to +367
Contributor

high

The MTPGraphParams class and its associated functions (set_mtp_graph_params, update_mtp_graph_params_workspaces, get_mtp_graph_params) are duplicates of GraphParams and its functions. This introduces significant code duplication, which can lead to maintenance issues and potential bugs if one version is updated and the other is forgotten.

Consider refactoring to avoid this duplication. You could, for example, use a single set of functions and a dictionary to manage parameters for different graph types (e.g., DEFAULT and MTP). This would involve:

  1. Removing MTPGraphParams and its related functions. GraphParams can be used for both.
  2. Using a dictionary to store GraphParams instances for different graph types, keyed by an enum.
  3. Modifying the set/update/get functions to accept a graph_type parameter to operate on the correct GraphParams instance.

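A minimal sketch of the refactor suggested in the comment above, not part of this PR: a single GraphParams registry keyed by graph type replaces the duplicated MTP variants. The GraphParamsType enum and the graph_type keyword are assumed names, and the event/handle element types are loosened to Any so the sketch stays self-contained without torch_npu.

# Hypothetical refactor sketch (not from this PR): one registry keyed by graph
# type instead of parallel *_mtp_* duplicates. GraphParamsType and graph_type
# are assumed names; event/handle entries use Any in place of
# torch.npu.ExternalEvent / torch_npu._C._NPUTaskGroupHandle.
from dataclasses import dataclass
from enum import Enum
from typing import Any, Optional


class GraphParamsType(Enum):
    DEFAULT = "default"
    MTP = "mtp"


@dataclass
class GraphParams:
    events: dict[int, list[Any]]
    workspaces: dict[int, Any]
    handles: dict[int, list[Any]]
    attn_params: dict[int, list[tuple]]


_graph_params: dict[GraphParamsType, Optional[GraphParams]] = {
    GraphParamsType.DEFAULT: None,
    GraphParamsType.MTP: None,
}


def set_graph_params(aclgraph_capture_sizes: set[int],
                     graph_type: GraphParamsType = GraphParamsType.DEFAULT):
    # Mirror the existing guard: each graph type may only be initialized once.
    if _graph_params[graph_type] is not None:
        raise ValueError(f"Graph parameters for {graph_type} already set!")
    _graph_params[graph_type] = GraphParams(
        {size: [] for size in aclgraph_capture_sizes},
        {size: None for size in aclgraph_capture_sizes},
        {size: [] for size in aclgraph_capture_sizes},
        {size: [] for size in aclgraph_capture_sizes},
    )


def update_graph_params_workspaces(
        num_tokens: int,
        workspace: Any,
        graph_type: GraphParamsType = GraphParamsType.DEFAULT):
    # Mutating the per-type entry avoids rebinding the module-level registry.
    params = _graph_params[graph_type]
    if params is not None:
        params.workspaces[num_tokens] = workspace


def get_graph_params(graph_type: GraphParamsType = GraphParamsType.DEFAULT):
    return _graph_params[graph_type]

Under this sketch, call sites such as _forward_decode could select the right parameters with get_graph_params(GraphParamsType.MTP if forward_context.is_mtp_model else GraphParamsType.DEFAULT) rather than branching between two separate getter functions.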