
Commit 8069fef

chunkprefill mla with torchair graph

Signed-off-by: haojiangzheng <justineric096@gmail.com>

1 parent 853efb9
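
Despite the additive-sounding title, the hunks below back the dedicated chunked-prefill handling out of the TorchAir-graph MLA path: the running_chunkprefilll_with_torchair flag and every branch keyed on it are deleted, decode-side KV extraction returns to a single multistream exec_kv call, and the prefill-side KV-cache write is gated on PrefillNoCache instead of the broader has_prefill check.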

File tree: 1 file changed (+18, -27 lines)

vllm_ascend/attention/mla_v1.py

Lines changed: 18 additions & 27 deletions
@@ -964,7 +964,7 @@ def _forward_decode(
         decode_meta = attn_metadata.decode
         assert decode_meta is not None
         num_tokens = q_nope.size(0)
-        if self.running_in_graph or self.running_chunkprefilll_with_torchair:
+        if self.running_in_graph:
             # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim]
             if attn_metadata.attn_state == AscendAttentionState.SpecDecoding:
                 assert num_tokens % self.spec_token_num == 0
@@ -1080,7 +1080,6 @@ def forward(
         self.running_in_graph = self.torchair_graph_enabled and attn_metadata.attn_state in [
             AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
         ]
-        self.running_chunkprefilll_with_torchair = self.torchair_graph_enabled and attn_metadata.attn_state == AscendAttentionState.ChunkedPrefill
         num_actual_toks = attn_metadata.num_actual_tokens
         if k_pe is None and not self.running_in_graph:
             kv_c, k_pe = self.kv_a_proj_with_mqa(
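
For orientation, a runnable sketch of the predicate that survives this hunk. The enum is a stub standing in for vllm_ascend's AscendAttentionState (only the values named in this diff are included); the point is that with the flag gone, the TorchAir graph decode path fires only for pure-decode and spec-decode batches, while ChunkedPrefill falls through to the regular path.

    from enum import Enum, auto

    class AscendAttentionState(Enum):  # stub of the states named in this diff
        DecodeOnly = auto()
        SpecDecoding = auto()
        ChunkedPrefill = auto()
        PrefillNoCache = auto()

    def running_in_graph(torchair_graph_enabled: bool,
                         attn_state: AscendAttentionState) -> bool:
        # Post-commit predicate: ChunkedPrefill no longer has its own flag,
        # so it no longer reaches the TorchAir graph decode kernels.
        return torchair_graph_enabled and attn_state in (
            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding)

    assert running_in_graph(True, AscendAttentionState.DecodeOnly)
    assert not running_in_graph(True, AscendAttentionState.ChunkedPrefill)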
@@ -1117,25 +1116,18 @@ def forward(
         if has_decode:
             decode_k_nope = None
             assert attn_metadata.decode is not None
-            if self.running_in_graph or self.running_chunkprefilll_with_torchair:
+            if self.running_in_graph:
                 cos = attn_metadata.decode.cos
                 sin = attn_metadata.decode.sin
-                if self.running_chunkprefilll_with_torchair:
-                    decode_hs = (
-                        hidden_states_or_kv_c_normed[:num_decode_tokens])
-                    slots = attn_metadata.slot_mapping[:num_decode_tokens]
+                with npu_stream_switch("mla_secondary",
+                                       0,
+                                       enabled=enable_multistream_mla):
+                    npu_wait_tensor(hidden_states_or_kv_c_normed,
+                                    ckq,
+                                    enabled=enable_multistream_mla)
                     decode_k_pe, decode_k_nope, decode_kv = self.exec_kv(
-                        decode_hs, cos, sin, kv_cache, slots)
-                else:
-                    with npu_stream_switch("mla_secondary",
-                                           0,
-                                           enabled=enable_multistream_mla):
-                        npu_wait_tensor(hidden_states_or_kv_c_normed,
-                                        ckq,
-                                        enabled=enable_multistream_mla)
-                        decode_k_pe, decode_k_nope, decode_kv = self.exec_kv(
-                            hidden_states_or_kv_c_normed, cos, sin, kv_cache,
-                            attn_metadata.slot_mapping)
+                        hidden_states_or_kv_c_normed, cos, sin, kv_cache,
+                        attn_metadata.slot_mapping)
                 # Without explicitly controlling the order, IndexByTensor operations
                 # would be placed after `matmul W_KV_T` hindering the overlapping of
                 # KvRmsNormRopeCache and SingleRope.
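
The context comment above is the reason the surviving branch keeps the npu_stream_switch/npu_wait_tensor pair: exec_kv is issued on the "mla_secondary" stream, but that stream must not read hidden_states_or_kv_c_normed before ckq has been produced on the default stream. A minimal sketch of that control-flow shape, with both helpers stubbed out (the real ones live in vllm_ascend and drive actual NPU streams; decode_kv_overlapped is a name invented for this sketch):

    from contextlib import contextmanager

    @contextmanager
    def npu_stream_switch(name, priority, enabled=True):
        # Stub: the real helper redirects subsequent ops to another NPU stream.
        yield

    def npu_wait_tensor(waited, trigger, enabled=True):
        # Stub: the real helper blocks the current stream until `trigger` is
        # ready, so `waited` is consumed only after its producer finishes.
        pass

    def decode_kv_overlapped(hidden_states, ckq, cos, sin, kv_cache,
                             slot_mapping, exec_kv, enable_multistream_mla):
        # Mirrors the post-commit decode path: one unconditional exec_kv call
        # on the secondary stream, overlapping the main stream's q projection.
        with npu_stream_switch("mla_secondary", 0,
                               enabled=enable_multistream_mla):
            npu_wait_tensor(hidden_states, ckq,
                            enabled=enable_multistream_mla)
            return exec_kv(hidden_states, cos, sin, kv_cache, slot_mapping)

The deleted chunked-prefill arm sliced its inputs (decode_hs and slots truncated to num_decode_tokens); the surviving arm, now unconditional, always consumes the full hidden states and slot mapping.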
@@ -1159,8 +1151,6 @@ def forward(
                                     decode_k_pe,
                                     enabled=enable_multistream_mla)
                     decode_q_pe = self.rope_single(decode_q_pe, cos, sin)
-            elif self.running_chunkprefilll_with_torchair:
-                decode_q_pe = self.rope_single(decode_q_pe, cos, sin)
             else:
                 decode_q_pe[...], decode_k_pe[...] = self.rotary_emb(
                     attn_metadata.decode.input_positions,
@@ -1199,15 +1189,16 @@ def forward(
                 kv_cache
             ) > 1, "the number of kv cache should be greater than 1, namely (nope_cache and rope_cache)"
             if self.torchair_graph_enabled:
-                if kv_cache[0].numel() > 0 and has_prefill:
+                if kv_cache[0].numel(
+                ) > 0 and attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
                     slots = attn_metadata.slot_mapping
                     # NOTE: Separate the kv cache in advance to avoid OOM or other issues
-                    torch_npu._npu_reshape_and_cache(
-                        key=kv_c_normed.view(num_tokens, self.num_kv_heads, -1),
-                        value=prefill_k_pe,
-                        key_cache=kv_cache[0],
-                        value_cache=kv_cache[1],
-                        slot_indices=slots[num_decode_tokens:])
+                    torch_npu._npu_reshape_and_cache(key=kv_c_normed.view(
+                        num_tokens, self.num_kv_heads, -1),
+                                                     value=prefill_k_pe,
+                                                     key_cache=kv_cache[0],
+                                                     value_cache=kv_cache[1],
+                                                     slot_indices=slots)
                 else:
                     kv_c_normed = kv_c_normed.view(
                         [num_actual_toks, self.num_kv_heads, -1])
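
Dropping the [num_decode_tokens:] slice follows from the tightened guard: the old has_prefill condition admitted mixed decode-plus-prefill batches, whose decode slots had already been written by exec_kv and therefore had to be skipped. Under PrefillNoCache the batch is prefill-only, so num_decode_tokens is expected to be zero and the slice was a no-op anyway. A toy check of that equivalence:

    # Toy stand-ins: in a PrefillNoCache batch every token is a prefill token.
    slot_mapping = list(range(8))   # pretend slot indices for 8 tokens
    num_decode_tokens = 0           # a pure-prefill batch has no decode tokens

    # The old slice and the new full mapping select exactly the same slots.
    assert slot_mapping[num_decode_tokens:] == slot_mapping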
