 from vllm_ascend.ops.fused_moe import AscendFusedMoE
 from vllm_ascend.quantization.quant_config import AscendLinearMethod
 from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
+from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.utils import (dispose_tensor, npu_stream_switch,
                                npu_wait_tensor)

@@ -671,6 +672,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.make_empty_intermediate_tensors = (
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
+
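+        # Cache layer 0's rotary-embedding tables so cos/sin can be gathered once
+        # per forward pass; this assumes every decoder layer shares the same rotary
+        # embedding, which is why only layer 0's tables are reused below.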
+        ascend_config = get_ascend_config()
+        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.max_position_embeddings = self.layers[0].self_attn.rotary_emb.max_position_embeddings
+        self.cos_cached = self.layers[0].self_attn.rotary_emb.cos_cached
+        self.sin_cached = self.layers[0].self_attn.rotary_emb.sin_cached

     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
@@ -695,6 +702,31 @@ def forward(
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]

+        # Gather rotary cos/sin once before the decoder layers so each layer can reuse them.
+        self.running_in_graph = self.torchair_graph_enabled and attn_metadata.attn_state in [
+            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
+        ]
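+        # Decode path (torchair graph mode): slice the cached tables, gather rows by the
+        # decode token positions, add broadcast dims via [:, None, None, :], and stash the
+        # result on the decode metadata so the attention layers can reuse it.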
+        if attn_metadata.num_decodes > 0 and self.running_in_graph:
+            seq_len = self.max_position_embeddings
+            cos = self.cos_cached[:seq_len].to(dtype=hidden_states.dtype)
+            sin = self.sin_cached[:seq_len].to(dtype=hidden_states.dtype)
+            cos = cos[attn_metadata.decode.input_positions]
+            sin = sin[attn_metadata.decode.input_positions]
+            attn_metadata.decode.cos = cos[:, None, None, :]
+            attn_metadata.decode.sin = sin[:, None, None, :]
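+        # Prefill path under torchair graph mode: same gather, keyed by the prefill token positions.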
+        if attn_metadata.num_prefills > 0 and self.torchair_graph_enabled:
+            seq_len = self.max_position_embeddings
+            cos = self.cos_cached[:seq_len].to(dtype=hidden_states.dtype)
+            sin = self.sin_cached[:seq_len].to(dtype=hidden_states.dtype)
+            cos = cos[attn_metadata.prefill.input_positions]
+            sin = sin[attn_metadata.prefill.input_positions]
+            attn_metadata.prefill.cos = cos[:, None, None, :]
+            attn_metadata.prefill.sin = sin[:, None, None, :]
+
         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
             hidden_states, residual = layer(