 import numpy as np
 import torch
 import torch_npu
-from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
-from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
-from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
-from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
-
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
                                               AttentionMetadata,
                                               MLAAttentionImpl)
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 
+from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
+from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
+from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
+from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
+
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
     from vllm.v1.worker.gpu_input_batch import InputBatch
@@ -123,8 +123,8 @@ def __post_init__(self):
         # f"received {self.head_dim}.")
 
     def split_metadata_for_multistream(
-        self,
-        ms_split_config: MSAttentionMetadataSplitConfig,
+        self,
+        ms_split_config: MSAttentionMetadataSplitConfig,
     ) -> list["AscendMLAMetadata"]:
         """Split metadata for multi-stream with AscendMLAMetadata"""
         return model_input_split_v1_mla_attn(
@@ -133,6 +133,7 @@ def split_metadata_for_multistream(
             _metadata_cls=AscendMLAMetadata,
         )
 
+
 M = TypeVar("M", bound=AscendMLAMetadata)
 
 
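Editor's note (not part of the diff): `split_metadata_for_multistream` slices one batch's attention metadata into per-micro-batch copies for dual-batch overlap. A minimal usage sketch, assuming `MSAttentionMetadataSplitConfig` exposes a `num_micro_batches` field (the field name is an assumption here):

from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig

# attn_metadata is an already-built AscendMLAMetadata for the full batch.
split_cfg = MSAttentionMetadataSplitConfig(num_micro_batches=2)  # field name assumed
micro_batches = attn_metadata.split_metadata_for_multistream(split_cfg)
assert len(micro_batches) == 2  # one AscendMLAMetadata per micro-batch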
@@ -574,14 +575,14 @@ def _forward_prefill(
         )
         attn_output = attn_output.reshape(
             [num_tokens, self.num_heads * self.v_head_dim])
-
+
         # A better way is to modify the communication ops or RowParallel Layer in vllm;
         from vllm_ascend.multistream.context import \
             get_multistream_comm_context
-        current_ms_metadata = get_multistream_comm_context()
+        current_ms_metadata = get_multistream_comm_context()
         if current_ms_metadata is None:
             return self.o_proj(attn_output)[0]
-        else:
+        else:
             current_ms_metadata.before_comm_event.record()
             with torch.npu.stream(current_ms_metadata.comm_stream):
                 current_ms_metadata.before_comm_event.wait()
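Editor's note (not part of the diff): the record/wait pair above is what lets the output projection (and, under tensor parallelism, the all-reduce inside `o_proj`) run on a separate communication stream while the other micro-batch keeps computing. A minimal sketch of the pattern, assuming `torch.npu` mirrors the `torch.cuda` stream/event API (which `torch_npu` provides); `attn_output` and `o_proj` below are stand-ins for the real objects:

import torch
import torch_npu  # registers the torch.npu namespace

# Stand-ins for the real tensors/modules in _forward_prefill.
attn_output = torch.randn(8, 2048, device="npu")
o_proj = torch.nn.Linear(2048, 4096).to("npu")

comm_stream = torch.npu.Stream()
before_comm_event = torch.npu.Event()

# ... attention kernels for this micro-batch run on the current stream ...
before_comm_event.record()            # mark the point where compute finished
with torch.npu.stream(comm_stream):   # submit later work to the comm stream
    before_comm_event.wait()          # comm stream blocks until compute is done
    out = o_proj(attn_output)         # projection (and any all-reduce) overlaps
                                      # the other micro-batch's compute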
@@ -687,16 +688,15 @@ def _forward_decode(
             out=attn_output)
         from vllm_ascend.multistream.context import \
             get_multistream_comm_context
-        current_ms_metadata = get_multistream_comm_context()
+        current_ms_metadata = get_multistream_comm_context()
         if current_ms_metadata is None:
             return self._v_up_proj_and_o_proj(attn_output)
-        else:
+        else:
             current_ms_metadata.before_comm_event.record()
             with torch.npu.stream(current_ms_metadata.comm_stream):
                 current_ms_metadata.before_comm_event.wait()
                 return self._v_up_proj_and_o_proj(attn_output)
 
-
     def forward(
         self,
         layer: AttentionLayer,
@@ -820,14 +820,15 @@ def forward(
                 key_cache=kv_cache,
                 slot_indices=attn_metadata.slot_mapping.flatten())
         if has_prefill:
-            # FIX: aicore move should be also placed on the comm stream in dbo,
-            # otherwise it may affect the accuracy
+            # FIX: aicore move should be also placed on the comm stream in dbo,
+            # otherwise it may affect the accuracy
             # TODO: use an elegant way to overlap
             from vllm_ascend.multistream.context import \
                 get_multistream_comm_context
-            output_prefill = self._forward_prefill(
-                prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache,
-                attn_metadata)
+            output_prefill = self._forward_prefill(prefill_q,
+                                                   prefill_k_c_normed,
+                                                   prefill_k_pe, kv_cache,
+                                                   attn_metadata)
             current_ms_metadata = get_multistream_comm_context()
             if current_ms_metadata is not None:
                 with torch.npu.stream(current_ms_metadata.comm_stream):
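Editor's note (not part of the diff): the `slot_indices` passed to the cache write above follow vLLM's paged-KV convention, where each token gets one flat slot in the block-structured cache. A small sketch of the arithmetic (the values are hypothetical):

import torch

block_size = 16                           # tokens per KV-cache block
block_ids = torch.tensor([3, 3, 7])       # block assigned to each of 3 tokens
offsets = torch.tensor([14, 15, 0])       # position of each token in its block
slot_mapping = block_ids * block_size + offsets
print(slot_mapping)                       # tensor([ 62,  63, 112])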
@@ -836,7 +837,6 @@ def forward(
             else:
                 output[num_decode_tokens:] = output_prefill
 
-
         if has_decode:
             if self.running_in_graph:
                 return self._forward_decode(decode_ql_nope, decode_q_pe,
@@ -845,16 +845,17 @@ def forward(
             else:
                 from vllm_ascend.multistream.context import \
                     get_multistream_comm_context
-                output_decode = self._forward_decode(
-                    decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
-                    kv_cache, attn_metadata)
-                current_ms_metadata = get_multistream_comm_context()
+                output_decode = self._forward_decode(decode_ql_nope,
+                                                     decode_q_pe,
+                                                     decode_k_nope,
+                                                     decode_k_pe, kv_cache,
+                                                     attn_metadata)
+                current_ms_metadata = get_multistream_comm_context()
                 if current_ms_metadata is not None:
                     with torch.npu.stream(current_ms_metadata.comm_stream):
                         output[:num_decode_tokens] = output_decode
                         current_ms_metadata.after_comm_event.record()
                 else:
                     output[:num_decode_tokens] = output_decode
 
-
         return output_padded
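Editor's note (not part of the diff): when the copies into `output` are issued on the comm stream, `after_comm_event` is what a later consumer must wait on before reading `output` from the default stream. A hedged sketch of that contract, assuming the `torch.cuda`-style event semantics noted earlier (names and shapes are illustrative):

import torch
import torch_npu  # registers the torch.npu namespace

comm_stream = torch.npu.Stream()
after_comm_event = torch.npu.Event()
output = torch.empty(8, 16, device="npu")
output_decode = torch.ones(4, 16, device="npu")

with torch.npu.stream(comm_stream):
    output[:4] = output_decode      # async copy issued on the comm stream
    after_comm_event.record()       # mark the copy as finished

after_comm_event.wait()             # default stream waits before reading output
print(output[:4].sum())             # safe: the copy is ordered before this read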