Commit 30ac3d8
[0.9.1][Bugfix] fix oom issue in mla and enable mla_pa for deepseek mla decode (#1311)
### What this PR does / why we need it?

After the disaggregated PD feature was merged, the KV cache for DeepSeek became two independent buffers, used for KV transfer and computation respectively. However, the current kernel, `paged_attention_mla`, only accepts the k_cache as a single tensor, which forces us to concatenate the two buffers before attention and thus incurs a memory peak inside the attention in eager mode. In this PR we introduce `torch_npu.atb.npu_multi_head_latent_attention` for the MLA decode path; it will become the default path for both eager mode and aclgraph once the corresponding torch_npu release is publicly available. Since that package is still restricted, we add the `VLLM_ASCEND_MLA_PA` flag to control its usage. This flag will be removed in the future.

### Does this PR introduce _any_ user-facing change?

Yes, it adds a new flag named `VLLM_ASCEND_MLA_PA`, but the flag will eventually be removed once the newest torch_npu is released.

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
1 parent 822de15 commit 30ac3d8
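To make the memory peak described above concrete, here is a minimal sketch of why concatenating the two cache buffers in eager mode allocates a third, combined buffer. The shapes below are hypothetical, chosen only for illustration; the real shapes come from the model config.

```python
import torch

# Hypothetical shapes for illustration only.
num_blocks, block_size, num_kv_heads = 4, 128, 1
kv_lora_rank, rope_dim = 512, 64

# After disaggregated PD, the MLA KV cache is two independent buffers.
nope_cache = torch.empty(num_blocks, block_size, num_kv_heads, kv_lora_rank)
rope_cache = torch.empty(num_blocks, block_size, num_kv_heads, rope_dim)

# The legacy kernel accepts only a single key_cache, so the two buffers must
# be concatenated first. torch.cat materializes a third buffer of the
# combined size -- the transient memory peak this PR avoids on the MLA_PA
# path, where both buffers are passed to the kernel directly.
combined = torch.cat([nope_cache, rope_cache], dim=-1)
assert combined.shape[-1] == kv_lora_rank + rope_dim
```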

File tree

6 files changed, +52 -35 lines changed

examples/disaggregate_prefill_v1/README.md

Lines changed: 5 additions & 9 deletions

````diff
@@ -30,15 +30,14 @@ Execution Sequence
 
 * Run prefill server P1 on first node
 ```shell
-export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
-export GLOO_SOCKET_IFNAME="eth0"
+export HCCL_IF_IP=172.19.32.175 # node ip
+export GLOO_SOCKET_IFNAME="eth0" # network card name
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_VERSION=0.9.1
 vllm serve /data01/deepseek_r1_w8a8_zhw \
   --host 0.0.0.0 \
   --port 20002 \
@@ -71,15 +70,14 @@ vllm serve /data01/deepseek_r1_w8a8_zhw \
 
 * Run prefill server P2 on second node
 ```shell
-export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
+export HCCL_IF_IP=172.19.241.49
 export GLOO_SOCKET_IFNAME="eth0"
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_VERSION=0.9.1
 vllm serve /data01/deepseek_r1_w8a8_zhw \
   --host 0.0.0.0 \
   --port 20002 \
@@ -113,15 +111,14 @@ vllm serve /data01/deepseek_r1_w8a8_zhw \
 
 * Run decode server d1 on third node
 ```shell
-export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
+export HCCL_IF_IP=172.19.123.51
 export GLOO_SOCKET_IFNAME="eth0"
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_VERSION=0.9.1
 vllm serve /data01/deepseek_r1_w8a8_zhw \
   --host 0.0.0.0 \
   --port 20002 \
@@ -154,15 +151,14 @@ vllm serve /data01/deepseek_r1_w8a8_zhw \
 
 * Run decode server d2 on last node
 ```shell
-export HCCL_IF_IP=`hostname -I|awk -F " " '{print$1}'`
+export HCCL_IF_IP=172.19.190.36
 export GLOO_SOCKET_IFNAME="eth0"
 export TP_SOCKET_IFNAME="eth0"
 export HCCL_SOCKET_IFNAME="eth0"
 export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
 export OMP_PROC_BIND=false
 export OMP_NUM_THREADS=100
 export VLLM_USE_V1=1
-export VLLM_VERSION=0.9.1
 vllm serve /data01/deepseek_r1_w8a8_zhw \
   --host 0.0.0.0 \
   --port 20002 \
````

examples/disaggregate_prefill_v1/gen_ranktable.sh

Lines changed: 0 additions & 1 deletion

```diff
@@ -8,7 +8,6 @@ while [[ $# -gt 0 ]]; do
   case "$1" in
     --ips)
       shift
-      # Collect all subsequent arguments until the next option or the end of input
       while [[ $# -gt 0 && ! "$1" == --* ]]; do
         IPs+=("$1")
         shift
```
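The deleted comment (translated above) describes the greedy "collect values until the next `--option`" pattern used by the shell loop. A minimal Python sketch of the same idea, with a hypothetical argument list for illustration:

```python
def collect_option_values(argv: list[str]) -> dict[str, list[str]]:
    """Group values under the preceding --option, mirroring the shell loop."""
    options: dict[str, list[str]] = {}
    current = None
    for arg in argv:
        if arg.startswith("--"):
            current = arg          # a new option starts a new value group
            options[current] = []
        elif current is not None:
            options[current].append(arg)  # collect until the next --option
    return options

# Hypothetical invocation mirroring `gen_ranktable.sh --ips 10.0.0.1 10.0.0.2`
print(collect_option_values(["--ips", "10.0.0.1", "10.0.0.2"]))
# {'--ips': ['10.0.0.1', '10.0.0.2']}
```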

vllm_ascend/attention/mla_v1.py

Lines changed: 37 additions & 22 deletions

```diff
@@ -13,6 +13,7 @@
                                 UnquantizedLinearMethod)
 from vllm.utils import cdiv, round_down
 
+from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -933,18 +934,12 @@ def _forward_decode(
         q_pe: torch.Tensor,
         k_nope: torch.Tensor,
         k_pe: torch.Tensor,
-        kv_c_and_k_pe_cache: torch.Tensor,
+        kv_c_and_k_pe_cache: Tuple[torch.Tensor],
         attn_metadata: AscendMLAMetadata,
     ) -> torch.Tensor:
         decode_meta = attn_metadata.decode
         assert decode_meta is not None
-
-        q = torch.cat([q_nope, q_pe], dim=-1)
-        num_tokens = q.size(0)
-        attn_output = torch.empty(
-            [num_tokens, self.num_heads, self.kv_lora_rank],
-            dtype=q.dtype,
-            device=q.device)
+        num_tokens = q_nope.size(0)
         if self.running_in_graph:
             # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim]
             if attn_metadata.attn_state == AscendAttentionState.SpecDecoding:
@@ -1003,16 +998,35 @@ def _forward_decode(
                 actual_seq_lengths_kv=decode_meta.seq_lens_list,
             )
         else:
-            torch_npu._npu_paged_attention_mla(
-                query=q,
-                key_cache=kv_c_and_k_pe_cache,
-                num_kv_heads=self.num_kv_heads,
-                num_heads=self.num_heads,
-                scale_value=self.scale,
-                block_table=attn_metadata.decode.block_table,  # type:ignore
-                context_lens=attn_metadata.decode.seq_lens,  # type:ignore
-                mla_vheadsize=self.kv_lora_rank,
-                out=attn_output)
+            # The MLA_PA path will become the default in the future;
+            # `_npu_paged_attention_mla` will be removed once the torch_npu
+            # release that contains `torch_npu.atb.npu_multi_head_latent_attention`
+            # is publicly available.
+            assert len(kv_c_and_k_pe_cache) > 1
+            if envs.VLLM_ASCEND_MLA_PA:
+                attn_output = torch_npu.atb.npu_multi_head_latent_attention(
+                    q_nope, q_pe, kv_c_and_k_pe_cache[0],
+                    kv_c_and_k_pe_cache[1], attn_metadata.decode.block_table,
+                    attn_metadata.decode.seq_lens, self.num_heads, self.scale,
+                    self.num_kv_heads)
+            else:
+                q = torch.cat([q_nope, q_pe], dim=-1)
+                attn_output = torch.empty(
+                    [num_tokens, self.num_heads, self.kv_lora_rank],
+                    dtype=q.dtype,
+                    device=q.device)
+                k_cache = torch.cat(
+                    [kv_c_and_k_pe_cache[0], kv_c_and_k_pe_cache[1]], dim=-1)
+                torch_npu._npu_paged_attention_mla(
+                    query=q,
+                    key_cache=k_cache,
+                    num_kv_heads=self.num_kv_heads,
+                    num_heads=self.num_heads,
+                    scale_value=self.scale,
+                    block_table=attn_metadata.decode.block_table,  # type:ignore
+                    context_lens=attn_metadata.decode.seq_lens,  # type:ignore
+                    mla_vheadsize=self.kv_lora_rank,
+                    out=attn_output)
         current_ms_metadata = get_multistream_comm_context()
         if current_ms_metadata is None:
             return self._v_up_proj_and_o_proj(attn_output)
@@ -1193,10 +1207,11 @@ def forward(
                                           decode_k_nope, decode_k_pe,
                                           kv_cache, attn_metadata)
         else:
-            combined_cache = torch.cat([kv_cache[0], kv_cache[1]], dim=-1)
-            output_decode = self._forward_decode(
-                decode_ql_nope, decode_q_pe, decode_k_nope, decode_k_pe,
-                combined_cache, attn_metadata)
+            output_decode = self._forward_decode(decode_ql_nope,
+                                                 decode_q_pe,
+                                                 decode_k_nope,
+                                                 decode_k_pe, kv_cache,
+                                                 attn_metadata)
         current_ms_metadata = get_multistream_comm_context()
         if current_ms_metadata is not None:
             with torch.npu.stream(current_ms_metadata.comm_stream):
```

vllm_ascend/envs.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -132,7 +132,11 @@
     # rpc communication listening port, which will be used to receive the agent metadata from the
     # remote worker.
     "VLLM_LLMDD_RPC_PORT":
-    lambda: int(os.getenv("VLLM_LLMDD_RPC_PORT", 5557))
+    lambda: int(os.getenv("VLLM_LLMDD_RPC_PORT", 5557)),
+    # Whether to enable mla_pa for deepseek mla decode. This flag will be removed
+    # once the torch_npu that provides it is publicly accessible, at which point
+    # mla_pa will become the default deepseek decode path.
+    "VLLM_ASCEND_MLA_PA":
+    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0))
 }
 
 # end-env-vars-definition
```
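As a usage sketch: the attribute-style lookup below follows how the mla_v1.py diff itself reads the flag (`envs.VLLM_ASCEND_MLA_PA`); setting the variable before the import is an assumption about a typical launch script, made here to be safe about when the value is read.

```python
import os

# Opt in to the MLA_PA decode path before vllm_ascend reads its env vars.
os.environ["VLLM_ASCEND_MLA_PA"] = "1"

from vllm_ascend import envs

if envs.VLLM_ASCEND_MLA_PA:
    print("deepseek MLA decode will use torch_npu.atb.npu_multi_head_latent_attention")
```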

vllm_ascend/models/deepseek_v2.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -314,7 +314,7 @@ def forward(
         is_prefill = is_prefill or attn_metadata.with_prefill_across_dp
         # If this node is kv_consumer, we force the moe always runs in decode path to make sure
         # the behaviour aligned between dummy_run and normal model_execute.
-        if self.kv_consumer is not None:
+        if self.kv_consumer:
             is_prefill = False
             enable_force_load_balance = False
```
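The one-line change matters because `is not None` and truthiness disagree on falsy-but-present values. A minimal illustration (the value below is a hypothetical stand-in, not the actual type of `kv_consumer`):

```python
kv_consumer = 0  # hypothetical falsy value standing in for "no consumer role"

# Old check: fires for any non-None value, even falsy ones,
# wrongly forcing the decode path.
if kv_consumer is not None:
    print("old check fires")

# New check: fires only when kv_consumer is actually truthy.
if kv_consumer:
    print("new check fires")  # not reached for a falsy value
```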

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 4 additions & 1 deletion

```diff
@@ -121,7 +121,10 @@ def fused_experts_with_mc2(
     if log2phy:
         topk_ids = log2phy[topk_ids]
     global_bs = 0
-    moe_expert_num = len(expert_map) + global_redundant_expert_num
+    if expert_map is not None:
+        moe_expert_num = len(expert_map) + global_redundant_expert_num
+    else:
+        moe_expert_num = global_redundant_expert_num
     # hidden_states = hidden_states.bfloat16()
     kwargs_mc2 = {
         "x": hidden_states,
```
