add fix routing for performance test

洪炜杰 · hahazhky · commit 588cd4ba3cc1 · 2025-06-05T14:36:05.000+08:00
Signed-off-by: zhky &lt;hahazhky@163.com&gt;
diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py
@@ -21,6 +21,7 @@
 Run `pytest tests/test_offline_inference.py`.
 """
 import os
+import pytest
 
 import vllm  # noqa: F401
 
@@ -46,7 +47,8 @@ def test_models_distributed_QwQ():
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-def test_models_distributed_DeepSeek():
+@pytest.mark.parametrize("enable_expert_parallel", [True, False])
+def test_models_distributed_DeepSeek(enable_expert_parallel):
     example_prompts = [
         "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
         "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
@@ -58,6 +60,7 @@ def test_models_distributed_DeepSeek():
             "deepseek-ai/DeepSeek-V2-Lite",
             dtype=dtype,
             tensor_parallel_size=4,
+            enable_expert_parallel=enable_expert_parallel,
             distributed_executor_backend="mp",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py
@@ -88,15 +88,14 @@ def fused_experts_with_mc2(
         0:5]
 
     w1 = w1.transpose(1, 2)
-    expert_token_nums = torch.cumsum(expert_token_nums,
-                                     dim=0,
-                                     dtype=torch.int64)
+
     group_list = expert_token_nums.to(torch.int64)
     gate_up_out_list = torch_npu.npu_grouped_matmul(
         x=[expand_x],
         weight=[w1],
         split_item=2,
-        group_list_type=0,
+        # 1 means count mode, to avoid cumulative operation of the group list
+        group_list_type=1,
         group_type=0,
         group_list=group_list,
     )
@@ -110,7 +109,7 @@ def fused_experts_with_mc2(
         x=[gate_up_out],
         weight=[w2],
         split_item=2,
-        group_list_type=0,
+        group_list_type=1,
         group_type=0,
         group_list=group_list,
     )