
Commit a2afe32

洪炜杰 (hahazhky) authored and committed
add fix routing for performance test
Signed-off-by: zhky <hahazhky@163.com>
1 parent: eb2701e

File tree

3 files changed: +41 -8 lines changed

tests/multicard/test_offline_inference_distributed.py

Lines changed: 18 additions & 0 deletions
@@ -61,3 +61,21 @@ def test_models_distributed_DeepSeek():
             distributed_executor_backend="mp",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
+
+def test_models_distributed_fix_route_DeepSeek():
+    os.environ["VLLM_ENABLE_FIX_ROUTE"] = "1"
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    max_tokens = 5
+    with VllmRunner(
+            "deepseek-ai/DeepSeek-V2-Lite",
+            dtype=dtype,
+            tensor_parallel_size=8,
+            enable_expert_parallel=True,
+            distributed_executor_backend="mp",
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
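
For reference, the new case can be selected on its own with a standard pytest node id (this assumes a host with 8 NPUs, since the runner is built with tensor_parallel_size=8):

pytest -s tests/multicard/test_offline_inference_distributed.py::test_models_distributed_fix_route_DeepSeek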

vllm_ascend/envs.py

Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,9 @@
     lambda: os.getenv("VLLM_VERSION", None),
     "VLLM_ASCEND_TRACE_RECOMPILES":
     lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))),
+    # dispatch tokens evenly across experts, for performance testing
+    "VLLM_ENABLE_FIX_ROUTE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_FIX_ROUTE", '0'))),
 }
 
 # end-env-vars-definition
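
Like the other switches in this table, the flag is parsed with bool(int(...)), so it should be set to an integer string; any non-zero value enables it. A minimal standalone sketch of how the lambda resolves (standard library only):

import os

os.environ["VLLM_ENABLE_FIX_ROUTE"] = "1"
enabled = bool(int(os.getenv("VLLM_ENABLE_FIX_ROUTE", '0')))
print(enabled)  # True; unset or "0" yields False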

vllm_ascend/ops/fused_moe.py

Lines changed: 20 additions & 8 deletions
@@ -36,6 +36,7 @@
 
 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM
+VLLM_ENABLE_FIX_ROUTE: bool = envs_ascend.VLLM_ENABLE_FIX_ROUTE
 
 
 def fused_experts_with_mc2(
@@ -50,6 +51,14 @@ def fused_experts_with_mc2(
 ) -> torch.Tensor:
     global_bs = 0
     moe_expert_num = len(expert_map)
+
+    rank = torch.distributed.get_rank()
+    if VLLM_ENABLE_FIX_ROUTE:
+        step = hidden_states.shape[0] * top_k
+        uniform_topk_list = [(i + rank) % moe_expert_num
+                             for i in range(rank * step, (rank + 1) * step)]
+        topk_ids = torch.Tensor(uniform_topk_list).int().view(
+            hidden_states.shape[0], -1).to(hidden_states.device)
     kwargs = {
         "x": hidden_states,
         "expert_ids": topk_ids,
@@ -59,8 +68,6 @@ def fused_experts_with_mc2(
         "global_bs": global_bs,
     }
 
-    rank = torch.distributed.get_rank()
-
     quant_mode = 0
     ep_group = get_ep_group().device_group
     local_rank = torch.distributed.get_rank(group=ep_group)
@@ -88,15 +95,20 @@ def fused_experts_with_mc2(
                                                               0:5]
 
     w1 = w1.transpose(1, 2)
-    expert_token_nums = torch.cumsum(expert_token_nums,
-                                     dim=0,
-                                     dtype=torch.int64)
-    group_list = expert_token_nums.to(torch.int64)
+
+    if VLLM_ENABLE_FIX_ROUTE:
+        uniform_group_list = hidden_states.shape[0] * \
+            all_to_all_group_size * top_k // moe_expert_num
+        group_list = torch.Tensor([uniform_group_list] *
+                                  w1.shape[0]).long().to(hidden_states.device)
+    else:
+        group_list = expert_token_nums
     gate_up_out_list = torch_npu.npu_grouped_matmul(
         x=[expand_x],
         weight=[w1],
         split_item=2,
-        group_list_type=0,
+        # 1 means count mode, to avoid cumulative operation of the group list
+        group_list_type=1,
         group_type=0,
         group_list=group_list,
     )
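
Two changes interact here. Because fixed routing dispatches tokens uniformly, the per-expert token count is known in closed form, so the fixed-route branch fills group_list with one constant per local expert (w1.shape[0] of them) instead of deriving it from expert_token_nums. And since npu_grouped_matmul is now called with group_list_type=1 (count mode, per the in-diff comment), raw counts are passed directly, which is why the torch.cumsum that built cumulative offsets for type 0 is removed. A toy version of the count arithmetic, with illustrative numbers:

# illustrative sizes, not taken from the commit
num_tokens, all_to_all_group_size, top_k = 4, 16, 2
moe_expert_num, local_experts = 8, 2  # local_experts stands in for w1.shape[0]

uniform_group_list = num_tokens * all_to_all_group_size * top_k // moe_expert_num
group_list = [uniform_group_list] * local_experts
print(group_list)  # [16, 16]: every local expert processes the same batch size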
@@ -110,7 +122,7 @@ def fused_experts_with_mc2(
         x=[gate_up_out],
         weight=[w2],
         split_item=2,
-        group_list_type=0,
+        group_list_type=1,
         group_type=0,
         group_list=group_list,
     )
