Skip to content

Commit 8d3727b

Browse files
author
洪炜杰
committed
add fixed routing for performance test
1 parent e2a0c19 commit 8d3727b

File tree

2 files changed

+20
-8
lines changed

2 files changed

+20
-8
lines changed

vllm_ascend/envs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@
6666
lambda: os.getenv("C_COMPILER", None),
6767
"VLLM_VERSION":
6868
lambda: os.getenv("VLLM_VERSION", None),
69+
# dispatch tokens to experts evenly for performance testing
70+
"VLLM_ENABLE_FIX_ROUTE":
71+
lambda: bool(int(os.getenv("VLLM_ENABLE_FIX_ROUTE", '0'))),
6972
}
7073

7174
# end-env-vars-definition

vllm_ascend/ops/fused_moe.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444

4545
VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
4646
USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM
47+
VLLM_ENABLE_FIX_ROUTE: bool = envs_ascend.VLLM_ENABLE_FIX_ROUTE
4748

4849

4950
def fused_experts_with_mc2(
@@ -58,6 +59,14 @@ def fused_experts_with_mc2(
5859
) -> torch.Tensor:
5960
global_bs = 0
6061
moe_expert_num = len(expert_map)
62+
63+
rank = torch.distributed.get_rank()
64+
if VLLM_ENABLE_FIX_ROUTE:
65+
step = hidden_states.shape[0] * top_k
66+
uniform_topk_list = [
67+
(i + rank) % moe_expert_num for i in range(rank * step, (rank + 1) * step)
68+
]
69+
topk_ids = torch.Tensor(uniform_topk_list).int().view(hidden_states.shape[0], -1).npu()
6170
kwargs = {
6271
"x": hidden_states,
6372
"expert_ids": topk_ids,
@@ -67,8 +76,6 @@ def fused_experts_with_mc2(
6776
"global_bs": global_bs,
6877
}
6978

70-
rank = torch.distributed.get_rank()
71-
7279
quant_mode = 0
7380
ep_group = get_ep_group().device_group
7481
local_rank = torch.distributed.get_rank(group=ep_group)
@@ -97,15 +104,17 @@ def fused_experts_with_mc2(
97104
0:5]
98105

99106
w1 = w1.transpose(1, 2)
100-
expert_token_nums = torch.cumsum(expert_token_nums,
101-
dim=0,
102-
dtype=torch.int64)
103-
group_list = expert_token_nums.to(torch.int64)
107+
108+
if VLLM_ENABLE_FIX_ROUTE:
109+
uniform_group_list = hidden_states.shape[0] * all_to_all_group_size * top_k // moe_expert_num
110+
group_list = torch.Tensor([uniform_group_list] * w1.shape[0]).long().npu()
111+
else:
112+
group_list = expert_token_nums
104113
gate_up_out_list = torch_npu.npu_grouped_matmul(
105114
x=[expand_x],
106115
weight=[w1],
107116
split_item=2,
108-
group_list_type=0,
117+
group_list_type=1,
109118
group_type=0,
110119
group_list=group_list,
111120
)
@@ -119,7 +128,7 @@ def fused_experts_with_mc2(
119128
x=[gate_up_out],
120129
weight=[w2],
121130
split_item=2,
122-
group_list_type=0,
131+
group_list_type=1,
123132
group_type=0,
124133
group_list=group_list,
125134
)

0 commit comments

Comments
 (0)