49 | 49 | from vllm.model_executor.sampling_metadata import SamplingMetadata |
50 | 50 | from vllm.sequence import IntermediateTensors |
51 | 51 |
| 52 | +from vllm_ascend.ascend_config import get_ascend_config |
52 | 53 | from vllm_ascend.distributed.parallel_state import get_ep_group |
53 | 54 |
54 | 55 | logger = init_logger(__name__) |
@@ -102,38 +103,48 @@ def pangu_group8_topk( |
102 | 103 |     hidden_states: torch.Tensor, |
103 | 104 |     gating_output: torch.Tensor, |
104 | 105 |     topk: int, |
105 | | -    renormalize: bool, |
| 106 | +    renormalize: bool = False, |
106 | 107 |     num_expert_group: int = 0, |
107 | 108 |     topk_group: int = 0, |
108 | 109 |     global_num_experts: int = 0, |
109 | 110 | ): |
110 | | -    ep_size = get_ep_group().world_size |
111 | | -    local_num_experts = global_num_experts // ep_size |
112 | | -    local_num_group = topk // ep_size |
113 | | -    router_scale = _ROUTER_SCALE.squeeze()  # type: ignore |
114 | | -    scores = F.softmax(gating_output, dim=1) |
115 | | -    scores = scores[..., |
116 | | -                    get_ep_group().rank_in_group * |
117 | | -                    local_num_experts:(get_ep_group().rank_in_group + 1) * |
118 | | -                    local_num_experts] |
119 | | - |
120 | | -    router_weights = router_scale[get_ep_group().rank_in_group * |
121 | | -                                  local_num_experts: |
122 | | -                                  (get_ep_group().rank_in_group + 1) * |
123 | | -                                  local_num_experts] |
124 | | -    topk_weights, topk_ids = torch.max(scores.view(scores.shape[0], |
125 | | -                                                   local_num_group, -1), |
126 | | -                                       dim=-1) |
127 | | -    bias = torch.arange(0, |
128 | | -                        local_num_experts, |
129 | | -                        topk, |
130 | | -                        device=scores.device, |
131 | | -                        dtype=torch.int32).unsqueeze(0) |
132 | | -    topk_ids = topk_ids.to(torch.int32) + bias |
| 111 | +    local_num_experts = global_num_experts  # no expert-parallel slicing: every rank scores the full expert set |
| 112 | +    local_num_group = topk |
| 113 | +    scores = F.softmax(gating_output, dim=1, dtype=torch.float16) |
| 114 | +    num_tokens = scores.shape[0] |
| 115 | +    router_weights = _ROUTER_SCALE.squeeze().to(torch.float16) |
| 116 | +
| 117 | +    if _NUM_VOTED_EXPERTS == 8:  # module-level global set in __init__; this free function has no self |
| 118 | +        # use the original per-group argmax top-k |
| 119 | +        topk_weights, topk_ids = torch.max(scores.view(num_tokens, local_num_group, -1), dim=-1) |
| 120 | +        bias = torch.arange(0, local_num_experts, topk, device=scores.device, dtype=torch.int32).unsqueeze(0) |
| 121 | +        topk_ids = topk_ids.to(torch.int32) + bias |
133 | 122 |
| 123 | +    else: |
| 124 | +        k = _NUM_VOTED_EXPERTS |
| 125 | +        experts_per_group = local_num_experts // local_num_group |
| 126 | +        group_expert_indices = torch.arange(experts_per_group, dtype=torch.int32, device=scores.device).view(1, 1, -1) |
| 127 | +        group_expert_offset = (torch.arange(local_num_group, dtype=torch.int32, device=scores.device) * experts_per_group).unsqueeze(0) |
| 128 | +        expert_index_range = torch.arange(experts_per_group, dtype=torch.int32, device=scores.device) |
| 129 | + |
| 130 | +        scores_grouped = scores.view(num_tokens, local_num_group, experts_per_group) |
| 131 | +        best_expert_idx = torch.argmax(scores_grouped, dim=2)  # each token's best expert per group: (num_tokens, num_groups) |
| 132 | +        vote_mask = (best_expert_idx.unsqueeze(-1).to(torch.int32) == group_expert_indices).to(torch.float16) |
| 133 | + |
| 134 | +        expert_vote_freq = vote_mask.sum(dim=0)  # votes per expert, per group: (num_groups, experts_per_group) |
| 135 | + |
| 136 | +        sorted_indices = torch.argsort(expert_vote_freq, dim=1, descending=True).to(torch.int32) |
| 137 | +        topk_experts = sorted_indices[:, :k]  # the k most-voted experts in each group |
| 138 | +        keep_mask = ((topk_experts.unsqueeze(-1) == expert_index_range).any(dim=1)).unsqueeze(0) |
| 139 | + |
| 140 | +        masked_scores = torch.where(keep_mask, scores_grouped, 0)  # zero out experts that lost the vote |
| 141 | + |
| 142 | +        topk_weights, best_pos_in_group = masked_scores.max(dim=2) |
| 143 | +        best_pos_in_group = best_pos_in_group.to(torch.int32) |
| 144 | +        topk_ids = (best_pos_in_group + group_expert_offset).to(torch.int32) |
| 145 | + |
134 | 146 |     flatten_topk_ids = topk_ids.view(-1) |
135 | | -    router_weights = router_weights.index_select(0, flatten_topk_ids).view( |
136 | | -        topk_ids.shape) |
| 147 | +    router_weights = router_weights.index_select(0, flatten_topk_ids).view(topk_ids.shape) |
137 | 148 |     topk_weights *= router_weights |
138 | 149 |
139 | 150 |     return topk_weights, topk_ids |
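To make the voting branch easier to follow outside the diff, here is a minimal standalone sketch of the same three steps: per-group voting, shortlisting the most-voted experts, then a per-token argmax over the survivors. The name `voted_group_topk`, the 64-expert / 8-group shapes, and the toy input are illustrative assumptions rather than part of this PR, and the `_ROUTER_SCALE` multiplication applied afterwards in the real code is omitted:

```python
import torch
import torch.nn.functional as F

def voted_group_topk(gating_output: torch.Tensor,
                     topk: int = 8,
                     num_experts: int = 64,
                     num_voted_experts: int = 4):
    # Softmax over all experts, in fp16 as in the diff.
    scores = F.softmax(gating_output, dim=1, dtype=torch.float16)
    num_tokens = scores.shape[0]
    experts_per_group = num_experts // topk

    grouped = scores.view(num_tokens, topk, experts_per_group)
    # Step 1: every token "votes" for its best expert inside each group.
    best = grouped.argmax(dim=2)                                  # (num_tokens, topk)
    votes = F.one_hot(best, experts_per_group).to(torch.float16).sum(dim=0)
    # Step 2: shortlist the num_voted_experts most-voted experts per group.
    keep = votes.argsort(dim=1, descending=True)[:, :num_voted_experts]
    arange = torch.arange(experts_per_group, device=scores.device)
    keep_mask = (keep.unsqueeze(-1) == arange).any(dim=1)         # (topk, experts_per_group)
    # Step 3: per-token argmax restricted to the surviving experts.
    masked = torch.where(keep_mask, grouped, torch.zeros_like(grouped))
    topk_weights, pos = masked.max(dim=2)
    offsets = torch.arange(topk, device=scores.device) * experts_per_group
    return topk_weights, (pos + offsets).to(torch.int32)

w, ids = voted_group_topk(torch.randn(4, 64))
print(w.shape, ids.shape)  # torch.Size([4, 8]) torch.Size([4, 8])
```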
@@ -192,6 +203,9 @@ def __init__( |
192 | 203 |             ) |
193 | 204 |         else: |
194 | 205 |             self.shared_expert = None  # type: ignore |
| 206 | + |
| 207 | +        global _NUM_VOTED_EXPERTS  # mirror the _ROUTER_SCALE pattern: pangu_group8_topk is a free function |
| 208 | +        _NUM_VOTED_EXPERTS = get_ascend_config().ascend_model_config.num_voted_experts |
195 | 209 |
196 | 210 |     def forward( |
197 | 211 |         self, |
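On the configuration side, the new knob is read once at construction time via `get_ascend_config()`. Assuming it is supplied the way other vllm-ascend options are, through vLLM's `additional_config`, with a key layout mirroring the attribute path `ascend_model_config.num_voted_experts` (the exact layout is an assumption, not shown in this diff), enabling five-expert voting might look like:

```python
from vllm import LLM

# Hypothetical wiring: the nested keys mirror the attribute path
# get_ascend_config().ascend_model_config.num_voted_experts read in
# __init__; the key layout and model path are placeholders.
llm = LLM(
    model="path/to/pangu-pro-moe",
    additional_config={"ascend_model_config": {"num_voted_experts": 5}},
)
```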
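As a quick sanity check, reusing `voted_group_topk` from the sketch above (the seed and shapes are arbitrary): when `num_voted_experts` equals the per-group expert count, every expert survives the vote, so the voting branch should reproduce the `== 8` fast path exactly:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
gating = torch.randn(4, 64)

# Fast path, mirroring the `if` branch: per-group argmax plus a stride-8 bias.
scores = F.softmax(gating, dim=1, dtype=torch.float16)
fast_w, fast_pos = torch.max(scores.view(4, 8, -1), dim=-1)
bias = torch.arange(0, 64, 8, dtype=torch.int32).unsqueeze(0)
fast_ids = fast_pos.to(torch.int32) + bias

# Voting path with all eight experts per group kept: nothing is masked out.
vote_w, vote_ids = voted_group_topk(gating, num_voted_experts=8)

assert torch.equal(fast_w, vote_w)
assert torch.equal(fast_ids, vote_ids)
```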