
Commit 4969046

Refactor MoE parameter initialization for flexibility
Pass the Hugging Face configuration object directly to the MoE communication method constructor. This allows the method to handle different attribute names for MoE parameters, such as `num_experts` and `n_routed_experts`. This change improves robustness and makes the implementation more compatible with various MoE model configurations.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent fc3899e commit 4969046
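For illustration, here is the lookup pattern this commit introduces, extracted into a standalone helper (the helper name `resolve_moe_params` is hypothetical; the attribute names and defaults come from the diff below):

from transformers.configuration_utils import PretrainedConfig


def resolve_moe_params(hf_config: PretrainedConfig) -> tuple[int, int]:
    # Top-k routing width; defaults to 0 when the config lacks the field.
    top_k_num = getattr(hf_config, "num_experts_per_tok", 0)
    # The global expert count goes by different names across MoE models,
    # e.g. `num_experts` (Qwen-style) vs. `n_routed_experts` (DeepSeek-style).
    for key in ("num_experts", "n_routed_experts"):
        if hasattr(hf_config, key):
            return top_k_num, getattr(hf_config, key)
    return top_k_num, 0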

2 files changed: 16 additions & 14 deletions

vllm_ascend/distributed/moe_comm_method.py

Lines changed: 13 additions & 7 deletions
@@ -2,6 +2,7 @@
 
 import torch
 import torch_npu
+from transformers.configuration_utils import PretrainedConfig
 from vllm.distributed.parallel_state import get_ep_group, get_tp_group
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
@@ -17,13 +18,19 @@ def __init__(
         self,
         device: torch.device,
         dtype: torch.dtype,
-        top_k_num: int,
-        global_num_experts: int,
+        hf_config: PretrainedConfig,
     ):
         self.device = device
         self.dtype = dtype
-        self.top_k_num = top_k_num
-        self.global_num_experts = global_num_experts
+        self.top_k_num = getattr(hf_config, "num_experts_per_tok", 0)
+        # global_num_experts may be called num_experts or n_routed_experts in different models.
+        possible_keys = ["num_experts", "n_routed_experts"]
+        for key in possible_keys:
+            if hasattr(hf_config, key):
+                self.global_num_experts = getattr(hf_config, key)
+                break
+        else:
+            self.global_num_experts = 0
 
     @abstractmethod
     def _pre_process(
@@ -232,10 +239,9 @@ def __init__(
         self,
         device: torch.device,
         dtype: torch.dtype,
-        top_k_num: int,
-        global_num_experts: int,
+        hf_config: PretrainedConfig,
     ):
-        super().__init__(device, dtype, top_k_num, global_num_experts)
+        super().__init__(device, dtype, hf_config)
 
         # Shared communication configurations
         ep_group = get_mc2_group()
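As a quick sanity check of the fallback in the constructor above (building bare `PretrainedConfig` objects here is purely illustrative; real callers pass `self.model_config.hf_config`):

from transformers.configuration_utils import PretrainedConfig

# Two minimal configs that expose the expert count under different names.
qwen_like = PretrainedConfig(num_experts_per_tok=8, num_experts=64)
deepseek_like = PretrainedConfig(num_experts_per_tok=8, n_routed_experts=64)

for cfg in (qwen_like, deepseek_like):
    top_k_num = getattr(cfg, "num_experts_per_tok", 0)
    global_num_experts = next(
        (getattr(cfg, key) for key in ("num_experts", "n_routed_experts")
         if hasattr(cfg, key)), 0)
    assert (top_k_num, global_num_experts) == (8, 64)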

vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 7 deletions
@@ -1324,10 +1324,8 @@ def _process_reqs(
                 num_tokens_across_dp=num_tokens_across_dp,
                 with_prefill=with_prefill,
                 reserved_mc2_mask=self.reserved_mc2_mask,
-                moe_comm_method=moe_comm_method(
-                    self.device, self.dtype,
-                    self.model_config.hf_config.num_experts_per_tok,
-                    self.model_config.hf_config.num_experts),
+                moe_comm_method=moe_comm_method(self.device, self.dtype,
+                                                self.model_config.hf_config),
                 num_actual_tokens=total_num_scheduled_tokens):
             with ProfileExecuteDuration().capture_async("forward"):
                 self.maybe_setup_kv_connector(scheduler_output)
@@ -1990,9 +1988,7 @@ def _dummy_run(
                     in_profile_run=self.in_profile_run,
                     reserved_mc2_mask=self.reserved_mc2_mask,
                     moe_comm_method=moe_comm_method(
-                        self.device, self.dtype,
-                        self.model_config.hf_config.num_experts_per_tok,
-                        self.model_config.hf_config.num_experts),
+                        self.device, self.dtype, self.model_config.hf_config),
                     num_actual_tokens=0,
             ):
                 model_kwargs = {}
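For contrast, the previous call sites read `self.model_config.hf_config.num_experts` directly, which would fail on configs that only define `n_routed_experts`; a minimal reproduction of that failure mode (purely illustrative, not code from the repository):

from transformers.configuration_utils import PretrainedConfig

deepseek_like = PretrainedConfig(num_experts_per_tok=6, n_routed_experts=160)

try:
    _ = deepseek_like.num_experts  # the old call-site pattern
except AttributeError:
    print("old pattern fails: this config only defines n_routed_experts")

# New pattern: hand the whole config to the communication method and let it
# resolve the attribute name itself.
print(getattr(deepseek_like, "n_routed_experts"))  # 160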
