
Commit de9d711

fix(moe): fix moe_comm_method test case
- Moves the token dispatcher import into the `AlltoAllCommImpl` constructor to enable lazy loading.
- Restricts MoE communication method logging to the global first rank to reduce log verbosity.
- Updates MoE communication tests to accommodate a new parameter in the `permute` function.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent d46e1f5 commit de9d711

File tree: 3 files changed (+11, -6 lines)


tests/e2e/multicard/moe/test_moe_comm.py

Lines changed: 6 additions & 2 deletions

@@ -33,6 +33,7 @@
 @pytest.mark.parametrize("top_k_num", [2, 4])
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize("ep_rank", [0, 1])
+@pytest.mark.parametrize("use_a8", [False])
 def test_all_gather_comm_impl(
     num_tokens,
     hidden_size,
@@ -41,6 +42,7 @@ def test_all_gather_comm_impl(
     top_k_num,
     dtype,
     ep_rank,
+    use_a8,
     mocker,
 ):
     """
@@ -118,8 +120,9 @@ def test_all_gather_comm_impl(
         native_permuted_hidden,
         native_expert_tokens,
         _,
+        _,
     ) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
-                            num_experts)
+                            num_experts, use_a8)
     # Simulate MLP output
     native_mlp_output = torch.randn_like(native_permuted_hidden)
     native_impl.unpermute(native_mlp_output, native_hidden_states_out)
@@ -130,8 +133,9 @@
         all_gather_permuted_hidden,
         all_gather_expert_tokens,
         _,
+        _,
     ) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
-                                expert_map, num_experts)
+                                expert_map, num_experts, use_a8)
 
     # Use the same simulated MLP output for a fair comparison
     all_gather_mlp_output = native_mlp_output.clone()
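For context, the `permute` call in this test now takes an extra `use_a8` flag and returns one more value, so each unpacking gains a trailing placeholder. A minimal sketch of that call-site shape, using a stand-in function (the names, return contents, and shapes below are illustrative assumptions, not the real `AllGatherCommImpl` API):

import torch

# Stand-in for the permute() call-site change; return contents are assumed
# for illustration and do not mirror the actual implementation.
def permute_stub(hidden_states, topk_ids, topk_weights, expert_map,
                 num_experts, use_a8=False):
    permuted_hidden = hidden_states.repeat_interleave(topk_ids.shape[1], dim=0)
    expert_tokens = torch.bincount(topk_ids.flatten(), minlength=num_experts)
    # One extra trailing value compared to the old signature, hence the added
    # "_" placeholder in the updated test unpacking.
    return permuted_hidden, expert_tokens, None, None

hidden_states = torch.randn(8, 16, dtype=torch.float16)
topk_ids = torch.randint(0, 4, (8, 2))
topk_weights = torch.rand(8, 2)

(
    permuted_hidden,
    expert_tokens,
    _,
    _,  # new trailing return value, added alongside use_a8
) = permute_stub(hidden_states, topk_ids, topk_weights,
                 expert_map=None, num_experts=4, use_a8=False)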

vllm_ascend/distributed/moe_comm_method.py

Lines changed: 2 additions & 2 deletions

@@ -14,8 +14,6 @@
 from vllm_ascend.distributed.communication_op import \
     data_parallel_reduce_scatter
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
-    get_token_dispatcher
 from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version
 
 
@@ -477,6 +475,8 @@ class AlltoAllCommImpl(MoECommMethod):
 
     def __init__(self, moe_config: Optional[FusedMoEConfig]):
         super().__init__(moe_config)
+        from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
+            get_token_dispatcher
         self.token_dispatcher = get_token_dispatcher(
             "TokenDispatcherWithAll2AllV")
         self._restore_tp_across_dp()
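The change above is a deferred (lazy) import: `get_token_dispatcher` is now resolved only when an `AlltoAllCommImpl` instance is constructed, so importing `moe_comm_method` no longer requires the dispatcher module at load time, which helps avoid import-time cycles. A self-contained sketch of the pattern (the class name and the `json` stand-in are illustrative, not the actual dispatcher):

class LazyDispatcherHolder:
    """Illustrates the deferred-import pattern used in AlltoAllCommImpl.__init__."""

    def __init__(self):
        # The import runs at construction time instead of module import time,
        # so merely importing this module never pulls in the dependency.
        # `json` stands in for the token dispatcher module here.
        import json
        self._dispatcher = json

    def describe(self) -> str:
        return self._dispatcher.dumps({"dispatcher": "TokenDispatcherWithAll2AllV"})


if __name__ == "__main__":
    print(LazyDispatcherHolder().describe())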

vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 2 deletions

@@ -1632,8 +1632,9 @@ def _select_moe_comm_method(self, num_tokens: int) -> str:
         else:
             raise ValueError(f"Unsupported soc_version: {soc_version}")
 
-        logger.debug(f"num_tokens: {num_tokens}, "
-                     f"moe_comm_method: {moe_comm_method}")
+        if is_global_first_rank():
+            logger.debug(f"num_tokens: {num_tokens}, "
+                         f"moe_comm_method: {moe_comm_method}")
 
         return moe_comm_method
 
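This hunk gates the per-call debug line on the global first rank, so multi-rank runs emit it once rather than once per worker. A self-contained sketch of the same gating idea; the rank check and the token threshold below are stand-ins, not vLLM's actual `is_global_first_rank` helper or selection logic:

import logging
import os

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("moe_comm")


def _is_global_first_rank() -> bool:
    # Stand-in rank check: launchers such as torchrun expose the global rank
    # through the RANK environment variable.
    return int(os.environ.get("RANK", "0")) == 0


def select_moe_comm_method(num_tokens: int) -> str:
    # Threshold and method names are assumptions for illustration only.
    moe_comm_method = "allgather" if num_tokens <= 256 else "alltoall"
    if _is_global_first_rank():
        # Only rank 0 logs, so large EP/TP deployments do not repeat this
        # line once per worker process.
        logger.debug(f"num_tokens: {num_tokens}, "
                     f"moe_comm_method: {moe_comm_method}")
    return moe_comm_method


if __name__ == "__main__":
    select_moe_comm_method(128)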

0 commit comments