
Commit 383b307

feat(moe): Add All-to-All communication method
This method leverages an `all-to-all` collective communication pattern, which is more efficient than the existing `all-gather` strategy for large token counts on newer hardware. The model runner now dynamically selects the optimal MoE communication method (`mc2`, `allgather`, or `alltoall`) based on the token count and the underlying Ascend SoC version. Note that all-gather does not yet support quantized models.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent 4812f00 commit 383b307
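
For orientation, the selection rule added in `vllm_ascend/worker/model_runner_v1.py` (full diff below) boils down to the following standalone sketch. The free-function form and the explicit `mc2_tokens_capacity` argument are illustrative only; the enum values and threshold come from the diff.

```python
from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version


def select_moe_comm_method(num_tokens: int, mc2_tokens_capacity: int) -> str:
    """Illustrative restatement of the runner's _select_moe_comm_method."""
    if num_tokens <= mc2_tokens_capacity:
        return "mc2"        # small batches always use MC2 dispatch/combine
    soc_version = get_ascend_soc_version()
    if soc_version == AscendSocVersion.A2:
        return "allgather"  # A2 SoCs keep the all-gather path
    if soc_version == AscendSocVersion.A3:
        return "alltoall"   # A3 SoCs switch to the new all-to-all path
    raise ValueError(f"Unsupported soc_version: {soc_version}")
```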

File tree: 4 files changed, +163 −55 lines changed


vllm_ascend/distributed/moe_comm_method.py

Lines changed: 92 additions & 4 deletions
@@ -14,6 +14,8 @@
 from vllm_ascend.distributed.communication_op import \
     data_parallel_reduce_scatter
 from vllm_ascend.distributed.parallel_state import get_mc2_group
+from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
+    get_token_dispatcher
 from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version
 
 
@@ -55,7 +57,7 @@ def permute(
         expert_map: torch.Tensor,
         num_experts: int,
         use_a8: bool,
-    ) -> tuple[torch.Tensor, torch.Tensor, int]:
+    ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         """Pre-process before MLP.
 
         Args:
@@ -161,7 +163,7 @@ def permute(
         expert_map: torch.Tensor,  # noqa: F841
         num_experts: int,
         use_a8: bool,
-    ) -> tuple[torch.Tensor, torch.Tensor, int]:
+    ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         num_tokens = hidden_states.shape[0]
 
         self.topk_weights = topk_weights
@@ -222,7 +224,7 @@ def permute(
         expert_map: torch.Tensor,
         num_experts: int,
         use_a8: bool,
-    ) -> tuple[torch.Tensor, torch.Tensor, int]:
+    ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         num_tokens = hidden_states.shape[0]
 
         # Generate token indices and flatten
@@ -379,7 +381,7 @@ def permute(
         expert_map: torch.Tensor,
         num_experts: int,
         use_a8: bool,
-    ) -> tuple[torch.Tensor, torch.Tensor, int]:
+    ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
         # Store tensors needed for post_process
         self.topk_ids = topk_ids
         self.topk_weights = topk_weights.to(torch.float32)
@@ -461,3 +463,89 @@ def unpermute(self, mlp_output: torch.Tensor,
         combine = torch_npu.npu_moe_distribute_combine_v2 if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine
 
         hidden_states[:] = combine(**combine_kwargs)
+
+
+class AlltoAllCommImpl(MoECommMethod):
+    """This implementation is for the scenarios listed below:
+    1. `enable_expert_parallel=True`.
+    2. `npu_grouped_matmul` is available.
+
+    This implementation uses all-to-all communication to exchange tokens
+    between data parallel ranks before and after the MLP computation. It should
+    have better performance than AllGatherCommImpl when DP size > 1.
+    """
+
+    def __init__(self, moe_config: Optional[FusedMoEConfig]):
+        super().__init__(moe_config)
+        self.token_dispatcher = get_token_dispatcher(
+            "TokenDispatcherWithAll2AllV")
+        self._restore_tp_across_dp()
+
+    def _restore_tp_across_dp(self):
+        # NOTE: Since vLLM flatten tp across dp, we need to restore the original
+        # tp_size and tp_rank.
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+    def prepare(
+            self, hidden_states: torch.Tensor,
+            router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        self.num_tokens, _ = hidden_states.shape
+        pad_size = self.tp_size - self.num_tokens
+
+        if pad_size > 0:
+            hidden_states = nn.functional.pad(hidden_states,
+                                              (0, 0, 0, pad_size))
+            router_logits = nn.functional.pad(router_logits,
+                                              (0, 0, 0, pad_size))
+
+        if self.tp_size > 1:
+            split_hidden_states = torch.tensor_split(hidden_states,
+                                                     self.tp_size,
+                                                     dim=0)
+            split_router_logits = torch.tensor_split(router_logits,
+                                                     self.tp_size,
+                                                     dim=0)
+            self.split_hidden_states = split_hidden_states
+
+            hidden_states = split_hidden_states[self.tp_rank]
+            router_logits = split_router_logits[self.tp_rank]
+
+        return hidden_states, router_logits
+
+    def finalize(self, hidden_states: torch.Tensor,
+                 reduce_results: bool) -> torch.Tensor:
+        """If TP size > 1, all-gather the hidden states to get the final output.
+
+        Also, unpad the hidden states if needed.
+        """
+        if self.tp_size > 1:
+            dist.all_gather(list(self.split_hidden_states), hidden_states,
+                            self.moe_config.tp_group.device_group)
+            hidden_states = torch.cat(self.split_hidden_states, dim=0)
+
+        if self.num_tokens < hidden_states.shape[0]:
+            hidden_states = hidden_states[:self.num_tokens]
+
+        return hidden_states
+
+    def permute(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        expert_map: torch.Tensor,
+        num_experts: int,
+        use_a8: bool,
+    ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
+        results = self.token_dispatcher.token_dispatch(hidden_states,
+                                                       topk_weights,
+                                                       topk_ids,
+                                                       None,
+                                                       log2phy=None)
+        return results["hidden_states"], results["group_list"], results[
+            "dynamic_scale"], results["group_list_type"]
+
+    def unpermute(self, mlp_output: torch.Tensor,
+                  hidden_states: torch.Tensor) -> None:
+        hidden_states[:] = self.token_dispatcher.token_combine(mlp_output)
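
The `prepare()` step above pads the local batch so that every TP rank receives a chunk, then keeps only this rank's slice; the collective exchange itself happens later in `permute()`/`unpermute()` via the token dispatcher. Below is a minimal CPU-only illustration of just the pad-and-split arithmetic; the tensor sizes and the `tp_size`/`tp_rank` values are made up.

```python
import torch
import torch.nn as nn

# Toy values; in AlltoAllCommImpl these come from the tensor-parallel group.
tp_size, tp_rank = 4, 1
hidden_states = torch.randn(3, 8)    # 3 tokens, hidden size 8
router_logits = torch.randn(3, 16)   # 16 experts

# Pad the token dimension only when there are fewer tokens than TP ranks.
pad_size = tp_size - hidden_states.shape[0]
if pad_size > 0:
    hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, pad_size))
    router_logits = nn.functional.pad(router_logits, (0, 0, 0, pad_size))

split_hidden_states = torch.tensor_split(hidden_states, tp_size, dim=0)
local_chunk = split_hidden_states[tp_rank]  # this rank's share of the tokens
print(local_chunk.shape)                    # torch.Size([1, 8])
```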

vllm_ascend/ops/common_fused_moe.py

Lines changed: 32 additions & 34 deletions
@@ -19,20 +19,20 @@
 
 import torch
 import torch_npu
-from vllm.config import CompilationLevel, get_current_vllm_config
 from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, UnquantizedFusedMoEMethod)
 
-from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.moe_comm_method import (AllGatherCommImpl,
-                                                      MC2CommImpl,
-                                                      MoECommMethod)
+                                                      AlltoAllCommImpl,
+                                                      MC2CommImpl)
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.ops.fused_moe import apply_mlp, fused_experts_moge
+from vllm_ascend.ops.fused_moe import fused_experts_moge
 from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.utils import is_310p, ACL_FORMAT_FRACTAL_NZ
+from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
+    setup_token_dispatchers
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
 
 original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__
 
@@ -66,26 +66,32 @@ def fused_experts(
     # Check constraints
     assert hidden_states.shape[1] == w1.shape[1], (
         f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[1]}")
-
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
     assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
     assert hidden_states.dtype in [
         torch.float32, torch.float16, torch.bfloat16
     ]
+    if (use_int8_w8a8 or use_int4_w4a8):
+        assert w1_scale is not None and w2_scale is not None, \
+            "INT8 quantization requires weight scales."
+
+        w1_scale = w1_scale.to(torch.float32)
+        down_scale = [w2_scale]
+        down_output_dtype = w2_scale.dtype
+    else:
+        down_scale = None
+        down_output_dtype = None
 
     moe_comm_method = get_forward_context().moe_comm_method
     assert moe_comm_method is not None, "Missing communication context"
 
     num_experts = w1.shape[0]
 
     permuted_hidden_states, expert_tokens, dynamic_scale, group_list_type = moe_comm_method.permute(
-        hidden_states, topk_ids, topk_weights, expert_map, num_experts, use_int8_w8a8 or use_int4_w4a8)
-
-    if (use_int8_w8a8 or use_int4_w4a8) and dynamic_scale is None:
-        permuted_hidden_states, dynamic_scale = torch_npu.npu_dynamic_quant(
-            permuted_hidden_states)
+        hidden_states, topk_ids, topk_weights, expert_map, num_experts,
+        use_int8_w8a8 or use_int4_w4a8)
 
     gate_up_output = torch_npu.npu_grouped_matmul(
         x=[permuted_hidden_states],
@@ -97,10 +103,10 @@ def fused_experts(
         output_dtype=torch.int32 if use_int8_w8a8 else None,
     )[0]
 
-    if use_int8_w8a8:
+    if (use_int8_w8a8 or use_int4_w4a8):
         activated_output, activated_output_scale = torch_npu.npu_dequant_swiglu_quant(
             x=gate_up_output,
-            weight_scale=w1_scale.to(torch.float32),
+            weight_scale=w1_scale,
             activation_scale=dynamic_scale,
             bias=None,
             quant_scale=None,
@@ -109,42 +115,28 @@ def fused_experts(
             activate_left=True,
             quant_mode=1,
         )
+        activated_output_scale = [activated_output_scale]
     else:
         activated_output = torch_npu.npu_swiglu(gate_up_output)
         activated_output_scale = None
 
     down_output = torch_npu.npu_grouped_matmul(
         x=[activated_output],
         weight=[w2],
-        scale=[w2_scale] if use_int8_w8a8 else None,
-        per_token_scale=[activated_output_scale] if use_int8_w8a8 else None,
+        scale=down_scale,
+        per_token_scale=activated_output_scale,
         split_item=2,
         group_list_type=group_list_type,
         group_type=0,
         group_list=expert_tokens,
-        output_dtype=w2_scale.dtype if use_int8_w8a8 else None,
+        output_dtype=down_output_dtype,
     )[0]
 
     moe_comm_method.unpermute(down_output, hidden_states)
 
     return hidden_states
 
 
-def unquantized_fused_moe_init_func(self, *args, **kwargs):
-    original_unquantized_fused_moe_init_func(self, *args, **kwargs)
-    vllm_config = get_current_vllm_config()
-    self.max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
-
-    ascend_config = get_ascend_config()
-
-    if ascend_config.torchair_graph_config.enabled:
-        self.use_aclgraph = False
-    else:
-        self.use_aclgraph = (vllm_config.compilation_config.level
-                             == CompilationLevel.PIECEWISE
-                             and not vllm_config.model_config.enforce_eager)
-
-
 def forward_oot(
         self,
         layer: torch.nn.Module,
@@ -276,12 +268,19 @@ def __init__(
             has_bias,
         )
 
+        with_quant = quant_config is not None
+        setup_token_dispatchers(self.moe_config.ep_size,
+                                top_k=self.top_k,
+                                num_experts=self.global_num_experts,
+                                num_local_experts=self.local_num_experts,
+                                with_quant=with_quant)
+
         self.moe_config.tp_group = get_tp_group()
         self.moe_config.dp_group = get_dp_group()
         self.moe_config.ep_group = get_ep_group()
         self.moe_config.mc2_group = get_mc2_group()
 
-        for method in {AllGatherCommImpl, MC2CommImpl}:
+        for method in {AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl}:
             setattr(
                 self, method.__name__.lower(),
                 method(moe_config=self.moe_config))  # type: ignore[abstract]
@@ -332,6 +331,5 @@ def forward_impl(self, hidden_states: torch.Tensor,
         return final_hidden_states
 
 
-UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
 UnquantizedFusedMoEMethod.process_weights_after_loading = process_weights_after_loading
 UnquantizedFusedMoEMethod.forward_oot = forward_oot
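
One detail worth calling out in the constructor change above: each communication implementation is instantiated once and stored on the layer under its lowercased class name, so adding `AlltoAllCommImpl` to the set is all that is needed to register it. The toy below reproduces only that naming convention; the stub classes and the `FakeLayer` wrapper are made up for illustration.

```python
class AllGatherCommImpl:
    pass

class AlltoAllCommImpl:
    pass

class MC2CommImpl:
    pass

class FakeLayer:
    def __init__(self):
        # Same pattern as the diff: one instance per comm method,
        # stored under the lowercased class name.
        for method in {AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl}:
            setattr(self, method.__name__.lower(), method())

layer = FakeLayer()
print(sorted(vars(layer)))  # ['allgathercommimpl', 'alltoallcommimpl', 'mc2commimpl']
```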

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 22 additions & 14 deletions
@@ -19,15 +19,17 @@
 
 import torch
 import torch_npu
+from vllm.config import CompilationLevel, get_current_vllm_config
 from vllm.distributed import get_ep_group
 from vllm.forward_context import get_forward_context
 
 import vllm_ascend.envs as envs_ascend
+from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
 from vllm_ascend.distributed.parallel_state import get_mc2_group
-from vllm_ascend.ops.fused_moe import unified_fused_experts_eager
 from vllm_ascend.ops.common_fused_moe import \
     fused_experts as unified_fused_experts
+from vllm_ascend.ops.fused_moe import unified_fused_experts_eager
 from vllm_ascend.ops.layers.experts_selector import select_experts
 from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, dispose_tensor
 
@@ -285,6 +287,13 @@ def __init__(self):
 
         self.ep_group = get_ep_group()
 
+        vllm_config = get_current_vllm_config()
+        ascend_config = get_ascend_config()
+        self.use_aclgraph = (
+            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
+            and not vllm_config.model_config.enforce_eager
+            and not ascend_config.torchair_graph_config.enabled)
+
         try:
             device_group = get_mc2_group().device_group
             # TODO: Try local_rank = ep_group.rank_in_group
@@ -377,19 +386,18 @@ def apply(
             e_score_correction_bias=e_score_correction_bias,
             global_num_experts=global_num_experts)
 
-        moe_comm_method = get_forward_context().moe_comm_method
-
-        return unified_fused_experts(
-            hidden_states=x,
-            w1=layer.w13_weight,
-            w2=layer.w2_weight,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            use_int8_w8a8=True,
-            w1_scale=layer.w13_weight_scale,
-            w2_scale=layer.w2_weight_scale,
-            expert_map=expert_map,
-        )
+        if self.use_aclgraph:
+            return unified_fused_experts(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                use_int8_w8a8=True,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                expert_map=expert_map,
+            )
 
         fused_moe_state = get_forward_context().fused_moe_state
         shared_gate_up, shared_dequant_scale = None, None
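
The quantized w8a8-dynamic path now precomputes a `use_aclgraph` flag in `__init__` and only takes the unified `fused_experts` route when that flag is set; otherwise it falls through to the existing eager implementation. A plain-Python restatement of that gate, with the three config fields reduced to booleans purely for illustration:

```python
# Stand-ins for: compilation level == CompilationLevel.PIECEWISE,
# model_config.enforce_eager, and torchair_graph_config.enabled.
def use_aclgraph(piecewise_compilation: bool, enforce_eager: bool,
                 torchair_graph_enabled: bool) -> bool:
    return (piecewise_compilation and not enforce_eager
            and not torchair_graph_enabled)

assert use_aclgraph(True, False, False) is True    # ACL graph path
assert use_aclgraph(True, False, True) is False    # torchair graph disables it
assert use_aclgraph(False, False, False) is False  # needs piecewise compilation
```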

vllm_ascend/worker/model_runner_v1.py

Lines changed: 17 additions & 3 deletions
@@ -89,7 +89,8 @@
 from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
 from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               ProfileExecuteDuration, is_310p,
+                               AscendSocVersion, ProfileExecuteDuration,
+                               get_ascend_soc_version, is_310p,
                                vllm_version_is)
 from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
 from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
@@ -1614,8 +1615,21 @@ def _pool(
         )
 
     def _select_moe_comm_method(self, num_tokens: int) -> str:
-        return ("mc2"
-                if num_tokens <= self.mc2_tokens_capacity else "allgather")
+        soc_version = get_ascend_soc_version()
+
+        if num_tokens <= self.mc2_tokens_capacity:
+            moe_comm_method = "mc2"
+        elif soc_version in {AscendSocVersion.A2}:
+            moe_comm_method = "allgather"
+        elif soc_version in {AscendSocVersion.A3}:
+            moe_comm_method = "alltoall"
+        else:
+            raise ValueError(f"Unsupported soc_version: {soc_version}")
+
+        logger.debug(f"num_tokens: {num_tokens}, "
+                     f"moe_comm_method: {moe_comm_method}")
+
+        return moe_comm_method
 
     @torch.inference_mode()
     def execute_model(
