Commit 3a459de
feat: Add MC2 communication method for MoE
Introduces and enables the MC2 communication implementation for Mixture-of-Experts (MoE) on Ascend devices when expert parallelism is active. The new method leverages the platform-specific `npu_moe_distribute_dispatch` and `npu_moe_distribute_combine` operators to increase communication and computation parallelism, improving performance. The implementation also adapts to the Ascend SoC version and to which operator variants are available.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent c9c27ad commit 3a459de

File tree

3 files changed (+156, -5 lines)

vllm_ascend/ascend_forward_context.py

Lines changed: 3 additions & 2 deletions

@@ -5,7 +5,8 @@

 import torch
 from vllm.config import VllmConfig
-from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
+from vllm.distributed import (get_dp_group, get_ep_group,
+                              get_tensor_model_parallel_world_size)
 from vllm.forward_context import get_forward_context, set_forward_context

 import vllm_ascend.envs as envs
@@ -108,7 +109,7 @@ def set_ascend_forward_context(
         forward_context.max_tokens_across_dp = max_tokens_across_dp

         if num_tokens is not None:
-            tp_world_size = get_tp_group().world_size
+            tp_world_size = get_tensor_model_parallel_world_size()
             # NOTE: token num which need to pad to when mc2
             forward_context.padded_num_tokens = math.ceil(
                 max_tokens_across_dp / tp_world_size) * tp_world_size
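
The padding computed here rounds the cross-DP token count up to the nearest multiple of the tensor-parallel world size, which is the shape the MC2 path expects. A minimal standalone sketch of that arithmetic (illustrative only, not part of the commit):

    import math

    def mc2_padded_num_tokens(max_tokens_across_dp: int, tp_world_size: int) -> int:
        # Round the token count up to a multiple of the TP world size, matching
        # the formula used in set_ascend_forward_context above.
        return math.ceil(max_tokens_across_dp / tp_world_size) * tp_world_size

    assert mc2_padded_num_tokens(9, 4) == 12  # 9 tokens, TP=4 -> pad to 12
    assert mc2_padded_num_tokens(8, 4) == 8   # already a multiple, no padding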

vllm_ascend/distributed/moe_comm_method.py

Lines changed: 145 additions & 0 deletions

@@ -2,9 +2,13 @@

 import torch
 import torch_npu
+from vllm.distributed.parallel_state import get_tp_group
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op

+from vllm_ascend.distributed.parallel_state import get_mc2_group
+from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version
+

 class MoECommMethod(ABC):
     """Base class for MoE communication methods."""
@@ -76,6 +80,7 @@ def _pre_process(
         expert_map: torch.Tensor,
         num_experts: int,
     ) -> tuple[torch.Tensor, torch.Tensor, int]:
+        print("Using AllGatherCommImpl for MoE communication.")
         num_tokens = hidden_states.shape[0]

         # Generate token indices and flatten
@@ -164,6 +169,7 @@ def _pre_process(
         expert_map: torch.Tensor,  # noqa: F841
         num_experts: int,
     ) -> tuple[torch.Tensor, torch.Tensor, int]:
+        print("Using AllReduceCommImpl for MoE communication.")
         num_tokens = hidden_states.shape[0]

         self.topk_weights = topk_weights
@@ -229,6 +235,145 @@ def _post_process(self, mlp_output: torch.Tensor,
         )


+class MC2CommImpl(MoECommMethod):
+    """This implementation is for the scenarios listed below:
+    1. `enable_expert_parallel=True`.
+    2. `npu_moe_distribute_dispatch` and `npu_moe_distribute_combine` are available.
+    3. `enable_expert_parallel=False` is not supported.
+
+    This implementation uses the MC2 communication method, which is optimized for
+    communication and computation parallelism on Ascend devices.
+    """
+
+    def __init__(
+        self,
+        device: torch.device,
+        dtype: torch.dtype,
+        top_k_num: int,
+        global_num_experts: int,
+    ):
+        super().__init__(device, dtype, top_k_num, global_num_experts)
+
+        # Shared communication configurations
+        ep_group = get_mc2_group()
+        self.ep_rank_id = ep_group.rank_in_group
+        self.ep_world_size = ep_group.world_size
+        self.tp_world_size = get_tp_group().world_size
+
+        device_group = ep_group.device_group
+        local_rank = torch.distributed.get_rank(group=device_group)
+        backend = device_group._get_backend(torch.device("npu"))
+        self.moe_all_to_all_group_name = backend.get_hccl_comm_name(local_rank)
+
+        # Feature flags
+        self.enable_dispatch_v2 = hasattr(torch_npu,
+                                          "npu_moe_distribute_dispatch_v2")
+        self.is_ascend_a3 = get_ascend_soc_version() == AscendSocVersion.A3
+        self.need_extra_args = self.is_ascend_a3  # or is_torchair
+
+        # Intermediate tensors to be passed from pre_process to post_process
+        self.topk_ids = None
+        self.topk_weights = None
+        self.mc2_mask = None
+        self.assist_info_for_combine = None
+        self.ep_recv_counts = None
+        self.tp_recv_counts = None
+
+    def _pre_process(
+        self,
+        hidden_states: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        expert_map: torch.Tensor,
+        num_experts: int,
+    ) -> tuple[torch.Tensor, torch.Tensor, int]:
+        # Store tensors needed for post_process
+        self.topk_ids = topk_ids.clone()
+        self.topk_weights = topk_weights
+        self.mc2_mask = get_forward_context().mc2_mask
+
+        dispatch_kwargs = {
+            "x": hidden_states,
+            "expert_ids": self.topk_ids,
+            "expert_shard_type": 0,
+            "shared_expert_rank_num": 0,
+            "moe_expert_num": self.global_num_experts,
+            "global_bs": 0,
+            "scales": None,
+            "quant_mode": 0,
+            "group_ep": self.moe_all_to_all_group_name,
+            "ep_world_size": self.ep_world_size,
+            "ep_rank_id": self.ep_rank_id,
+        }
+
+        if self.need_extra_args:
+            dispatch_kwargs.update({
+                "group_tp": self.moe_all_to_all_group_name,
+                "tp_world_size": 1,
+                "tp_rank_id": 0,
+            })
+        if self.is_ascend_a3 and self.enable_dispatch_v2:
+            dispatch_kwargs.update({
+                "x_active_mask": self.mc2_mask,
+            })
+
+        dispatch = torch_npu.npu_moe_distribute_dispatch_v2 if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_dispatch
+
+        (
+            permuted_hidden_states,
+            _,  # dynamic_scale is not used
+            self.assist_info_for_combine,
+            expert_tokens,
+            self.ep_recv_counts,
+            self.tp_recv_counts,
+        ) = dispatch(**dispatch_kwargs)[:6]
+
+        group_list_type = 1
+
+        return permuted_hidden_states, expert_tokens, group_list_type
+
+    def _post_process(self, mlp_output: torch.Tensor,
+                      hidden_states: torch.Tensor) -> None:
+        combine_kwargs = {
+            "expand_x": mlp_output,
+            "expert_ids": self.topk_ids,
+            "expert_scales": self.topk_weights.to(torch.float32),
+            "expert_shard_type": 0,
+            "shared_expert_rank_num": 0,
+            "moe_expert_num": self.global_num_experts,
+            "global_bs": 0,
+            "ep_send_counts": self.ep_recv_counts,
+            "group_ep": self.moe_all_to_all_group_name,
+            "ep_world_size": self.ep_world_size,
+            "ep_rank_id": self.ep_rank_id,
+        }
+
+        if self.enable_dispatch_v2:
+            combine_kwargs[
+                "assist_info_for_combine"] = self.assist_info_for_combine
+        else:
+            combine_kwargs["expand_idx"] = self.assist_info_for_combine
+
+        if self.need_extra_args:
+            combine_kwargs.update({
+                "tp_send_counts": self.tp_recv_counts,
+                "group_tp": self.moe_all_to_all_group_name,
+                "tp_world_size": 1,
+                "tp_rank_id": 0,
+            })
+        if self.is_ascend_a3 and self.enable_dispatch_v2:
+            combine_kwargs.update({
+                "x_active_mask": self.mc2_mask,
+            })
+
+        if self.enable_dispatch_v2:
+            hidden_states[:] = torch_npu.npu_moe_distribute_combine_v2(
+                **combine_kwargs)
+        else:
+            hidden_states[:] = torch_npu.npu_moe_distribute_combine(
+                **combine_kwargs)
+
+
 def moe_comm_pre_process(
     hidden_states: torch.Tensor,
     topk_ids: torch.Tensor,
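
For intuition about what the dispatch/combine pair is doing, here is a CPU-only toy analogue in plain PyTorch. It is a sketch under strong simplifications: the helper names are invented, there is no expert-parallel exchange, no quantization, and none of the HCCL group arguments (`group_ep`, `ep_world_size`, `ep_rank_id`) that the real `npu_moe_distribute_dispatch`/`npu_moe_distribute_combine` operators require.

    import torch

    def toy_dispatch(hidden_states, topk_ids, num_experts):
        # Sort (token, expert) pairs by expert id so each expert's tokens are
        # contiguous, roughly what the dispatch step produces locally.
        num_tokens, top_k = topk_ids.shape
        flat_expert_ids = topk_ids.reshape(-1)
        order = torch.argsort(flat_expert_ids, stable=True)
        token_index = torch.arange(num_tokens).repeat_interleave(top_k)[order]
        permuted = hidden_states[token_index]
        tokens_per_expert = torch.bincount(flat_expert_ids, minlength=num_experts)
        return permuted, tokens_per_expert, order, token_index

    def toy_combine(expert_output, topk_weights, order, token_index, num_tokens):
        # Weight each expert output by its routing weight and scatter-add the
        # rows back to their original token positions (the combine step).
        weights = topk_weights.reshape(-1)[order].unsqueeze(-1)
        out = torch.zeros(num_tokens, expert_output.shape[-1],
                          dtype=expert_output.dtype)
        out.index_add_(0, token_index, expert_output * weights)
        return out

    hidden = torch.randn(4, 8)
    topk_ids = torch.tensor([[0, 1], [1, 2], [0, 3], [2, 3]])
    topk_weights = torch.full((4, 2), 0.5)
    permuted, counts, order, token_index = toy_dispatch(hidden, topk_ids, 4)
    mlp_out = permuted  # identity "expert MLP" just for the round-trip check
    restored = toy_combine(mlp_out, topk_weights, order, token_index, 4)
    assert torch.allclose(restored, hidden, atol=1e-6)

The real operators additionally exchange the permuted tokens between expert-parallel ranks, which is what enables the communication/computation overlap the class docstring refers to.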

vllm_ascend/worker/model_runner_v1.py

Lines changed: 8 additions & 3 deletions

@@ -82,6 +82,7 @@
 from vllm_ascend.distributed.moe_comm_method import (AllGatherCommImpl,
                                                       AllReduceCommImpl,
                                                       DummyCommImpl,
+                                                      MC2CommImpl,
                                                       MoECommMethod)
 from vllm_ascend.multistream.ms_split import compute_split_seq_index
 from vllm_ascend.platform import NPUPlatform
@@ -365,7 +366,8 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         )

         if self.parallel_config.enable_expert_parallel:
-            self.moe_comm_method = AllGatherCommImpl
+            # self.moe_comm_method = AllGatherCommImpl
+            self.moe_comm_method = MC2CommImpl
         else:
             self.moe_comm_method = AllReduceCommImpl

@@ -1218,12 +1220,15 @@ def _process_reqs(

         moe_comm_method = self.moe_comm_method

+        # NOTE: Currently this padding logic is really messy,
+        # MC2 may not be available in eager mode
+        if not self.use_aclgraph or self.torchair_graph_enabled:
+            num_input_tokens = padded_num_tokens_across_dp
+
         # Run forward pass
         with set_ascend_forward_context(
                 attn_metadata,
                 self.vllm_config,
-                # NOTE: This will break some function
-                # num_tokens=padded_num_tokens_across_dp,
                 num_tokens=num_input_tokens,
                 num_tokens_across_dp=num_tokens_across_dp,
                 with_prefill=with_prefill,
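
Taken together, the runner change selects `MC2CommImpl` whenever expert parallelism is enabled and switches to the padded token count when not capturing with ACL graph (or when the torchair graph is enabled). A condensed sketch of that control flow, using hypothetical free functions rather than the runner's attributes:

    def select_moe_comm_method(enable_expert_parallel: bool) -> str:
        # Mirrors the __init__ change: MC2 is used only with expert parallelism.
        return "MC2CommImpl" if enable_expert_parallel else "AllReduceCommImpl"

    def resolve_num_input_tokens(num_input_tokens: int,
                                 padded_num_tokens_across_dp: int,
                                 use_aclgraph: bool,
                                 torchair_graph_enabled: bool) -> int:
        # Mirrors the _process_reqs change: outside ACL graph mode (or with the
        # torchair graph enabled) MC2 runs on the padded token count.
        if not use_aclgraph or torchair_graph_enabled:
            return padded_num_tokens_across_dp
        return num_input_tokens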
