Merged
39 commits
b83e888  Apply TP Attn + EP MoE fix to other models (tlrmchlsmth, Sep 16, 2025)
1ae573a  llama4 (tlrmchlsmth, Sep 16, 2025)
9b969ee  llama4 eagle (tlrmchlsmth, Sep 16, 2025)
d3bc2cf  Qwen3-Next (tlrmchlsmth, Sep 18, 2025)
f1f3f63  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 18, 2025)
2ba92e7  Use SP for AG RS All2All backend (tlrmchlsmth, Sep 18, 2025)
84d57d3  WIP debugging (tlrmchlsmth, Sep 19, 2025)
42b9a0c  WIP implementing for naive and ag_rs a2a (tlrmchlsmth, Sep 21, 2025)
9cff96c  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 21, 2025)
d5daf2c  WIP making sure it works when the model isn't SP. (tlrmchlsmth, Sep 22, 2025)
c3d7c76  Fixes and cleanup (tlrmchlsmth, Sep 22, 2025)
f0dc75f  hit gpt_oss (tlrmchlsmth, Sep 22, 2025)
3f6b0cc  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 22, 2025)
d561aa7  granitemoe (tlrmchlsmth, Sep 22, 2025)
3ea7bea  fixup (tlrmchlsmth, Sep 23, 2025)
9f1d196  nix use_pynccl (tlrmchlsmth, Sep 23, 2025)
33cfd3f  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 23, 2025)
5789423  fix pure TP (tlrmchlsmth, Sep 23, 2025)
2c02081  Fixup Llama4Decoder interface changes (tlrmchlsmth, Sep 23, 2025)
675f3fc  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 23, 2025)
f5682b8  fix xpu_communicator (tlrmchlsmth, Sep 23, 2025)
98a660b  fixup (tlrmchlsmth, Sep 23, 2025)
bcecc33  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 23, 2025)
6e505f5  fixup (tlrmchlsmth, Sep 23, 2025)
99cf467  eagle fixups (tlrmchlsmth, Sep 23, 2025)
75d51da  fixup glm4 (tlrmchlsmth, Sep 23, 2025)
0a66ca3  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 23, 2025)
cbc8696  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 24, 2025)
96d241c  fixup llama4 - replicated MLP (tlrmchlsmth, Sep 24, 2025)
b5d8564  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 24, 2025)
0237ff1  gmu (tlrmchlsmth, Sep 25, 2025)
c51619d  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 25, 2025)
210bcc3  fixup (tlrmchlsmth, Sep 25, 2025)
2b27ab5  fixup (tlrmchlsmth, Sep 25, 2025)
88bfcab  move whisper to the top (tlrmchlsmth, Sep 25, 2025)
6de0995  Merge branch 'tp_attn_fix_more_models' of http://github.com/tlrmchlsm… (tlrmchlsmth, Sep 25, 2025)
82c77ee  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 26, 2025)
ce03982  Apply Varun's Patch (simon-mo, Sep 27, 2025)
15c15bd  Merge branch 'main' into tp_attn_fix_more_models (tlrmchlsmth, Sep 27, 2025)
18 changes: 18 additions & 0 deletions vllm/config/parallel.py
@@ -279,6 +279,24 @@ def stateless_init_dp_group(self) -> ProcessGroup:
assert last_exc is not None
raise last_exc

# The all_reduce at the end of attention (during o_proj) means that
# inputs are replicated across each rank of the tensor parallel group.
# If using expert-parallelism with DeepEP All2All ops, replicated
# tokens results in useless duplicate computation and communication.
#
# In this case, ensure the input to the experts is sequence parallel
# to avoid the excess work.
#
# Not needed for pplx-kernels as it can handle duplicate input tokens.
@property
def use_sequence_parallel_moe(self) -> bool:
return (envs.VLLM_ALL2ALL_BACKEND
in ("allgather_reducescatter", "naive",
"deepep_high_throughput", "deepep_low_latency")
and self.enable_expert_parallel
and self.tensor_parallel_size > 1
and self.data_parallel_size > 1)

@staticmethod
def has_unfinished_dp(dp_group: ProcessGroup,
has_unfinished: bool) -> bool:
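For illustration, here is a minimal standalone sketch of the gating logic introduced by the new use_sequence_parallel_moe property. The function signature below is hypothetical: the real property lives on ParallelConfig and reads envs.VLLM_ALL2ALL_BACKEND plus the config fields shown in the diff above.

```python
# Illustrative sketch only; not the actual vLLM property.
def use_sequence_parallel_moe(all2all_backend: str,
                              enable_expert_parallel: bool,
                              tensor_parallel_size: int,
                              data_parallel_size: int) -> bool:
    # SP for MoE only pays off when attention output is replicated across
    # TP ranks (tp > 1) and the All2All backend cannot deduplicate tokens
    # on its own (pplx-kernels can, so it is excluded).
    return (all2all_backend in ("allgather_reducescatter", "naive",
                                "deepep_high_throughput",
                                "deepep_low_latency")
            and enable_expert_parallel
            and tensor_parallel_size > 1
            and data_parallel_size > 1)

# Example: EP enabled, TP=2, DP=2 with the DeepEP high-throughput backend.
assert use_sequence_parallel_moe("deepep_high_throughput", True, 2, 2)
# pplx handles duplicate input tokens itself, so SP is not enabled for it.
assert not use_sequence_parallel_moe("pplx", True, 2, 2)
```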
122 changes: 82 additions & 40 deletions vllm/distributed/device_communicators/all2all.py
@@ -6,7 +6,7 @@
import torch.distributed as dist

import vllm.envs as envs
from vllm.distributed import get_dp_group
from vllm.distributed import get_dp_group, get_ep_group
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.utils import has_deep_ep, has_pplx
@@ -34,41 +34,60 @@ def __init__(self, cpu_group):
super().__init__(cpu_group)

def naive_multicast(self, x: torch.Tensor,
cu_tokens_across_dp_cpu: torch.Tensor):
cu_tokens_across_sp_cpu: torch.Tensor,
is_sequence_parallel: bool) -> torch.Tensor:
assert (len(x.shape) == 2)
buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
buffer = torch.empty((cu_tokens_across_sp_cpu[-1], x.size(1)),
device=x.device,
dtype=x.dtype)

start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
self.dp_rank - 1]
end = cu_tokens_across_dp_cpu[self.dp_rank]
rank = self.rank if is_sequence_parallel else self.dp_rank
world_size = (self.world_size
if is_sequence_parallel else self.dp_world_size)

start = 0 if rank == 0 else cu_tokens_across_sp_cpu[rank - 1]
end = cu_tokens_across_sp_cpu[rank]
buffer[start:end, :].copy_(x)
for idx in range(self.dp_world_size):
start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
end = cu_tokens_across_dp_cpu[idx]
self.dp_group.broadcast(buffer[start:end, :], idx)
for idx in range(world_size):
start = 0 if idx == 0 else cu_tokens_across_sp_cpu[idx - 1]
end = cu_tokens_across_sp_cpu[idx]
get_ep_group().broadcast(buffer[start:end, :], idx)

return buffer

def dispatch(self, hidden_states: torch.Tensor,
router_logits: torch.Tensor):
sizes = get_forward_context(
).dp_metadata.get_chunk_sizes_across_dp_rank()
hidden_states, router_logits = get_dp_group().all_gatherv(
[hidden_states, router_logits],
dim=0,
sizes=sizes,
)

def dispatch(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
sp_size = self.tp_group.world_size if is_sequence_parallel else 1
dp_metadata = get_forward_context().dp_metadata
cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)

hidden_states = self.naive_multicast(hidden_states,
cu_tokens_across_sp_cpu,
is_sequence_parallel)
router_logits = self.naive_multicast(router_logits,
cu_tokens_across_sp_cpu,
is_sequence_parallel)
return hidden_states, router_logits

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
sizes = get_forward_context(
).dp_metadata.get_chunk_sizes_across_dp_rank()
hidden_states = get_dp_group().reduce_scatterv(hidden_states,
dim=0,
sizes=sizes)
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False) -> torch.Tensor:

ep_rank = self.rank if is_sequence_parallel else self.dp_rank

dp_metadata = get_forward_context().dp_metadata
sp_size = self.tp_group.world_size if is_sequence_parallel else 1
cu_tokens_across_sp_cpu = dp_metadata.cu_tokens_across_sp(sp_size)

start = 0 if ep_rank == 0 else cu_tokens_across_sp_cpu[ep_rank - 1]
end = cu_tokens_across_sp_cpu[ep_rank]

all_hidden_states = get_ep_group().all_reduce(hidden_states)
hidden_states = all_hidden_states[start:end, :]
return hidden_states

def destroy(self):
@@ -84,29 +103,40 @@ class AgRsAll2AllManager(All2AllManagerBase):
def __init__(self, cpu_group):
super().__init__(cpu_group)

def dispatch(self, hidden_states: torch.Tensor,
router_logits: torch.Tensor):
def dispatch(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Gather hidden_states and router_logits from all dp ranks.
"""
sizes = get_forward_context(
).dp_metadata.get_chunk_sizes_across_dp_rank()
hidden_states, router_logits = get_dp_group().all_gatherv(

dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
assert sizes[dist_group.rank_in_group] == hidden_states.shape[0]
hidden_states, router_logits = dist_group.all_gatherv(
[hidden_states, router_logits],
dim=0,
sizes=sizes,
)
return hidden_states, router_logits

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False) -> torch.Tensor:
"""
Reduce-scatter hidden_states across all dp ranks.
"""
sizes = get_forward_context(
).dp_metadata.get_chunk_sizes_across_dp_rank()
hidden_states = get_dp_group().reduce_scatterv(hidden_states,
dim=0,
sizes=sizes)

dist_group = get_ep_group() if is_sequence_parallel else get_dp_group()
hidden_states = dist_group.reduce_scatterv(hidden_states,
dim=0,
sizes=sizes)
return hidden_states

def destroy(self):
@@ -148,11 +178,17 @@ def get_handle(self, kwargs):
kwargs, pplx.AllToAll.internode
if self.internode else pplx.AllToAll.intranode)

def dispatch(self, hidden_states: torch.Tensor,
router_logits: torch.Tensor):
def dispatch(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
raise NotImplementedError

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False) -> torch.Tensor:
raise NotImplementedError

def destroy(self):
@@ -184,11 +220,17 @@ def __init__(self, cpu_group):
def get_handle(self, kwargs):
raise NotImplementedError

def dispatch(self, hidden_states: torch.Tensor,
router_logits: torch.Tensor):
def dispatch(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
raise NotImplementedError

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False) -> torch.Tensor:
raise NotImplementedError

def destroy(self):
@@ -395,4 +437,4 @@ def cleanup(self):
self.workspace_tensor = None
self.prepare_workspace_tensor = None
self.mapping = None
self.initialized = False
self.initialized = False
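The naive dispatch/combine path above carves each rank's token slice out of a flat concatenated buffer using cumulative token counts. Below is a small self-contained illustration of that indexing rule (plain PyTorch, no distributed setup; the tensor name mirrors cu_tokens_across_sp_cpu from the diff, and the per-rank counts are made up).

```python
import torch

# Made-up token counts for 4 sequence-parallel ranks.
tokens_per_rank = torch.tensor([3, 5, 2, 4])
cu_tokens_across_sp_cpu = torch.cumsum(tokens_per_rank, dim=0)  # tensor([3, 8, 10, 14])

hidden_size = 8
# Flat buffer holding every rank's tokens back to back, as naive_multicast builds it.
buffer = torch.randn(int(cu_tokens_across_sp_cpu[-1]), hidden_size)

for rank in range(len(tokens_per_rank)):
    # Same rule as in the diff: rank 0 starts at 0; every other rank starts
    # where the previous rank's tokens end.
    start = 0 if rank == 0 else int(cu_tokens_across_sp_cpu[rank - 1])
    end = int(cu_tokens_across_sp_cpu[rank])
    local_slice = buffer[start:end, :]
    assert local_slice.shape[0] == int(tokens_per_rank[rank])
```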
28 changes: 20 additions & 8 deletions vllm/distributed/device_communicators/base_device_communicator.py
@@ -28,6 +28,8 @@ def get_or_create(self, kwargs, func):


class All2AllManagerBase:
rank: int
world_size: int

def __init__(self, cpu_group):
self.cpu_group = cpu_group
@@ -40,6 +42,7 @@ def __init__(self, cpu_group):
# all2all lives in ep group, which is merged from dp and tp group
self.dp_group = get_dp_group()
self.tp_group = get_tp_group()

# no self.ep_group since self.ep_group is still in construction
# when we create this object
self.dp_rank = self.dp_group.rank_in_group
@@ -60,17 +63,21 @@ def get_handle(self, kwargs):
# and reuse it for the same config.
raise NotImplementedError

def dispatch(self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False):
raise NotImplementedError

def set_num_sms(self, num_sms: int):
pass

def max_sms_used(self) -> Optional[int]:
return None # None means it could use the whole GPU

def dispatch(self, hidden_states: torch.Tensor,
router_logits: torch.Tensor):
raise NotImplementedError

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False):
raise NotImplementedError

def destroy(self):
@@ -267,15 +274,20 @@ def prepare_communication_buffer_for_model(self,
module.quant_method.init_prepare_finalize(module)

def dispatch(
self, hidden_states: torch.Tensor,
router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Dispatch the hidden states and router logits to the appropriate device.
This is a no-op in the base class.
"""
return hidden_states, router_logits

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False) -> torch.Tensor:
"""
Combine the hidden states and router logits from the appropriate device.
This is a no-op in the base class.
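For reference, a minimal sketch of the updated All2All manager interface as a standalone class. This pass-through stand-in is illustrative only, not one of the real backends, and it deliberately omits the process-group wiring done in All2AllManagerBase.__init__.

```python
import torch

class PassthroughAll2All:
    """Stand-in mirroring the All2AllManagerBase interface after this
    change: dispatch and combine both accept is_sequence_parallel."""

    def dispatch(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        is_sequence_parallel: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # A real backend gathers tokens across the EP group when
        # is_sequence_parallel is True, or across the DP group otherwise.
        return hidden_states, router_logits

    def combine(self,
                hidden_states: torch.Tensor,
                is_sequence_parallel: bool = False) -> torch.Tensor:
        # A real backend reduce-scatters expert outputs back to the
        # caller's token slice.
        return hidden_states
```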
22 changes: 12 additions & 10 deletions vllm/distributed/device_communicators/cuda_communicator.py
@@ -39,10 +39,6 @@ def __init__(self,
use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
use_torch_symm_mem = envs.VLLM_ALLREDUCE_USE_SYMM_MEM

# ep does not use pynccl
use_pynccl = "ep" not in unique_name

self.use_pynccl = use_pynccl
self.use_custom_allreduce = use_custom_allreduce
self.use_torch_symm_mem = use_torch_symm_mem

Expand All @@ -57,7 +53,7 @@ def __init__(self,
SymmMemCommunicator)

self.pynccl_comm: Optional[PyNcclCommunicator] = None
if use_pynccl and self.world_size > 1:
if self.world_size > 1:
self.pynccl_comm = PyNcclCommunicator(
group=self.cpu_group,
device=self.device,
@@ -308,14 +304,20 @@ def _all_gather_single(input_: torch.Tensor,
return output_list

def dispatch(
self, hidden_states: torch.Tensor,
router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
assert self.all2all_manager is not None
hidden_states, router_logits = self.all2all_manager.dispatch(
hidden_states, router_logits)
hidden_states, router_logits, is_sequence_parallel)
return hidden_states, router_logits

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False) -> torch.Tensor:
assert self.all2all_manager is not None
hidden_states = self.all2all_manager.combine(hidden_states)
hidden_states = self.all2all_manager.combine(hidden_states,
is_sequence_parallel)
return hidden_states
16 changes: 11 additions & 5 deletions vllm/distributed/device_communicators/xpu_communicator.py
@@ -75,14 +75,20 @@ def broadcast(self, input_: torch.Tensor, src: int = 0) -> None:
dist.broadcast(input_, src=src, group=self.device_group)

def dispatch(
self, hidden_states: torch.Tensor,
router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
assert self.all2all_manager is not None
hidden_states, router_logits = self.all2all_manager.dispatch(
hidden_states, router_logits)
hidden_states, router_logits, is_sequence_parallel)
return hidden_states, router_logits

def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
def combine(self,
hidden_states: torch.Tensor,
is_sequence_parallel: bool = False) -> torch.Tensor:
assert self.all2all_manager is not None
hidden_states = self.all2all_manager.combine(hidden_states)
hidden_states = self.all2all_manager.combine(hidden_states,
is_sequence_parallel)
return hidden_states
17 changes: 12 additions & 5 deletions vllm/distributed/parallel_state.py
@@ -871,17 +871,24 @@ def prepare_communication_buffer_for_model(self, model: torch.nn.Module):
model)

def dispatch(
self, hidden_states: torch.Tensor,
router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
is_sequence_parallel: bool = False
) -> tuple[torch.Tensor, torch.Tensor]:
if self.device_communicator is not None:
return self.device_communicator.dispatch(hidden_states,
router_logits)
router_logits,
is_sequence_parallel)
else:
return hidden_states, router_logits

def combine(self, hidden_states) -> torch.Tensor:
def combine(self,
hidden_states,
is_sequence_parallel: bool = False) -> torch.Tensor:
if self.device_communicator is not None:
return self.device_communicator.combine(hidden_states)
return self.device_communicator.combine(hidden_states,
is_sequence_parallel)
else:
return hidden_states

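Putting the pieces together, here is a hedged sketch of how a caller might thread the new flag through the GroupCoordinator methods above. ep_group stands in for the coordinator returned by get_ep_group(); the surrounding MoE-layer code is hypothetical and not part of this diff.

```python
import torch

def route_through_experts(ep_group, hidden_states: torch.Tensor,
                          router_logits: torch.Tensor,
                          use_sequence_parallel_moe: bool) -> torch.Tensor:
    # Gather tokens to the layout the experts expect. With sequence
    # parallelism, each TP rank contributes its own slice of the sequence
    # instead of a fully replicated copy.
    hidden_states, router_logits = ep_group.dispatch(
        hidden_states, router_logits,
        is_sequence_parallel=use_sequence_parallel_moe)

    # ... expert computation on the gathered tokens would go here ...

    # Scatter/reduce the expert outputs back to this rank's slice.
    return ep_group.combine(hidden_states,
                            is_sequence_parallel=use_sequence_parallel_moe)
```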