From 69c4ecf3aea6606c279657230a4d6042869a159e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 9 Sep 2025 22:20:00 +0000 Subject: [PATCH] [Bugfix] Fix for 24530. Fix naive all2all shared expert overlap. Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2f88a63665c5..551f284a3609 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1755,9 +1755,6 @@ def forward_impl( self.dp_size > 1 and not self.moe_parallel_config.use_deepep_ht_kernels and not self.moe_config.use_flashinfer_cutlass_kernels) - if do_naive_dispatch_combine: - hidden_states, router_logits = get_ep_group().dispatch( - hidden_states, router_logits) # If there are shared experts but we are not using a modular kernel, the # shared experts must be called here @@ -1768,6 +1765,10 @@ def forward_impl( else: shared_output = None + if do_naive_dispatch_combine: + hidden_states, router_logits = get_ep_group().dispatch( + hidden_states, router_logits) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1800,8 +1801,9 @@ def forward_impl( final_hidden_states, ) - def reduce_output(states: torch.Tensor) -> torch.Tensor: - if do_naive_dispatch_combine: + def reduce_output(states: torch.Tensor, + do_combine: bool = True) -> torch.Tensor: + if do_naive_dispatch_combine and do_combine: states = get_ep_group().combine(states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): @@ -1810,10 +1812,11 @@ def reduce_output(states: torch.Tensor) -> torch.Tensor: return states if self.shared_experts is None: + assert not isinstance(final_hidden_states, tuple) return reduce_output(final_hidden_states) else: return ( - reduce_output(final_hidden_states[0]), + reduce_output(final_hidden_states[0], do_combine=False), reduce_output(final_hidden_states[1]), )
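
Note (not part of the patch): the sketch below is a minimal, hypothetical illustration of the ordering this patch enforces, not the actual vLLM code path. Shared experts must consume the original, local hidden_states, so they run before the naive all2all dispatch rearranges the tokens; only the routed-expert path goes through dispatch/combine, and the shared-expert output is never combined (which is what the do_combine=False argument in the diff expresses). The callables routed_experts, shared_experts, dispatch, and combine are stand-ins for self.quant_method.apply, self.shared_experts, and get_ep_group().dispatch/combine.

    from typing import Callable, Optional, Tuple

    import torch


    def moe_forward_sketch(
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        routed_experts: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
        shared_experts: Optional[Callable[[torch.Tensor], torch.Tensor]],
        dispatch: Callable[[torch.Tensor, torch.Tensor],
                           Tuple[torch.Tensor, torch.Tensor]],
        combine: Callable[[torch.Tensor], torch.Tensor],
        do_naive_dispatch_combine: bool,
    ) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
        # Shared experts see the ORIGINAL, local tokens, so they must be
        # invoked before the naive all2all dispatch rearranges hidden_states.
        # (Before this patch, dispatch ran first, so the shared experts were
        # fed the dispatched tokens instead.)
        shared_output = (shared_experts(hidden_states)
                         if shared_experts is not None else None)

        # Only the routed-expert path participates in dispatch/combine.
        if do_naive_dispatch_combine:
            hidden_states, router_logits = dispatch(hidden_states,
                                                    router_logits)

        routed_output = routed_experts(hidden_states, router_logits)

        if do_naive_dispatch_combine:
            # Combine only the routed output; the shared-expert output was
            # never dispatched, so passing it through combine would be wrong.
            routed_output = combine(routed_output)

        return shared_output, routed_output

The tuple return mirrors the (shared, routed) pair handled by reduce_output in the diff: the routed half may be combined across the EP group, while the shared half skips that step and is only subject to the usual TP/EP all-reduce.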