From 69c4ecf3aea6606c279657230a4d6042869a159e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 9 Sep 2025 22:20:00 +0000 Subject: [PATCH] [Bugfix] Fix for 24530. Fix naive all2all shared expert overlap. Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2f88a63665c5..551f284a3609 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1755,9 +1755,6 @@ def forward_impl( self.dp_size > 1 and not self.moe_parallel_config.use_deepep_ht_kernels and not self.moe_config.use_flashinfer_cutlass_kernels) - if do_naive_dispatch_combine: - hidden_states, router_logits = get_ep_group().dispatch( - hidden_states, router_logits) # If there are shared experts but we are not using a modular kernel, the # shared experts must be called here @@ -1768,6 +1765,10 @@ def forward_impl( else: shared_output = None + if do_naive_dispatch_combine: + hidden_states, router_logits = get_ep_group().dispatch( + hidden_states, router_logits) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1800,8 +1801,9 @@ def forward_impl( final_hidden_states, ) - def reduce_output(states: torch.Tensor) -> torch.Tensor: - if do_naive_dispatch_combine: + def reduce_output(states: torch.Tensor, + do_combine: bool = True) -> torch.Tensor: + if do_naive_dispatch_combine and do_combine: states = get_ep_group().combine(states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): @@ -1810,10 +1812,11 @@ def reduce_output(states: torch.Tensor) -> torch.Tensor: return states if self.shared_experts is None: + assert not isinstance(final_hidden_states, tuple) return reduce_output(final_hidden_states) else: return ( - reduce_output(final_hidden_states[0]), + reduce_output(final_hidden_states[0], do_combine=False), reduce_output(final_hidden_states[1]), )
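
Note (not part of the patch): the sketch below is a minimal, hypothetical illustration of the ordering this patch enforces, not the actual vLLM code path. Shared experts must consume the original, local hidden_states, so they run before the naive all2all dispatch rearranges the tokens; only the routed-expert path goes through dispatch/combine, and the shared-expert output is never combined (which is what the do_combine=False argument in the diff expresses). The callables routed_experts, shared_experts, dispatch, and combine are stand-ins for self.quant_method.apply, self.shared_experts, and get_ep_group().dispatch/combine.

    from typing import Callable, Optional, Tuple

    import torch


    def moe_forward_sketch(
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        routed_experts: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
        shared_experts: Optional[Callable[[torch.Tensor], torch.Tensor]],
        dispatch: Callable[[torch.Tensor, torch.Tensor],
                           Tuple[torch.Tensor, torch.Tensor]],
        combine: Callable[[torch.Tensor], torch.Tensor],
        do_naive_dispatch_combine: bool,
    ) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
        # Shared experts see the ORIGINAL, local tokens, so they must be
        # invoked before the naive all2all dispatch rearranges hidden_states.
        # (Before this patch, dispatch ran first, so the shared experts were
        # fed the dispatched tokens instead.)
        shared_output = (shared_experts(hidden_states)
                         if shared_experts is not None else None)

        # Only the routed-expert path participates in dispatch/combine.
        if do_naive_dispatch_combine:
            hidden_states, router_logits = dispatch(hidden_states,
                                                    router_logits)

        routed_output = routed_experts(hidden_states, router_logits)

        if do_naive_dispatch_combine:
            # Combine only the routed output; the shared-expert output was
            # never dispatched, so passing it through combine would be wrong.
            routed_output = combine(routed_output)

        return shared_output, routed_output

The tuple return mirrors the (shared, routed) pair handled by reduce_output in the diff: the routed half may be combined across the EP group, while the shared half skips that step and is only subject to the usual TP/EP all-reduce.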