 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, VllmConfig,
+from vllm.config import (CacheConfig, ParallelConfig, VllmConfig,
                          get_current_vllm_config)
 from vllm.distributed import (get_ep_group, get_pp_group,
                               get_tensor_model_parallel_rank,
@@ -133,7 +133,6 @@ class DeepseekV2MoE(nn.Module):
     def __init__(
         self,
         config: Union[DeepseekV2Config, DeepseekV3Config],
-        model_config: ModelConfig,
         parallel_config: ParallelConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
@@ -184,8 +183,6 @@ def __init__(
 
         if config.n_shared_experts is None:
             self.shared_experts = None
-            fused_output_scaling_factor = 1.0
-            shared_output_scaling_factor = 1.0
         else:
             intermediate_size = (config.moe_intermediate_size *
                                  config.n_shared_experts)
@@ -196,28 +193,17 @@ def __init__(
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 is_sequence_parallel=self.is_sequence_parallel,
-                reduce_results=False,  # XXXXX
+                reduce_results=False,
                 prefix=f"{prefix}.shared_experts",
             )
 
-        # Fix FP16 overflow
-        # See DeepseekV2DecoderLayer for more details.
-        if model_config.dtype != torch.float16:
-            fused_output_scaling_factor = self.routed_scaling_factor
-            shared_output_scaling_factor = 1.0
-        else:
-            fused_output_scaling_factor = 1.0
-            shared_output_scaling_factor = (1. /
-                                            self.routed_scaling_factor)
-
         self.experts = SharedFusedMoE(
             shared_experts=self.shared_experts,
-            fused_output_scaling_factor=fused_output_scaling_factor,
-            shared_output_scaling_factor=shared_output_scaling_factor,
             num_experts=config.n_routed_experts,
             top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
             intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
             renormalize=config.norm_topk_prob,
             quant_config=quant_config,
             use_grouped_topk=True,
@@ -247,15 +233,36 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
 
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        fused_moe_out = self.experts(hidden_states=hidden_states,
+                                     router_logits=router_logits)
+
+        if self.shared_experts is not None:
+            shared_output, final_hidden_states = fused_moe_out
+        else:
+            shared_output = None
+            final_hidden_states = fused_moe_out
+
+        # Fix FP16 overflow
+        # See DeepseekV2DecoderLayer for more details.
+        if hidden_states.dtype != torch.float16:
+            final_hidden_states *= self.routed_scaling_factor
+        elif self.shared_experts is not None:
+            assert shared_output is not None
+            shared_output *= (1. / self.routed_scaling_factor)
+
+        if self.shared_experts is not None:
+            assert shared_output is not None
+            final_hidden_states += shared_output
 
         if self.is_sequence_parallel:
             final_hidden_states = tensor_model_parallel_all_gather(
                 final_hidden_states, 0)
             final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.tp_size > 1:
+            final_hidden_states = (
+                self.experts.maybe_all_reduce_tensor_model_parallel(
+                    final_hidden_states))
 
-        # TODO(bnell): why is this view needed?
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
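As a reading aid for this hunk, here is a minimal, self-contained sketch of the FP16-safe scaling order that the new forward() applies after unpacking the fused MoE output. The helper name combine_moe_outputs and the toy tensors are illustrative assumptions, not part of vLLM's API.

```python
from typing import Optional

import torch


def combine_moe_outputs(
    routed_output: torch.Tensor,
    shared_output: Optional[torch.Tensor],
    routed_scaling_factor: float,
) -> torch.Tensor:
    """Mirror the scaling order used in DeepseekV2MoE.forward().

    For dtypes other than FP16, the routed output is scaled directly by
    routed_scaling_factor. In FP16 that multiply can overflow, so the
    scale is instead folded into the shared-expert branch as a division
    (the decoder layer compensates elsewhere, per the original comment).
    """
    if routed_output.dtype != torch.float16:
        routed_output = routed_output * routed_scaling_factor
    elif shared_output is not None:
        shared_output = shared_output * (1.0 / routed_scaling_factor)

    # Add the shared-expert contribution after the dtype-dependent scaling.
    if shared_output is not None:
        routed_output = routed_output + shared_output
    return routed_output


# Toy usage with random tensors; shapes and the scaling factor are arbitrary.
routed = torch.randn(4, 16, dtype=torch.float16)
shared = torch.randn(4, 16, dtype=torch.float16)
out = combine_moe_outputs(routed, shared, routed_scaling_factor=2.5)
print(out.shape)  # torch.Size([4, 16])
```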
@@ -1009,7 +1016,6 @@ def __init__(self,
                 and layer_idx % config.moe_layer_freq == 0):
             self.mlp = DeepseekV2MoE(
                 config=config,
-                model_config=model_config,
                 parallel_config=parallel_config,
                 quant_config=quant_config,
                 prefix=f"{prefix}.mlp",