
Commit 3463197

remove uses of self.fused_experts
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 0cfe1a6 commit 3463197

File tree

14 files changed (+79, -282 lines)

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 30 additions & 27 deletions
@@ -111,7 +111,6 @@ def __init__(self, moe: FusedMoEConfig):
         super().__init__()
         self.moe = moe
         self.moe_quant_config: FusedMoEQuantConfig | None = None
-        self.fused_experts: FusedMoEModularKernel | None = None
         self.topk_indices_dtype = None
 
     @abstractmethod
@@ -254,9 +253,6 @@ def init_prepare_finalize(
             "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
         )
         assert self.topk_indices_dtype is None
-        assert self.fused_experts is None, (
-            f"Attempt to override experts for {id(self)}!"
-        )
         self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
         experts = self.select_gemm_impl(prepare_finalize, layer)
         return FusedMoEModularKernel(
@@ -287,7 +283,11 @@ def get_fused_moe_quant_config(
 
     @property
     def using_modular_kernel(self) -> bool:
-        return self.fused_experts is not None
+        return False
+
+    @property
+    def supports_eplb(self) -> bool:
+        return False
 
     @abstractmethod
     def apply(
@@ -330,10 +330,21 @@ def __init__(
         self.moe_quant_config = old_moe_method.moe_quant_config
         self.fused_experts = fused_experts
         self.topk_indices_dtype = old_moe_method.topk_indices_dtype
-
+        self.disable_expert_map = not fused_experts.supports_expert_map()
+        self.old_method_name = old_moe_method.__class__.__name__
+        self._supports_eplb = old_moe_method.supports_eplb
         if isinstance(old_moe_method, torch.nn.Module):
             self.load_state_dict(old_moe_method.state_dict())
-        logger.debug("Swapping out %s", old_moe_method.__class__.__name__)
+        logger.debug("Swapping out %s", self.old_method_name)
+
+    @property
+    def using_modular_kernel(self) -> bool:
+        return True
+
+    @property
+    @abstractmethod
+    def supports_eplb(self) -> bool:
+        return self._supports_eplb
 
     def create_weights(
         self,
@@ -374,12 +385,21 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is not None
-
         # Is getattr needed?
         zero_expert_num = getattr(layer, "zero_expert_num", 0)
         zero_expert_type = getattr(layer, "zero_expert_type", None)
 
+        if enable_eplb:
+            if not self.supports_eplb:
+                assert expert_load_view is not None
+                assert logical_to_physical_map is not None
+                assert logical_replica_count is not None
+                assert isinstance(layer, FusedMoE)
+            else:
+                raise NotImplementedError(
+                    f"EPLB is not supported for {self.old_method_name}"
+                )
+
         select_result = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -415,7 +435,7 @@ def apply(
             activation=activation,
             global_num_experts=global_num_experts,
             apply_router_weight_on_input=apply_router_weight_on_input,
-            expert_map=expert_map,
+            expert_map=None if self.disable_expert_map else expert_map,
         )
 
         if zero_expert_num != 0 and zero_expert_type is not None:
@@ -750,7 +770,6 @@ def forward_cuda(
         )
 
         if self.rocm_aiter_moe_enabled:
-            assert self.fused_experts is None
             result = self.rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -771,23 +790,7 @@ def forward_cuda(
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
-        elif self.fused_experts is not None:
-            if self.moe.has_bias:
-                raise ValueError("FusedMoEModularKernel does not support bias.")
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-            )
         else:
-            assert fused_experts is not None
             result = fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
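For readers skimming this diff: a minimal sketch (not vLLM code; class and argument names such as MoEMethodBase and ModularMethodWrapper are placeholders) of the capability-flag pattern the change moves to. Instead of callers probing a leftover self.fused_experts attribute, each MoE method reports its own capabilities, and the modular-kernel wrapper copies the flags from the method it replaces.

    # Minimal sketch, assuming only what the diff above shows; names are illustrative.
    from abc import ABC


    class MoEMethodBase(ABC):
        @property
        def using_modular_kernel(self) -> bool:
            return False  # plain methods run their own kernels

        @property
        def supports_eplb(self) -> bool:
            return False  # subclasses opt in explicitly


    class ModularMethodWrapper(MoEMethodBase):
        """Stands in for the method that wraps an old method plus a modular kernel."""

        def __init__(self, old_method: MoEMethodBase, kernel_supports_expert_map: bool):
            # Capture everything needed from the method being swapped out, since
            # the wrapper no longer inspects a fused_experts attribute at call time.
            self.disable_expert_map = not kernel_supports_expert_map
            self.old_method_name = old_method.__class__.__name__
            self._supports_eplb = old_method.supports_eplb

        @property
        def using_modular_kernel(self) -> bool:
            return True

        @property
        def supports_eplb(self) -> bool:
            return self._supports_eplb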

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 9 additions & 0 deletions
@@ -704,6 +704,15 @@ def __init__(
                 f"{fused_experts.activation_formats[0]}"
             )
 
+    def supports_expert_map(self) -> bool:
+        """
+        A flag indicating whether or not this class supports expert maps
+        """
+        return (
+            self.prepare_finalize.num_dispatchers() <= 1
+            and self.fused_experts.supports_expert_map()
+        )
+
     def output_is_reduced(self) -> bool:
         """
         Indicates whether or not the output of fused MoE kernel
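A hedged usage sketch: the only new API here is supports_expert_map(); the helper below and its argument names are hypothetical, showing how a caller could use the flag to drop expert_map when the kernel cannot honor it (more than one dispatcher, or an experts implementation without expert-map support).

    # Hypothetical helper, not part of the commit; only supports_expert_map()
    # comes from the diff above.
    def maybe_expert_map(kernel, expert_map):
        """Pass expert_map through only when the modular kernel can apply it."""
        return expert_map if kernel.supports_expert_map() else None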

vllm/model_executor/layers/quantization/awq_marlin.py

Lines changed: 0 additions & 2 deletions
@@ -582,8 +582,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
 

vllm/model_executor/layers/quantization/bitsandbytes.py

Lines changed: 1 addition & 2 deletions
@@ -518,12 +518,11 @@ def apply(
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `BitsAndBytesMoEMethod` yet."
             )
+
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 0 additions & 47 deletions
@@ -456,12 +456,7 @@ def apply(
             indices_type=self.topk_indices_dtype,
         )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin.
-        #
         if self.use_marlin:
-            assert self.fused_experts is None
             return torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -482,24 +477,6 @@ def apply(
                 workspace=layer.workspace,
             )
 
-        elif self.fused_experts is not None:
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight
-            ), "Flashinfer CUTLASS Fused MoE not applicable!"
-
-            return self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-            )
-
         # FlashInfer fused experts path
         elif self.allow_flashinfer:
             from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
@@ -1060,13 +1037,8 @@ def apply(
         per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
         per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # cutlass fp8 or fused_experts but not marlin or rocm.
-        #
         if self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
-            assert self.fused_experts is None
             return torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1092,7 +1064,6 @@ def apply(
 
             assert per_act_token == per_channel_quant
             assert self.moe_quant_config is not None
-            assert self.fused_experts is None
             return rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -1105,18 +1076,6 @@ def apply(
                 quant_config=self.moe_quant_config,
             )
 
-        elif self.fused_experts is not None:
-            return self.fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=None if self.disable_expert_map else expert_map,
-            )
-
         # cutlass path
         elif self.use_cutlass:
             assert self.moe_quant_config is not None
@@ -1312,8 +1271,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
@@ -1630,8 +1587,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
@@ -1895,8 +1850,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."

vllm/model_executor/layers/quantization/experts_int8.py

Lines changed: 0 additions & 2 deletions
@@ -158,8 +158,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ExpertsInt8MoEMethod` yet."

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 5 additions & 26 deletions
@@ -584,9 +584,6 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
         self.quant_config = quant_config
         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant: bool = self.weight_block_size is not None
-
-        self.fused_experts: mk.FusedMoEModularKernel | None = None  # type: ignore
-
         self.fp8_backend = get_fp8_moe_backend(self.block_quant)
 
         self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
@@ -1062,6 +1059,10 @@ def get_fused_moe_quant_config(
             block_shape=self.weight_block_size,
         )
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -1091,10 +1092,7 @@ def apply(
             assert logical_replica_count is not None
             assert isinstance(layer, FusedMoE)
 
-        if (
-            self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-            and self.fused_experts is None
-        ):
+        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             assert activation == "silu", (
                 f"Expected 'silu' activation but got {activation}"
             )
@@ -1170,18 +1168,13 @@ def apply(
             zero_expert_type=zero_expert_type,
         )
 
-        #
-        # Note: the order of checks is important since self.fused_experts
-        # can override fused_experts or cutlass but not rocm or marlin.
-        #
         topk_weights, topk_ids, zero_expert_result = select_result
 
         if self.rocm_aiter_moe_enabled:
             from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
                 rocm_aiter_fused_experts,
             )
 
-            assert self.fused_experts is None
             result = rocm_aiter_fused_experts(
                 x,
                 layer.w13_weight,
@@ -1195,7 +1188,6 @@ def apply(
             )
         elif self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
-            assert self.fused_experts is None
             result = torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1213,19 +1205,6 @@ def apply(
                 expert_map=expert_map,
                 workspace=layer.workspace,
             )
-        elif self.fused_experts:
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
-            )
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
             assert not self.block_quant
             assert not renormalize and custom_routing_function is not None
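With Fp8MoEMethod now advertising supports_eplb = True, an EPLB guard can be expressed as a capability check instead of an assert on self.fused_experts. The function below is an illustrative sketch, not code from this commit; `method` stands in for either the FP8 method or the modular wrapper.

    # Illustrative only: a capability-based EPLB guard, assuming every MoE method
    # exposes a supports_eplb property as introduced above.
    def check_eplb(method, enable_eplb: bool) -> None:
        if enable_eplb and not method.supports_eplb:
            raise NotImplementedError(
                f"EPLB is not supported for {method.__class__.__name__}"
            )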

vllm/model_executor/layers/quantization/gguf.py

Lines changed: 0 additions & 2 deletions
@@ -585,8 +585,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
 

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 0 additions & 2 deletions
@@ -741,8 +741,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `GPTQMarlinMoEMethod` yet."
