From c15cbdd8e07873554d25fdad14443a3339c4310c Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Sat, 5 Apr 2025 15:59:12 -0700
Subject: [PATCH] Add apply_router_weight_on_input to
 CompressedTensorsWNA16MoEMethod's apply

Signed-off-by: Lu Fang
---
 .../compressed_tensors/compressed_tensors_moe.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 681a25ed4d33..f573c8ae5131 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -783,6 +783,7 @@ def apply(
         custom_routing_function: Optional[Callable] = None,
         scoring_func: str = "softmax",
         e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
         assert activation == "silu", "Only SiLU activation is supported."
@@ -790,6 +791,10 @@ def apply(
             raise NotImplementedError(
                 "Expert Parallelism is not supported for "
                 "fused Marlin MoE method.")
+        if apply_router_weight_on_input:
+            raise NotImplementedError(
+                "Apply router weight on input is not supported for "
+                "fused Marlin MoE method.")
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
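
Note for reviewers: below is a minimal, self-contained sketch (not vLLM's
kernel code; run_experts and moe_combine are hypothetical stand-ins) of the
two weighting strategies the new parameter selects between. When
apply_router_weight_on_input is True, the top-k router weights scale the
token activations before the expert MLPs run; otherwise they scale the
expert outputs afterwards. The fused Marlin MoE path patched here supports
only the output-side variant, hence the new NotImplementedError guard.

    import torch

    def run_experts(x: torch.Tensor) -> torch.Tensor:
        # Stand-in for the routed expert MLPs; identity keeps the sketch small.
        return x

    def moe_combine(x: torch.Tensor,
                    topk_weights: torch.Tensor,
                    apply_router_weight_on_input: bool) -> torch.Tensor:
        # x: [num_tokens, hidden]; topk_weights: [num_tokens, 1], i.e. this
        # sketch assumes top_k == 1, the case where input-side weighting is
        # well defined as a single per-token scale.
        if apply_router_weight_on_input:
            # Input-side: scale activations before dispatching to the experts.
            return run_experts(x * topk_weights)
        # Output-side: the only ordering the fused Marlin method implements.
        return run_experts(x) * topk_weights

With top_k == 1 the two orderings coincide for purely linear experts, since
scaling commutes with a linear map, but they diverge once the expert MLP
applies a nonlinearity such as SiLU; that is why the kernel must support the
input-side ordering explicitly rather than silently reordering, and why the
patch rejects the flag instead of ignoring it.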