diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 681a25ed4d33..f573c8ae5131 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -783,6 +783,7 @@ def apply(
         custom_routing_function: Optional[Callable] = None,
         scoring_func: str = "softmax",
         e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
         assert activation == "silu", "Only SiLU activation is supported."
@@ -790,6 +791,10 @@ def apply(
             raise NotImplementedError(
                 "Expert Parallelism is not supported for "
                 "fused Marlin MoE method.")
+        if apply_router_weight_on_input:
+            raise NotImplementedError(
+                "Apply router weight on input is not supported for "
+                "fused Marlin MoE method.")
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
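
For context on what the new flag controls, below is a minimal, self-contained sketch of the two weighting orders in a generic (unfused) MoE combine. The moe_combine helper and its expert list are hypothetical illustrations written for this note, not vLLM APIs; the patch itself only threads the flag through and rejects it on the fused Marlin path.

import torch

def moe_combine(x: torch.Tensor,
                topk_weights: torch.Tensor,
                topk_ids: torch.Tensor,
                experts: list,
                apply_router_weight_on_input: bool = False) -> torch.Tensor:
    """Naive per-token MoE combine (hypothetical helper for illustration).

    x: (num_tokens, hidden); topk_weights/topk_ids: (num_tokens, top_k);
    experts: callables mapping hidden -> hidden.
    """
    out = torch.zeros_like(x)
    for t in range(x.shape[0]):
        for k in range(topk_ids.shape[1]):
            w = topk_weights[t, k]
            expert = experts[int(topk_ids[t, k])]
            if apply_router_weight_on_input:
                # Scale the token *before* the expert runs; the expert
                # outputs are then summed unweighted.
                out[t] += expert(w * x[t])
            else:
                # Standard path: weight the expert *output*.
                out[t] += w * expert(x[t])
    return out

# Example: four linear experts, top-2 routing over 3 tokens.
torch.manual_seed(0)
experts = [torch.nn.Linear(8, 8) for _ in range(4)]
x = torch.randn(3, 8)
topk_weights = torch.softmax(torch.randn(3, 2), dim=-1)
topk_ids = torch.randint(0, 4, (3, 2))
y = moe_combine(x, topk_weights, topk_ids, experts,
                apply_router_weight_on_input=True)

Note that the two orders agree only when the expert is a purely linear map; with biases or nonlinear activations they produce different results, which is presumably why a fused kernel that bakes the routing weight into its output epilogue must raise NotImplementedError rather than silently reorder the multiply.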