Commit c403a57

jiahanc committed
fallback logic
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
1 parent: b9a5763

File tree

1 file changed: +21 -1 lines

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 21 additions & 1 deletion
@@ -952,7 +952,27 @@ def apply(
 
         per_act_token = (
             self.input_quant.strategy == QuantizationStrategy.TOKEN)
-
+        per_channel_quant = (
+            self.weight_quant.strategy == QuantizationStrategy.CHANNEL)
+        if topk_ids.shape[0] <= 8:
+            from vllm.model_executor.layers.fused_moe import fused_experts
+            return fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                inplace=True,
+                activation=activation,
+                apply_router_weight_on_input=apply_router_weight_on_input,
+                use_fp8_w8a8=True,
+                per_channel_quant=per_channel_quant,
+                global_num_experts=global_num_experts,
+                expert_map=None if self.disable_expert_map else expert_map,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale)
         if self.fused_experts is None:
             # If no modular kernel is provided, use cutlass_moe_fp8
             from vllm.model_executor.layers.fused_moe.cutlass_moe import (
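The added branch implements the fallback named in the commit message: when very few tokens are routed (topk_ids.shape[0] <= 8), apply() returns early through the Triton-based fused_experts kernel instead of continuing into the CUTLASS FP8 MoE path below, presumably because the Triton path handles tiny batches better. A minimal, self-contained sketch of that dispatch pattern follows; the stub kernels and the SMALL_BATCH_THRESHOLD name are illustrative assumptions, not vLLM APIs.

import torch

SMALL_BATCH_THRESHOLD = 8  # mirrors the diff's `topk_ids.shape[0] <= 8` check


def triton_fused_experts_stub(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for vllm's Triton fused_experts kernel, the path the
    # commit falls back to for very small token counts.
    return x


def cutlass_moe_fp8_stub(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for the CUTLASS FP8 MoE path used for larger batches.
    return x


def apply_moe(x: torch.Tensor, topk_ids: torch.Tensor) -> torch.Tensor:
    # Dispatch on the number of routed tokens, as the committed change does.
    if topk_ids.shape[0] <= SMALL_BATCH_THRESHOLD:
        return triton_fused_experts_stub(x)
    return cutlass_moe_fp8_stub(x)


if __name__ == "__main__":
    x = torch.randn(4, 16)
    topk_ids = torch.zeros(4, 2, dtype=torch.int64)  # 4 tokens -> fallback path
    assert apply_moe(x, topk_ids).shape == x.shape

The threshold of 8 is taken verbatim from the diff; larger batches continue into the existing `if self.fused_experts is None:` CUTLASS branch.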
