File tree: 1 file changed, +21 -1 lines changed
vllm/model_executor/layers/quantization/compressed_tensors: 1 file changed, +21 -1 lines changed
Original file line number | Diff line number | Diff line change
@@ -952,7 +952,27 @@ def apply(
952952
953953 per_act_token = (
954954 self .input_quant .strategy == QuantizationStrategy .TOKEN )
955-
955+ per_channel_quant = (
956+ self .weight_quant .strategy == QuantizationStrategy .CHANNEL )
957+ if topk_ids .shape [0 ] <= 8 :
958+ from vllm .model_executor .layers .fused_moe import fused_experts
959+ return fused_experts (
960+ x ,
961+ layer .w13_weight ,
962+ layer .w2_weight ,
963+ topk_weights ,
964+ topk_ids ,
965+ inplace = True ,
966+ activation = activation ,
967+ apply_router_weight_on_input = apply_router_weight_on_input ,
968+ use_fp8_w8a8 = True ,
969+ per_channel_quant = per_channel_quant ,
970+ global_num_experts = global_num_experts ,
971+ expert_map = None if self .disable_expert_map else expert_map ,
972+ w1_scale = layer .w13_weight_scale ,
973+ w2_scale = layer .w2_weight_scale ,
974+ a1_scale = layer .w13_input_scale ,
975+ a2_scale = layer .w2_input_scale )
956976 if self .fused_experts is None :
957977 # If no modular kernel is provided, use cutlass_moe_fp8
958978 from vllm .model_executor .layers .fused_moe .cutlass_moe import (
You can’t perform that action at this time.
0 commit comments