@@ -1392,7 +1392,7 @@ def __init__(
13921392 "Only softmax scoring function is supported for non-grouped topk."
13931393 )
13941394
1395- moe = FusedMoEConfig (
1395+ self . moe_config : FusedMoEConfig = FusedMoEConfig (
13961396 num_experts = self .global_num_experts ,
13971397 experts_per_token = top_k ,
13981398 hidden_dim = hidden_size ,
@@ -1404,24 +1404,26 @@ def __init__(
             is_act_and_mul=is_act_and_mul,
             is_lora_enabled=vllm_config.lora_config is not None,
         )
-        self.moe_config: FusedMoEConfig = moe
+
         self.moe_quant_config: FusedMoEQuantConfig | None = None
         self.quant_config = quant_config
 
+        def _get_quant_method() -> FusedMoEMethodBase:
+            """
+            Helper method to ensure self.quant_method is never None and
+            of the proper type.
+            """
+            quant_method = None
+            if self.quant_config is not None:
+                quant_method = self.quant_config.get_quant_method(self, prefix)
+            if quant_method is None:
+                quant_method = UnquantizedFusedMoEMethod(self.moe_config)
+            assert isinstance(quant_method, FusedMoEMethodBase)
+            return quant_method
+
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
-        quant_method: QuantizeMethodBase | None = None
-        quant_method = (
-            UnquantizedFusedMoEMethod(moe)
-            if quant_config is None
-            else quant_config.get_quant_method(self, prefix)
-        )
-        if quant_method is None:
-            quant_method = UnquantizedFusedMoEMethod(moe)
-
-        assert quant_method is not None
-        assert isinstance(quant_method, FusedMoEMethodBase)
-        self.quant_method = quant_method
+        self.quant_method: FusedMoEMethodBase = _get_quant_method()
 
         if not self.moe_config.is_act_and_mul:
             # Avoid circular import
@@ -1441,20 +1443,17 @@ def __init__(
14411443 "is_act_and_mul=False is supported only for CUDA for now"
14421444 )
14431445
1444- if self .enable_eplb :
1445- from vllm .model_executor .layers .quantization .fp8 import Fp8MoEMethod
1446-
1447- if not isinstance (quant_method , (Fp8MoEMethod , UnquantizedFusedMoEMethod )):
1448- # TODO: Add support for additional quantization methods.
1449- # The implementation for other quantization methods does not
1450- # contain essential differences, but the current quant API
1451- # design causes duplicated work when extending to new
1452- # quantization methods, so I'm leaving it for now.
1453- # If you plan to add support for more quantization methods,
1454- # please refer to the implementation in `Fp8MoEMethod`.
1455- raise NotImplementedError (
1456- "EPLB is only supported for FP8 quantization for now."
1457- )
1446+ if self .enable_eplb and not self .quant_method .supports_eplb :
1447+ # TODO: Add support for additional quantization methods.
1448+ # The implementation for other quantization methods does not
1449+ # contain essential differences, but the current quant API
1450+ # design causes duplicated work when extending to new
1451+ # quantization methods, so I'm leaving it for now.
1452+ # If you plan to add support for more quantization methods,
1453+ # please refer to the implementation in `Fp8MoEMethod`.
1454+ raise NotImplementedError (
1455+ "EPLB is only supported for FP8 quantization for now."
1456+ )
14581457
14591458 moe_quant_params = {
14601459 "num_experts" : self .local_num_experts ,