Commit 16f7641

clean up object types and initialization
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent: 008081f

File tree

1 file changed: +27, -28 lines
  • vllm/model_executor/layers/fused_moe/layer.py


vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 27 additions & 28 deletions
@@ -1392,7 +1392,7 @@ def __init__(
                 "Only softmax scoring function is supported for non-grouped topk."
             )
 
-        moe = FusedMoEConfig(
+        self.moe_config: FusedMoEConfig = FusedMoEConfig(
             num_experts=self.global_num_experts,
             experts_per_token=top_k,
             hidden_dim=hidden_size,
@@ -1404,24 +1404,26 @@ def __init__(
             is_act_and_mul=is_act_and_mul,
             is_lora_enabled=vllm_config.lora_config is not None,
         )
-        self.moe_config: FusedMoEConfig = moe
+
         self.moe_quant_config: FusedMoEQuantConfig | None = None
         self.quant_config = quant_config
 
+        def _get_quant_method() -> FusedMoEMethodBase:
+            """
+            Helper method to ensure self.quant_method is never None and
+            of the proper type.
+            """
+            quant_method = None
+            if self.quant_config is not None:
+                quant_method = self.quant_config.get_quant_method(self, prefix)
+            if quant_method is None:
+                quant_method = UnquantizedFusedMoEMethod(self.moe_config)
+            assert isinstance(quant_method, FusedMoEMethodBase)
+            return quant_method
+
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
-        quant_method: QuantizeMethodBase | None = None
-        quant_method = (
-            UnquantizedFusedMoEMethod(moe)
-            if quant_config is None
-            else quant_config.get_quant_method(self, prefix)
-        )
-        if quant_method is None:
-            quant_method = UnquantizedFusedMoEMethod(moe)
-
-        assert quant_method is not None
-        assert isinstance(quant_method, FusedMoEMethodBase)
-        self.quant_method = quant_method
+        self.quant_method: FusedMoEMethodBase = _get_quant_method()
 
         if not self.moe_config.is_act_and_mul:
             # Avoid circular import
@@ -1441,20 +1443,17 @@ def __init__(
                 "is_act_and_mul=False is supported only for CUDA for now"
             )
 
-        if self.enable_eplb:
-            from vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod
-
-            if not isinstance(quant_method, (Fp8MoEMethod, UnquantizedFusedMoEMethod)):
-                # TODO: Add support for additional quantization methods.
-                # The implementation for other quantization methods does not
-                # contain essential differences, but the current quant API
-                # design causes duplicated work when extending to new
-                # quantization methods, so I'm leaving it for now.
-                # If you plan to add support for more quantization methods,
-                # please refer to the implementation in `Fp8MoEMethod`.
-                raise NotImplementedError(
-                    "EPLB is only supported for FP8 quantization for now."
-                )
+        if self.enable_eplb and not self.quant_method.supports_eplb:
+            # TODO: Add support for additional quantization methods.
+            # The implementation for other quantization methods does not
+            # contain essential differences, but the current quant API
+            # design causes duplicated work when extending to new
+            # quantization methods, so I'm leaving it for now.
+            # If you plan to add support for more quantization methods,
+            # please refer to the implementation in `Fp8MoEMethod`.
+            raise NotImplementedError(
+                "EPLB is only supported for FP8 quantization for now."
+            )
 
         moe_quant_params = {
             "num_experts": self.local_num_experts,

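Note on the EPLB guard: the rewritten condition consults self.quant_method.supports_eplb instead of the old isinstance check against Fp8MoEMethod and UnquantizedFusedMoEMethod. The flag itself is not part of this diff; the sketch below shows how such a class-level flag could be declared on FusedMoEMethodBase and opted into by subclasses. The simplified class bodies are illustrative assumptions, not vLLM's actual definitions.

# Sketch only: supports_eplb is assumed to be a class attribute defined
# elsewhere in the change set; these stripped-down classes illustrate the
# pattern, not vLLM's real implementations.
from abc import ABC, abstractmethod


class FusedMoEMethodBase(ABC):
    # Default to False so new quantization methods must opt in explicitly.
    supports_eplb: bool = False

    @abstractmethod
    def apply(self, layer, x):
        raise NotImplementedError


class UnquantizedFusedMoEMethod(FusedMoEMethodBase):
    supports_eplb = True  # was: isinstance(quant_method, UnquantizedFusedMoEMethod)

    def apply(self, layer, x):
        return x


class Fp8MoEMethod(FusedMoEMethodBase):
    supports_eplb = True  # was: isinstance(quant_method, Fp8MoEMethod)

    def apply(self, layer, x):
        return x


# The guard in __init__ then reduces to a single attribute lookup:
#     if self.enable_eplb and not self.quant_method.supports_eplb:
#         raise NotImplementedError(...)

A side effect of this shape is that the hot path no longer needs the local import of Fp8MoEMethod, and new quantization methods declare EPLB support where they are defined rather than by editing the layer's constructor.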