
Commit 3463197

remove uses of self.fused_experts
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 0cfe1a6 commit 3463197

File tree

14 files changed (+79, -282 lines)

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 30 additions & 27 deletions
@@ -111,7 +111,6 @@ def __init__(self, moe: FusedMoEConfig):
         super().__init__()
         self.moe = moe
         self.moe_quant_config: FusedMoEQuantConfig | None = None
-        self.fused_experts: FusedMoEModularKernel | None = None
         self.topk_indices_dtype = None
 
     @abstractmethod
@@ -254,9 +253,6 @@ def init_prepare_finalize(
             "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
         )
         assert self.topk_indices_dtype is None
-        assert self.fused_experts is None, (
-            f"Attempt to override experts for {id(self)}!"
-        )
         self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
         experts = self.select_gemm_impl(prepare_finalize, layer)
         return FusedMoEModularKernel(
@@ -287,7 +283,11 @@ def get_fused_moe_quant_config(
 
     @property
     def using_modular_kernel(self) -> bool:
-        return self.fused_experts is not None
+        return False
+
+    @property
+    def supports_eplb(self) -> bool:
+        return False
 
     @abstractmethod
     def apply(
@@ -330,10 +330,21 @@ def __init__(
         self.moe_quant_config = old_moe_method.moe_quant_config
         self.fused_experts = fused_experts
         self.topk_indices_dtype = old_moe_method.topk_indices_dtype
-
+        self.disable_expert_map = not fused_experts.supports_expert_map()
+        self.old_method_name = old_moe_method.__class__.__name__
+        self._supports_eplb = old_moe_method.supports_eplb
         if isinstance(old_moe_method, torch.nn.Module):
             self.load_state_dict(old_moe_method.state_dict())
-        logger.debug("Swapping out %s", old_moe_method.__class__.__name__)
+        logger.debug("Swapping out %s", self.old_method_name)
+
+    @property
+    def using_modular_kernel(self) -> bool:
+        return True
+
+    @property
+    @abstractmethod
+    def supports_eplb(self) -> bool:
+        return self._supports_eplb
 
     def create_weights(
         self,
@@ -374,12 +385,21 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is not None
-
         # Is getattr needed?
         zero_expert_num = getattr(layer, "zero_expert_num", 0)
         zero_expert_type = getattr(layer, "zero_expert_type", None)
 
+        if enable_eplb:
+            if not self.supports_eplb:
+                assert expert_load_view is not None
+                assert logical_to_physical_map is not None
+                assert logical_replica_count is not None
+                assert isinstance(layer, FusedMoE)
+            else:
+                raise NotImplementedError(
+                    f"EPLB is not supported for {self.old_method_name}"
+                )
+
         select_result = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -415,7 +435,7 @@ def apply(
             activation=activation,
             global_num_experts=global_num_experts,
             apply_router_weight_on_input=apply_router_weight_on_input,
-            expert_map=expert_map,
+            expert_map=None if self.disable_expert_map else expert_map,
         )
 
         if zero_expert_num != 0 and zero_expert_type is not None:
@@ -750,7 +770,6 @@ def forward_cuda(
         )
 
         if self.rocm_aiter_moe_enabled:
-            assert self.fused_experts is None
             result = self.rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -771,23 +790,7 @@ def forward_cuda(
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
-        elif self.fused_experts is not None:
-            if self.moe.has_bias:
-                raise ValueError("FusedMoEModularKernel does not support bias.")
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-            )
         else:
-            assert fused_experts is not None
             result = fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
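For readers skimming this diff: a minimal sketch (not vLLM code; class and argument names such as MoEMethodBase and ModularMethodWrapper are placeholders) of the capability-flag pattern the change moves to. Instead of callers probing a leftover self.fused_experts attribute, each MoE method reports its own capabilities, and the modular-kernel wrapper copies the flags from the method it replaces.

    # Minimal sketch, assuming only what the diff above shows; names are illustrative.
    from abc import ABC


    class MoEMethodBase(ABC):
        @property
        def using_modular_kernel(self) -> bool:
            return False  # plain methods run their own kernels

        @property
        def supports_eplb(self) -> bool:
            return False  # subclasses opt in explicitly


    class ModularMethodWrapper(MoEMethodBase):
        """Stands in for the method that wraps an old method plus a modular kernel."""

        def __init__(self, old_method: MoEMethodBase, kernel_supports_expert_map: bool):
            # Capture everything needed from the method being swapped out, since
            # the wrapper no longer inspects a fused_experts attribute at call time.
            self.disable_expert_map = not kernel_supports_expert_map
            self.old_method_name = old_method.__class__.__name__
            self._supports_eplb = old_method.supports_eplb

        @property
        def using_modular_kernel(self) -> bool:
            return True

        @property
        def supports_eplb(self) -> bool:
            return self._supports_eplb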

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 9 additions & 0 deletions
@@ -704,6 +704,15 @@ def __init__(
                 f"{fused_experts.activation_formats[0]}"
             )
 
+    def supports_expert_map(self) -> bool:
+        """
+        A flag indicating whether or not this class supports expert maps
+        """
+        return (
+            self.prepare_finalize.num_dispatchers() <= 1
+            and self.fused_experts.supports_expert_map()
+        )
+
     def output_is_reduced(self) -> bool:
         """
         Indicates whether or not the output of fused MoE kernel
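A hedged usage sketch: the only new API here is supports_expert_map(); the helper below and its argument names are hypothetical, showing how a caller could use the flag to drop expert_map when the kernel cannot honor it (more than one dispatcher, or an experts implementation without expert-map support).

    # Hypothetical helper, not part of the commit; only supports_expert_map()
    # comes from the diff above.
    def maybe_expert_map(kernel, expert_map):
        """Pass expert_map through only when the modular kernel can apply it."""
        return expert_map if kernel.supports_expert_map() else None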

vllm/model_executor/layers/quantization/awq_marlin.py

Lines changed: 0 additions & 2 deletions
@@ -582,8 +582,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
 

vllm/model_executor/layers/quantization/bitsandbytes.py

Lines changed: 1 addition & 2 deletions
@@ -518,12 +518,11 @@ def apply(
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `BitsAndBytesMoEMethod` yet."
             )
+
         topk_weights, topk_ids, _ = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 0 additions & 47 deletions
@@ -456,12 +456,7 @@ def apply(
             indices_type=self.topk_indices_dtype,
         )
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin.
-        #
         if self.use_marlin:
-            assert self.fused_experts is None
             return torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -482,24 +477,6 @@ def apply(
                 workspace=layer.workspace,
             )
 
-        elif self.fused_experts is not None:
-            assert is_valid_flashinfer_cutlass_fused_moe(
-                x, layer.w13_weight, layer.w2_weight
-            ), "Flashinfer CUTLASS Fused MoE not applicable!"
-
-            return self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=False,  # TODO(shuw): fix later, now output is high prec
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=expert_map,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-            )
-
         # FlashInfer fused experts path
         elif self.allow_flashinfer:
             from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (  # noqa: E501
@@ -1060,13 +1037,8 @@ def apply(
         per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
         per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL
 
-        #
-        # Note: the order here is important. self.fused_experts can override
-        # cutlass fp8 or fused_experts but not marlin or rocm.
-        #
         if self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
-            assert self.fused_experts is None
             return torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1092,7 +1064,6 @@ def apply(
 
             assert per_act_token == per_channel_quant
             assert self.moe_quant_config is not None
-            assert self.fused_experts is None
             return rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -1105,18 +1076,6 @@ def apply(
                 quant_config=self.moe_quant_config,
             )
 
-        elif self.fused_experts is not None:
-            return self.fused_experts(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                expert_map=None if self.disable_expert_map else expert_map,
-            )
-
         # cutlass path
         elif self.use_cutlass:
             assert self.moe_quant_config is not None
@@ -1312,8 +1271,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
@@ -1630,8 +1587,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
@@ -1895,8 +1850,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet."

vllm/model_executor/layers/quantization/experts_int8.py

Lines changed: 0 additions & 2 deletions
@@ -158,8 +158,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ExpertsInt8MoEMethod` yet."

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 5 additions & 26 deletions
@@ -584,9 +584,6 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
         self.quant_config = quant_config
         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant: bool = self.weight_block_size is not None
-
-        self.fused_experts: mk.FusedMoEModularKernel | None = None  # type: ignore
-
         self.fp8_backend = get_fp8_moe_backend(self.block_quant)
 
         self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
@@ -1062,6 +1059,10 @@ def get_fused_moe_quant_config(
             block_shape=self.weight_block_size,
         )
 
+    @property
+    def supports_eplb(self) -> bool:
+        return True
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -1091,10 +1092,7 @@ def apply(
             assert logical_replica_count is not None
             assert isinstance(layer, FusedMoE)
 
-        if (
-            self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
-            and self.fused_experts is None
-        ):
+        if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             assert activation == "silu", (
                 f"Expected 'silu' activation but got {activation}"
             )
@@ -1170,18 +1168,13 @@ def apply(
             zero_expert_type=zero_expert_type,
         )
 
-        #
-        # Note: the order of checks is important since self.fused_experts
-        # can override fused_experts or cutlass but not rocm or marlin.
-        #
         topk_weights, topk_ids, zero_expert_result = select_result
 
         if self.rocm_aiter_moe_enabled:
             from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
                 rocm_aiter_fused_experts,
             )
 
-            assert self.fused_experts is None
             result = rocm_aiter_fused_experts(
                 x,
                 layer.w13_weight,
@@ -1195,7 +1188,6 @@ def apply(
             )
         elif self.use_marlin:
             assert activation == "silu", f"{activation} not supported for Marlin MoE."
-            assert self.fused_experts is None
             result = torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1213,19 +1205,6 @@ def apply(
                 expert_map=expert_map,
                 workspace=layer.workspace,
             )
-        elif self.fused_experts:
-            result = self.fused_experts(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                inplace=True,
-                activation=activation,
-                global_num_experts=global_num_experts,
-                apply_router_weight_on_input=apply_router_weight_on_input,
-                expert_map=expert_map,
-            )
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
             assert not self.block_quant
             assert not renormalize and custom_routing_function is not None
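With Fp8MoEMethod now advertising supports_eplb = True, an EPLB guard can be expressed as a capability check instead of an assert on self.fused_experts. The function below is an illustrative sketch, not code from this commit; `method` stands in for either the FP8 method or the modular wrapper.

    # Illustrative only: a capability-based EPLB guard, assuming every MoE method
    # exposes a supports_eplb property as introduced above.
    def check_eplb(method, enable_eplb: bool) -> None:
        if enable_eplb and not method.supports_eplb:
            raise NotImplementedError(
                f"EPLB is not supported for {method.__class__.__name__}"
            )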

vllm/model_executor/layers/quantization/gguf.py

Lines changed: 0 additions & 2 deletions
@@ -585,8 +585,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
 

vllm/model_executor/layers/quantization/gptq_marlin.py

Lines changed: 0 additions & 2 deletions
@@ -741,8 +741,6 @@ def apply(
         logical_to_physical_map: torch.Tensor | None = None,
         logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.fused_experts is None
-
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `GPTQMarlinMoEMethod` yet."
