
Commit 84b48b3

move SharedFusedMoE and clean up imports

Signed-off-by: Bill Nell <bnell@redhat.com>

1 parent 864355a

File tree

14 files changed (+34 -43 lines)

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -10,6 +10,8 @@
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEActivationFormat, FusedMoEPermuteExpertsUnpermute,
     FusedMoEPrepareAndFinalize)
+from vllm.model_executor.layers.fused_moe.shared_fused_moe import (
+    SharedFusedMoE)
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
 from vllm.triton_utils import HAS_TRITON
@@ -37,6 +39,7 @@ def get_config() -> Optional[dict[str, Any]]:
     "FusedMoEPermuteExpertsUnpermute",
     "FusedMoEActivationFormat",
     "FusedMoEPrepareAndFinalize",
+    "SharedFusedMoE",
     "activation_without_mul",
     "override_config",
     "get_config",
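
With the re-export above in place, SharedFusedMoE can be imported straight from the fused_moe package, which is what every model file in this commit switches to. A minimal usage sketch under that assumption; the wrapper function and the num_experts argument are illustrative and not part of this commit:

    # Sketch only: the new import path plus the classmethod call pattern the
    # model files below adopt. The keyword names match the hunks in this
    # commit; the wrapper function and num_experts parameter are assumptions.
    from vllm.model_executor.layers.fused_moe import SharedFusedMoE

    def build_expert_mapping(num_experts: int) -> list[tuple[str, str, int, str]]:
        # (param_name, weight_name, expert_id, shard_id) tuples, as used by the
        # get_expert_mapping()/load_weights() call sites in the model files.
        return SharedFusedMoE.make_expert_params_mapping(
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=num_experts,  # assumed keyword; check the real signature
        )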

vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py renamed to vllm/model_executor/layers/fused_moe/shared_fused_moe.py

Lines changed: 4 additions & 0 deletions

@@ -24,6 +24,10 @@ def __init__(
     ):
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
+        # Disable shared expert overlap if EP is disabled or we are not using
+        # flashinfer + DP since there is nothing to be gained in this case
+        # and it prevents the shared experts from being hidden from
+        # torch.compile.
         self.use_overlapped = use_overlapped and not (
             self.use_ep or self.use_flashinfer_cutlass_kernels
         ) and self.shared_experts is not None
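
The four added lines are a comment documenting the overlap condition that already follows them. As a standalone sketch, here is the same predicate with the instance attributes spelled out as parameters (the helper itself is hypothetical, not part of the commit):

    def resolve_use_overlapped(
        use_overlapped: bool,
        use_ep: bool,
        use_flashinfer_cutlass_kernels: bool,
        has_shared_experts: bool,
    ) -> bool:
        # Mirrors the expression in the hunk above: overlap remains enabled
        # only when the caller requested it, neither of the listed kernel
        # paths applies, and shared experts actually exist.
        return (
            use_overlapped
            and not (use_ep or use_flashinfer_cutlass_kernels)
            and has_shared_experts
        )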

vllm/model_executor/layers/shared_fused_moe/__init__.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

vllm/model_executor/models/aria.py

Lines changed: 1 addition & 1 deletion

@@ -12,11 +12,11 @@
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_rank
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)

vllm/model_executor/models/bailing_moe.py

Lines changed: 2 additions & 3 deletions

@@ -39,15 +39,14 @@
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -480,7 +479,7 @@ def forward(
         return hidden_states

     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",

vllm/model_executor/models/deepseek_v2.py

Lines changed: 3 additions & 4 deletions

@@ -46,7 +46,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                MergedColumnParallelLinear,
@@ -58,7 +58,6 @@
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -1206,7 +1205,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                config.first_k_dense_replace)
         self.num_expert_groups = config.n_group

-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers: list[SharedFusedMoE] = []
         example_moe = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
@@ -1295,7 +1294,7 @@ def load_weights(self, weights: Iterable[tuple[str,

         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",

vllm/model_executor/models/dots1.py

Lines changed: 2 additions & 3 deletions

@@ -39,7 +39,7 @@
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
@@ -48,7 +48,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -414,7 +413,7 @@ def forward(
         return hidden_states

     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",

vllm/model_executor/models/ernie45_moe.py

Lines changed: 2 additions & 3 deletions

@@ -36,7 +36,7 @@
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
@@ -45,7 +45,6 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -437,7 +436,7 @@ def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:

         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",

vllm/model_executor/models/ernie45_vl_moe.py

Lines changed: 2 additions & 3 deletions

@@ -35,7 +35,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                ReplicatedLinear,
@@ -44,7 +44,6 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import (
     Ernie4_5_VLRotaryEmbedding)
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -622,7 +621,7 @@ def load_weights(self, weights: Iterable[tuple[str,

         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+        expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",

vllm/model_executor/models/glm4_moe.py

Lines changed: 3 additions & 4 deletions

@@ -38,15 +38,14 @@
                               get_tensor_model_parallel_world_size)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -481,7 +480,7 @@ def make_empty_intermediate_tensors(
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        return FusedMoE.make_expert_params_mapping(
+        return SharedFusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -630,7 +629,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                                config.first_k_dense_replace)
         self.num_expert_groups = config.n_group

-        self.moe_layers: list[FusedMoE] = []
+        self.moe_layers: list[SharedFusedMoE] = []
         example_moe = None
         for layer in self.model.layers:
             if isinstance(layer, PPMissingLayer):
