
Commit fd12cf5

lengrongfu authored and hmellor committed

[Feature] use --eplb_config to set eplb param (vllm-project#20562)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: rongfu.leng <lenronfu@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

1 parent 06885de commit fd12cf5

9 files changed: +149, -52 lines

vllm/config/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -33,7 +33,8 @@
                                     PrefixCachingHashAlgo)
 from vllm.config.compilation import (CompilationConfig, CompilationLevel,
                                      CUDAGraphMode, PassConfig)
-from vllm.config.parallel import DistributedExecutorBackend, ParallelConfig
+from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
+                                  ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
 from vllm.config.utils import ConfigType, config
 from vllm.logger import init_logger

vllm/config/parallel.py

Lines changed: 87 additions & 21 deletions

@@ -6,7 +6,7 @@
 from typing import TYPE_CHECKING, Any, Literal, Optional, Union

 import torch
-from pydantic import model_validator
+from pydantic import TypeAdapter, model_validator
 from pydantic.dataclasses import dataclass
 from torch.distributed import ProcessGroup, ReduceOp
 from typing_extensions import Self

@@ -32,6 +32,38 @@
 DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]


+@config
+@dataclass
+class EPLBConfig:
+    """Configuration for Expert Parallel Load Balancing (EP)."""
+
+    window_size: int = 1000
+    """Window size for expert load recording."""
+    step_interval: int = 3000
+    """
+    Interval for rearranging experts in expert parallelism.
+
+    Note that if this is greater than the EPLB window size, only the metrics
+    of the last `lb_window_size` steps will be used for rearranging experts.
+    """
+
+    num_redundant_experts: int = 0
+    """Number of redundant experts to use for expert parallelism."""
+
+    log_balancedness: bool = False
+    """
+    Log the balancedness each step of expert parallelism.
+    This is turned off by default since it will cause communication overhead.
+    """
+
+    @classmethod
+    def from_cli(cls, cli_value: str) -> "EPLBConfig":
+        """Parse the CLI value for the compilation config.
+        -O1, -O2, -O3, etc. is handled in FlexibleArgumentParser.
+        """
+        return TypeAdapter(EPLBConfig).validate_json(cli_value)
+
+
 @config
 @dataclass
 class ParallelConfig:

@@ -75,22 +107,24 @@ class ParallelConfig:
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
     enable_eplb: bool = False
     """Enable expert parallelism load balancing for MoE layers."""
-    num_redundant_experts: int = 0
-    """Number of redundant experts to use for expert parallelism."""
-    eplb_window_size: int = 1000
-    """Window size for expert load recording."""
-    eplb_step_interval: int = 3000
-    """
-    Interval for rearranging experts in expert parallelism.
-
-    Note that if this is greater than the EPLB window size, only the metrics
-    of the last `eplb_window_size` steps will be used for rearranging experts.
-    """
-    eplb_log_balancedness: bool = False
-    """
-    Log the balancedness each step of expert parallelism.
-    This is turned off by default since it will cause communication overhead.
-    """
+    eplb_config: EPLBConfig = field(default_factory=EPLBConfig)
+    """Expert parallelism configuration."""
+    num_redundant_experts: Optional[int] = None
+    """`num_redundant_experts` is deprecated and has been replaced with
+    `eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
+    Please use `eplb_config.num_redundant_experts` instead."""
+    eplb_window_size: Optional[int] = None
+    """`eplb_window_size` is deprecated and has been replaced with
+    `eplb_config.window_size`. This will be removed in v0.12.0.
+    Please use `eplb_config.window_size` instead."""
+    eplb_step_interval: Optional[int] = None
+    """`eplb_step_interval` is deprecated and has been replaced with
+    `eplb_config.step_interval`. This will be removed in v0.12.0.
+    Please use `eplb_config.step_interval` instead."""
+    eplb_log_balancedness: Optional[bool] = None
+    """`eplb_log_balancedness` is deprecated and has been replaced with
+    `eplb_config.log_balancedness`. This will be removed in v0.12.0.
+    Please use `eplb_config.log_balancedness` instead."""

     max_parallel_loading_workers: Optional[int] = None
     """Maximum number of parallel loading workers when loading model

@@ -237,6 +271,38 @@ def compute_hash(self):
         return hashlib.sha256(str(factors).encode()).hexdigest()

     def __post_init__(self) -> None:
+        # Forward deprecated fields to their new location
+        if self.num_redundant_experts is not None:
+            self.eplb_config.num_redundant_experts = (
+                self.num_redundant_experts)
+            logger.warning_once(
+                "num_redundant_experts is deprecated and has been replaced "
+                "with eplb_config.num_redundant_experts. This will be removed "
+                "in v0.12.0. Changing this field after initialization will "
+                "have no effect.")
+        if self.eplb_window_size is not None:
+            self.eplb_config.window_size = self.eplb_window_size
+            logger.warning_once(
+                "eplb_window_size is deprecated and has been replaced "
+                "with eplb_config.window_size. This will be removed "
+                "in v0.12.0. Changing this field after initialization will "
+                "have no effect.")
+        if self.eplb_step_interval is not None:
+            self.eplb_config.step_interval = self.eplb_step_interval
+            logger.warning_once(
+                "eplb_step_interval is deprecated and has been replaced "
+                "with eplb_config.step_interval. This will be removed "
+                "in v0.12.0. Changing this field after initialization will "
+                "have no effect.")
+        if self.eplb_log_balancedness is not None:
+            self.eplb_config.log_balancedness = self.eplb_log_balancedness
+            logger.warning_once(
+                "eplb_log_balancedness is deprecated and has been replaced "
+                "with eplb_config.log_balancedness. This will be removed "
+                "in v0.12.0. Changing this field after initialization will "
+                "have no effect.")
+
+        # Continue with the rest of the initialization
         self.world_size = self.pipeline_parallel_size * \
             self.tensor_parallel_size

@@ -275,10 +341,10 @@ def __post_init__(self) -> None:
                 raise ValueError(
                     "Expert parallelism load balancing is only supported on "
                     "CUDA devices now.")
-            if self.num_redundant_experts < 0:
+            if self.eplb_config.num_redundant_experts < 0:
                 raise ValueError(
                     "num_redundant_experts must be non-negative, but got "
-                    f"{self.num_redundant_experts}.")
+                    f"{self.eplb_config.num_redundant_experts}.")
             if not self.enable_expert_parallel:
                 raise ValueError(
                     "enable_expert_parallel must be True to use EPLB.")

@@ -289,10 +355,10 @@ def __post_init__(self) -> None:
                     f"TP={self.tensor_parallel_size},DP={self.data_parallel_size}."
                 )
         else:
-            if self.num_redundant_experts != 0:
+            if self.eplb_config.num_redundant_experts != 0:
                 raise ValueError(
                     "num_redundant_experts should be used with EPLB."
-                    f"{self.num_redundant_experts}.")
+                    f"{self.eplb_config.num_redundant_experts}.")
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
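
Editor's note: a minimal sketch (not part of the diff) of how the EPLBConfig added above is expected to behave, exercising only the dataclass defaults and the from_cli hook shown in this file; it assumes a vLLM build that contains this commit.

# Minimal sketch, based only on the EPLBConfig dataclass and from_cli above.
from vllm.config.parallel import EPLBConfig

# from_cli validates a JSON string via pydantic's TypeAdapter; keys that are
# omitted fall back to the dataclass defaults.
cfg = EPLBConfig.from_cli('{"window_size": 2000, "step_interval": 4000}')
assert cfg.window_size == 2000
assert cfg.step_interval == 4000
assert cfg.num_redundant_experts == 0   # default preserved
assert cfg.log_balancedness is False    # default preserved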

vllm/distributed/eplb/eplb_state.py

Lines changed: 2 additions & 2 deletions

@@ -244,7 +244,7 @@ def build(
             dtype=torch.int32,
             device=device,
         )
-        expert_load_window_size = parallel_config.eplb_window_size
+        expert_load_window_size = parallel_config.eplb_config.window_size
         expert_load_window = torch.zeros(
             (expert_load_window_size, model.num_moe_layers,
              model.num_physical_experts),

@@ -253,7 +253,7 @@ def build(
         )

         # Set the initial progress of rearrangement to 3/4
-        eplb_step_interval = parallel_config.eplb_step_interval
+        eplb_step_interval = parallel_config.eplb_config.step_interval
         expert_rearrangement_step = max(
             0, eplb_step_interval - eplb_step_interval // 4)
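
Editor's note: a quick worked example of the "initial progress of rearrangement to 3/4" line above, using the default step_interval from EPLBConfig rather than values taken from the diff.

# Worked example with the default step_interval from EPLBConfig.
step_interval = 3000
expert_rearrangement_step = max(0, step_interval - step_interval // 4)
print(expert_rearrangement_step)  # 2250: the first rearrangement triggers
                                  # after only 750 steps instead of 3000.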

vllm/engine/arg_utils.py

Lines changed: 46 additions & 17 deletions

@@ -25,7 +25,7 @@
 from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                          ConfigFormat, ConfigType, ConvertOption,
                          DecodingConfig, DetailedTraceModules, Device,
-                         DeviceConfig, DistributedExecutorBackend,
+                         DeviceConfig, DistributedExecutorBackend, EPLBConfig,
                          GuidedDecodingBackend, HfOverrides, KVEventsConfig,
                          KVTransferConfig, LoadConfig, LogprobsMode,
                          LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,

@@ -305,11 +305,12 @@ class EngineArgs:
     data_parallel_hybrid_lb: bool = False
     data_parallel_backend: str = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
+    eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
     enable_eplb: bool = ParallelConfig.enable_eplb
-    num_redundant_experts: int = ParallelConfig.num_redundant_experts
-    eplb_window_size: int = ParallelConfig.eplb_window_size
-    eplb_step_interval: int = ParallelConfig.eplb_step_interval
-    eplb_log_balancedness: bool = ParallelConfig.eplb_log_balancedness
+    num_redundant_experts: int = EPLBConfig.num_redundant_experts
+    eplb_window_size: int = EPLBConfig.window_size
+    eplb_step_interval: int = EPLBConfig.step_interval
+    eplb_log_balancedness: bool = EPLBConfig.log_balancedness
     max_parallel_loading_workers: Optional[
         int] = ParallelConfig.max_parallel_loading_workers
     block_size: Optional[BlockSize] = CacheConfig.block_size

@@ -454,6 +455,9 @@ def __post_init__(self):
         if isinstance(self.compilation_config, dict):
             self.compilation_config = CompilationConfig(
                 **self.compilation_config)
+        if isinstance(self.eplb_config, dict):
+            self.eplb_config = EPLBConfig.from_cli(json.dumps(
+                self.eplb_config))
         # Setup plugins
         from vllm.plugins import load_general_plugins
         load_general_plugins()

@@ -661,14 +665,32 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                     **parallel_kwargs["enable_expert_parallel"])
         parallel_group.add_argument("--enable-eplb",
                                     **parallel_kwargs["enable_eplb"])
-        parallel_group.add_argument("--num-redundant-experts",
-                                    **parallel_kwargs["num_redundant_experts"])
-        parallel_group.add_argument("--eplb-window-size",
-                                    **parallel_kwargs["eplb_window_size"])
-        parallel_group.add_argument("--eplb-step-interval",
-                                    **parallel_kwargs["eplb_step_interval"])
-        parallel_group.add_argument("--eplb-log-balancedness",
-                                    **parallel_kwargs["eplb_log_balancedness"])
+        parallel_group.add_argument("--eplb-config",
+                                    **parallel_kwargs["eplb_config"])
+        parallel_group.add_argument(
+            "--num-redundant-experts",
+            type=int,
+            help=
+            "[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.",
+            deprecated=True)
+        parallel_group.add_argument(
+            "--eplb-window-size",
+            type=int,
+            help="[DEPRECATED] --eplb-window-size will be removed in v0.12.0.",
+            deprecated=True)
+        parallel_group.add_argument(
+            "--eplb-step-interval",
+            type=int,
+            help=
+            "[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.",
+            deprecated=True)
+        parallel_group.add_argument(
+            "--eplb-log-balancedness",
+            action=argparse.BooleanOptionalAction,
+            help=
+            "[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.",
+            deprecated=True)
+
         parallel_group.add_argument(
             "--max-parallel-loading-workers",
             **parallel_kwargs["max_parallel_loading_workers"])

@@ -1244,6 +1266,16 @@ def create_engine_config(
                 "Currently, speculative decoding is not supported with "
                 "async scheduling.")

+        # Forward the deprecated CLI args to the EPLB config.
+        if self.num_redundant_experts is not None:
+            self.eplb_config.num_redundant_experts = self.num_redundant_experts
+        if self.eplb_window_size is not None:
+            self.eplb_config.window_size = self.eplb_window_size
+        if self.eplb_step_interval is not None:
+            self.eplb_config.step_interval = self.eplb_step_interval
+        if self.eplb_log_balancedness is not None:
+            self.eplb_config.log_balancedness = self.eplb_log_balancedness
+
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,

@@ -1257,10 +1289,7 @@ def create_engine_config(
             data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
             enable_expert_parallel=self.enable_expert_parallel,
             enable_eplb=self.enable_eplb,
-            num_redundant_experts=self.num_redundant_experts,
-            eplb_window_size=self.eplb_window_size,
-            eplb_step_interval=self.eplb_step_interval,
-            eplb_log_balancedness=self.eplb_log_balancedness,
+            eplb_config=self.eplb_config,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
             disable_custom_all_reduce=self.disable_custom_all_reduce,
             ray_workers_use_nsight=self.ray_workers_use_nsight,
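
Editor's note: putting the arg_utils.py changes together, a hedged sketch of how callers pass the grouped settings after this commit (assuming an installed vLLM; the model name is a hypothetical choice). A dict assigned to EngineArgs.eplb_config is converted through EPLBConfig.from_cli in __post_init__, mirroring a JSON object passed to --eplb-config on the command line, while the old per-field flags remain accepted but deprecated until v0.12.0.

# Hedged sketch: the grouped EPLB settings on EngineArgs after this commit.
from vllm.config.parallel import EPLBConfig
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="deepseek-ai/DeepSeek-V2-Lite",   # hypothetical model choice
    enable_expert_parallel=True,
    enable_eplb=True,
    # Equivalent to:
    #   --eplb-config '{"num_redundant_experts": 2, "window_size": 1000}'
    eplb_config={"num_redundant_experts": 2, "window_size": 1000},
)
assert isinstance(args.eplb_config, EPLBConfig)
assert args.eplb_config.num_redundant_experts == 2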

vllm/model_executor/models/deepseek_v2.py

Lines changed: 2 additions & 2 deletions

@@ -132,10 +132,10 @@ def __init__(

         # Load balancing settings.
         vllm_config = get_current_vllm_config()
-        parallel_config = vllm_config.parallel_config
+        eplb_config = vllm_config.parallel_config.eplb_config
         self.enable_eplb = enable_eplb

-        self.n_redundant_experts = parallel_config.num_redundant_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_logical_experts = self.n_routed_experts
         self.n_physical_experts = (self.n_logical_experts +
                                    self.n_redundant_experts)

vllm/model_executor/models/glm4_moe.py

Lines changed: 2 additions & 2 deletions

@@ -131,10 +131,10 @@ def __init__(

         # Load balancing settings.
         vllm_config = get_current_vllm_config()
-        parallel_config = vllm_config.parallel_config
+        eplb_config = vllm_config.parallel_config.eplb_config
         self.enable_eplb = enable_eplb

-        self.n_redundant_experts = parallel_config.num_redundant_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_logical_experts = self.n_routed_experts
         self.n_physical_experts = (self.n_logical_experts +
                                    self.n_redundant_experts)

vllm/model_executor/models/qwen3_moe.py

Lines changed: 4 additions & 3 deletions

@@ -121,11 +121,11 @@ def __init__(

         # Load balancing settings.
         vllm_config = get_current_vllm_config()
-        parallel_config = vllm_config.parallel_config
+        eplb_config = vllm_config.parallel_config.eplb_config
         self.enable_eplb = enable_eplb

         self.n_logical_experts = self.n_routed_experts
-        self.n_redundant_experts = parallel_config.num_redundant_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_physical_experts = (self.n_logical_experts +
                                    self.n_redundant_experts)
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size

@@ -363,7 +363,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         parallel_config = vllm_config.parallel_config
         enable_eplb = parallel_config.enable_eplb
-        self.num_redundant_experts = parallel_config.num_redundant_experts
+        eplb_config = parallel_config.eplb_config
+        self.num_redundant_experts = eplb_config.num_redundant_experts

         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
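
Editor's note: the model-side edits in deepseek_v2.py, glm4_moe.py, and qwen3_moe.py are mechanical renames, but the bookkeeping they feed is worth spelling out. A small worked example with hypothetical numbers (not taken from the diff):

# Hypothetical numbers illustrating the expert-count bookkeeping above.
n_routed_experts = 64        # logical experts defined by the model config
num_redundant_experts = 8    # now read from eplb_config.num_redundant_experts
ep_size = 8                  # expert-parallel world size

n_logical_experts = n_routed_experts
n_physical_experts = n_logical_experts + num_redundant_experts   # 72
n_local_physical_experts = n_physical_experts // ep_size         # 9 per rank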

vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 2 deletions

@@ -1435,7 +1435,7 @@ def eplb_step(self,
             model,
             is_dummy,
             is_profile,
-            log_stats=self.parallel_config.eplb_log_balancedness,
+            log_stats=self.parallel_config.eplb_config.log_balancedness,
         )

     def get_dp_padding(self,

@@ -1977,7 +1977,7 @@ def load_model(self, eep_scale_up: bool = False) -> None:
             global_expert_load, old_global_expert_indices = (
                 EplbState.recv_state())
             num_logical_experts = global_expert_load.shape[1]
-            self.parallel_config.num_redundant_experts = (
+            self.parallel_config.eplb_config.num_redundant_experts = (
                 num_local_physical_experts * new_ep_size - num_logical_experts)
             assert old_global_expert_indices.shape[
                 1] % num_local_physical_experts == 0
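
Editor's note: for the elastic expert-parallel path above, a worked example with hypothetical sizes of the formula used to recompute eplb_config.num_redundant_experts after scale-up (continuing the numbers from the previous sketch; the scenario is illustrative only).

# Hypothetical sizes plugged into the recomputation in load_model above.
num_local_physical_experts = 9   # experts hosted per rank (from received state)
new_ep_size = 16                 # expert-parallel size after scale-up
num_logical_experts = 64         # global_expert_load.shape[1]

num_redundant_experts = (
    num_local_physical_experts * new_ep_size - num_logical_experts)  # 80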

vllm/v1/worker/gpu_worker.py

Lines changed: 2 additions & 2 deletions

@@ -515,7 +515,7 @@ def _reconfigure_moe(self, old_ep_size: int,
             assert self.model_runner.eplb_state is not None
             new_physical_experts = \
                 self.model_runner.eplb_state.physical_to_logical_map.shape[1]
-            parallel_config.num_redundant_experts = (
+            parallel_config.eplb_config.num_redundant_experts = (
                 new_physical_experts -
                 self.model_runner.eplb_state.logical_replica_count.shape[1])
             global_expert_load = None

@@ -531,7 +531,7 @@ def _reconfigure_moe(self, old_ep_size: int,
             assert self.model_runner.eplb_state is not None
             global_expert_load = self.model_runner.eplb_state.rearrange(
                 self.model_runner.model, execute_shuffle=False)
-            parallel_config.num_redundant_experts = (
+            parallel_config.eplb_config.num_redundant_experts = (
                 new_physical_experts - global_expert_load.shape[1])
             prepare_communication_buffer_for_model(self.model_runner.model)
             self.model_runner.model.update_physical_experts_metadata(
