diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 33e831e54bbc..1aeef0fd5bd8 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -139,7 +139,7 @@ async def get_request( A lower burstiness value (0 < burstiness < 1) results in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. - ramp_up_strategy (optional): + ramp_up_strategy (optional): The ramp-up strategy. Can be "linear" or "exponential". If None, uses constant request rate (specified by request_rate). ramp_up_start_rps (optional): diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 8f8baa7d59db..3e318d784832 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -337,11 +337,12 @@ def step(self, Args: model (MixtureOfExperts): The MoE model. is_dummy (bool): If `True`, this is a dummy step and the load - metrics recorded in this forward pass will not count. Defaults - to `False`. + metrics recorded in this forward pass will not count. + Defaults to `False`. is_profile (bool): If `True`, perform a dummy rearrangement - with maximum communication cost. This is used in `profile_run` - to reserve enough memory for the communication buffer. + with maximum communication cost. This is used in + `profile_run` to reserve enough memory + for the communication buffer. log_stats (bool): If `True`, log the expert load metrics. # Stats diff --git a/vllm/distributed/eplb/rebalance_algo.py b/vllm/distributed/eplb/rebalance_algo.py index 3564a10dfc68..fc43dbe3b653 100644 --- a/vllm/distributed/eplb/rebalance_algo.py +++ b/vllm/distributed/eplb/rebalance_algo.py @@ -109,13 +109,16 @@ def rebalance_experts_hierarchical( num_physical_experts: number of physical experts after replication num_groups: number of expert groups num_nodes: number of server nodes, where the intra-node network - (e.g, NVLink) is faster + (e.g., NVLink) is faster num_gpus: number of GPUs, must be a multiple of `num_nodes` Returns: - physical_to_logical_map: [num_moe_layers, num_physical_experts] - logical_to_physical_map: [num_moe_layers, num_logical_experts, X] - logical_count: [num_moe_layers, num_logical_experts] + physical_to_logical_map (torch.Tensor): + [num_moe_layers, num_physical_experts] + logical_to_physical_map (torch.Tensor): + [num_moe_layers, num_logical_experts, X] + logical_count (torch.Tensor): + [num_moe_layers, num_logical_experts] """ num_layers, num_logical_experts = weight.shape assert num_logical_experts % num_groups == 0 @@ -197,11 +200,13 @@ def rebalance_experts( num_gpus: number of GPUs, must be a multiple of `num_nodes` Returns: - physical_to_logical_map: [layers, num_replicas], the expert index of - each replica - logical_to_physical_map: [layers, num_logical_experts, X], the replica - indices for each expert - expert_count: [layers, num_logical_experts], number of physical + physical_to_logical_map: + [layers, num_replicas], the expert index of each replica + logical_to_physical_map: + [layers, num_logical_experts, X], the replica indices for each + expert + expert_count: + [layers, num_logical_experts], number of physical replicas for each logical expert """ num_layers, num_logical_experts = weight.shape