Skip to content

Commit b706d89

Browse files
Authored by Cody Yu <hao.yu.cody@gmail.com>
[Bugfix][V1][PP] Only warmup sampler at last PP rank (#14643)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
1 parent 863d315 commit b706d89

File tree

1 file changed: +17 additions, -14 deletions

vllm/v1/worker/gpu_worker.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from vllm.distributed import (ensure_model_parallel_initialized,
1515
init_distributed_environment,
1616
set_custom_all_reduce)
17+
from vllm.distributed.parallel_state import get_pp_group
1718
from vllm.logger import init_logger
1819
from vllm.lora.request import LoRARequest
1920
from vllm.model_executor import set_random_seed
@@ -219,20 +220,22 @@ def compile_or_warm_up_model(self) -> None:
219220
# fragmentation issue.
220221
# NOTE: This is called after `capture_model` on purpose to prevent
221222
# memory buffers from being cleared by `torch.cuda.empty_cache`.
222-
try:
223-
max_num_reqs = min(self.scheduler_config.max_num_seqs,
224-
self.scheduler_config.max_num_batched_tokens)
225-
self.model_runner._dummy_sampler_run(
226-
hidden_states=self.model_runner._dummy_run(
227-
num_tokens=max_num_reqs))
228-
except RuntimeError as e:
229-
if 'out of memory' in str(e):
230-
raise RuntimeError(
231-
"CUDA out of memory occurred when warming up sampler. "
232-
"Please try lowering `gpu_memory_utilization` when "
233-
"initializing the engine.") from None
234-
else:
235-
raise e
223+
if get_pp_group().is_last_rank:
224+
try:
225+
max_num_reqs = min(
226+
self.scheduler_config.max_num_seqs,
227+
self.scheduler_config.max_num_batched_tokens)
228+
self.model_runner._dummy_sampler_run(
229+
hidden_states=self.model_runner._dummy_run(
230+
num_tokens=max_num_reqs))
231+
except RuntimeError as e:
232+
if 'out of memory' in str(e):
233+
raise RuntimeError(
234+
"CUDA out of memory occurred when warming up sampler. "
235+
"Please try lowering `gpu_memory_utilization` when "
236+
"initializing the engine.") from None
237+
else:
238+
raise e
236239

237240
# Reset the seed to ensure that the random state is not affected by
238241
# the model initialization and profiling.

Comments (0)