
Commit 5894759

ywang96 authored and simon-mo committed
[V1] Move OOM check into sampler run (vllm-project#14728)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
1 parent a59af43 commit 5894759

2 files changed: +17 −18 lines changed

vllm/v1/worker/gpu_model_runner.py (12 additions, 3 deletions)

@@ -1288,9 +1288,18 @@ def _dummy_sampler_run(
             allowed_token_ids_mask=None,
             bad_words_token_ids={},
         )
-        sampler_output = self.model.sample(logits=logits,
-                                           sampling_metadata=dummy_metadata)
-
+        try:
+            sampler_output = self.model.sample(
+                logits=logits, sampling_metadata=dummy_metadata)
+        except RuntimeError as e:
+            if 'out of memory' in str(e):
+                raise RuntimeError(
+                    "CUDA out of memory occurred when warming up sampler with "
+                    f"{num_reqs} dummy requests. Please try lowering "
+                    "`max_num_seqs` or `gpu_memory_utilization` when "
+                    "initializing the engine.") from e
+            else:
+                raise e
         return sampler_output
 
     def profile_run(self) -> None:
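For illustration, a minimal, self-contained sketch of the pattern the hunk above introduces: run the dummy sampling step, catch RuntimeError, and re-raise with actionable guidance when the message indicates a CUDA OOM. The function names and the simulated OOM threshold below are hypothetical stand-ins, not vLLM APIs.

# Stand-alone sketch of the OOM-handling pattern above; run_dummy_sample and
# the 1024-request threshold are illustrative stand-ins, not vLLM APIs.
def run_dummy_sample(num_reqs: int) -> str:
    # Stand-in for self.model.sample(...) on a dummy batch; pretend that
    # very large batches exhaust GPU memory.
    if num_reqs > 1024:
        raise RuntimeError("CUDA out of memory. Tried to allocate ...")
    return f"sampled {num_reqs} dummy requests"

def warm_up_sampler(num_reqs: int) -> str:
    try:
        return run_dummy_sample(num_reqs)
    except RuntimeError as e:
        if 'out of memory' in str(e):
            # Re-raise with actionable advice, chaining the original error.
            raise RuntimeError(
                "CUDA out of memory occurred when warming up sampler with "
                f"{num_reqs} dummy requests. Please try lowering "
                "`max_num_seqs` or `gpu_memory_utilization` when "
                "initializing the engine.") from e
        raise

if __name__ == "__main__":
    print(warm_up_sampler(8))    # succeeds
    try:
        warm_up_sampler(4096)    # takes the simulated OOM path
    except RuntimeError as e:
        print(f"caught: {e}")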

vllm/v1/worker/gpu_worker.py (5 additions, 15 deletions)

@@ -221,21 +221,11 @@ def compile_or_warm_up_model(self) -> None:
         # NOTE: This is called after `capture_model` on purpose to prevent
         # memory buffers from being cleared by `torch.cuda.empty_cache`.
         if get_pp_group().is_last_rank:
-            try:
-                max_num_reqs = min(
-                    self.scheduler_config.max_num_seqs,
-                    self.scheduler_config.max_num_batched_tokens)
-                self.model_runner._dummy_sampler_run(
-                    hidden_states=self.model_runner._dummy_run(
-                        num_tokens=max_num_reqs))
-            except RuntimeError as e:
-                if 'out of memory' in str(e):
-                    raise RuntimeError(
-                        "CUDA out of memory occurred when warming up sampler. "
-                        "Please try lowering `gpu_memory_utilization` when "
-                        "initializing the engine.") from None
-                else:
-                    raise e
+            max_num_reqs = min(self.scheduler_config.max_num_seqs,
+                               self.scheduler_config.max_num_batched_tokens)
+            self.model_runner._dummy_sampler_run(
+                hidden_states=self.model_runner._dummy_run(
+                    num_tokens=max_num_reqs))
 
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
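Taken together with the first hunk, the net effect is that the worker's warm-up path no longer inspects exceptions itself; the check sits next to the sample call, where the number of dummy requests is known. A rough sketch of this post-change division of responsibility, using simplified stand-in classes rather than the real GPUWorker / GPUModelRunner:

# Simplified stand-ins illustrating where the OOM check lives after this
# change; class and method names are illustrative, not the vLLM classes.
class Runner:
    def _sample(self, num_reqs: int) -> int:
        # Stand-in for self.model.sample(); pretend huge batches OOM.
        if num_reqs > 1024:
            raise RuntimeError("CUDA out of memory")
        return num_reqs

    def dummy_sampler_run(self, num_reqs: int) -> int:
        # The OOM check now sits next to the sample call, so the error
        # message can mention how many dummy requests were in flight.
        try:
            return self._sample(num_reqs)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                raise RuntimeError(
                    f"CUDA OOM while warming up sampler with {num_reqs} "
                    "dummy requests; lower `max_num_seqs` or "
                    "`gpu_memory_utilization`.") from e
            raise

class Worker:
    def __init__(self, max_num_seqs: int, max_num_batched_tokens: int):
        self.runner = Runner()
        self.max_num_reqs = min(max_num_seqs, max_num_batched_tokens)

    def compile_or_warm_up_model(self) -> None:
        # No try/except here any more: the runner owns the OOM handling.
        self.runner.dummy_sampler_run(self.max_num_reqs)

if __name__ == "__main__":
    Worker(max_num_seqs=256, max_num_batched_tokens=8192).compile_or_warm_up_model()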
