From 7f3666dfbbfad57ce42250d73b07997abaff72e0 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Thu, 22 Jun 2023 15:21:48 +0800
Subject: [PATCH 1/3] [Fix] Better error message when there is OOM during
 cache initialization

---
 vllm/engine/llm_engine.py | 6 ++++++
 vllm/outputs.py           | 1 +
 2 files changed, 7 insertions(+)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3668dd7ee37f..eed594d82c2d 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -127,6 +127,12 @@ def _init_cache(self) -> None:
         # FIXME(woosuk): Change to debug log.
         logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                     f'# CPU blocks: {num_cpu_blocks}')
+
+        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+            raise ValueError("No available memory for the cache blocks, "
+                             "Try increase `gpu_memory_utilization` when. "
+                             "initialize the engine.")
+
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
diff --git a/vllm/outputs.py b/vllm/outputs.py
index ebb5c19df0ad..384ca020985d 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -53,6 +53,7 @@ class RequestOutput:
         prompt: The prompt string of the request.
         prompt_token_ids: The token IDs of the prompt.
         outputs: The output sequences of the request.
+        finished: Whether the whole request is finished.
     """
     def __init__(
         self,

From cdba105bc166162a2d14c2ff31c09ac2b1aa09c3 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Thu, 22 Jun 2023 15:24:09 +0800
Subject: [PATCH 2/3] fix

---
 vllm/engine/llm_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index eed594d82c2d..a9d53a3a0669 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -130,8 +130,8 @@ def _init_cache(self) -> None:
 
         if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
             raise ValueError("No available memory for the cache blocks, "
-                             "Try increase `gpu_memory_utilization` when. "
-                             "initialize the engine.")
+                             "Try increase `gpu_memory_utilization` when "
+                             "initializing the engine.")
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks

From 5d71b286aded345897a2d81b8df814388325467d Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Thu, 22 Jun 2023 15:29:09 +0800
Subject: [PATCH 3/3] Update llm_engine.py

---
 vllm/engine/llm_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a9d53a3a0669..c4aea06ba12f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -129,8 +129,8 @@ def _init_cache(self) -> None:
                     f'# CPU blocks: {num_cpu_blocks}')
 
         if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
-            raise ValueError("No available memory for the cache blocks, "
-                             "Try increase `gpu_memory_utilization` when "
+            raise ValueError("No available memory for the cache blocks. "
+                             "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks