From 7f3666dfbbfad57ce42250d73b07997abaff72e0 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Thu, 22 Jun 2023 15:21:48 +0800
Subject: [PATCH 1/3] [Fix] Better error message when there is OOM during
 cache initialization

---
 vllm/engine/llm_engine.py | 6 ++++++
 vllm/outputs.py           | 1 +
 2 files changed, 7 insertions(+)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3668dd7ee37f..eed594d82c2d 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -127,6 +127,12 @@ def _init_cache(self) -> None:
         # FIXME(woosuk): Change to debug log.
         logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                     f'# CPU blocks: {num_cpu_blocks}')
+
+        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+            raise ValueError("No available memory for the cache blocks, "
+                             "Try increase `gpu_memory_utilization` when. "
+                             "initialize the engine.")
+
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks
 
diff --git a/vllm/outputs.py b/vllm/outputs.py
index ebb5c19df0ad..384ca020985d 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -53,6 +53,7 @@ class RequestOutput:
         prompt: The prompt string of the request.
         prompt_token_ids: The token IDs of the prompt.
         outputs: The output sequences of the request.
+        finished: Whether the whole request is finished.
     """
     def __init__(
         self,

From cdba105bc166162a2d14c2ff31c09ac2b1aa09c3 Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Thu, 22 Jun 2023 15:24:09 +0800
Subject: [PATCH 2/3] fix

---
 vllm/engine/llm_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index eed594d82c2d..a9d53a3a0669 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -130,8 +130,8 @@ def _init_cache(self) -> None:
 
         if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
             raise ValueError("No available memory for the cache blocks, "
-                             "Try increase `gpu_memory_utilization` when. "
-                             "initialize the engine.")
+                             "Try increase `gpu_memory_utilization` when "
+                             "initializing the engine.")
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks

From 5d71b286aded345897a2d81b8df814388325467d Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Thu, 22 Jun 2023 15:29:09 +0800
Subject: [PATCH 3/3] Update llm_engine.py

---
 vllm/engine/llm_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a9d53a3a0669..c4aea06ba12f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -129,8 +129,8 @@ def _init_cache(self) -> None:
                     f'# CPU blocks: {num_cpu_blocks}')
 
         if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
-            raise ValueError("No available memory for the cache blocks, "
-                             "Try increase `gpu_memory_utilization` when "
+            raise ValueError("No available memory for the cache blocks. "
+                             "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks