From 5c80f7989e9de647ac5f9bb596e7d7e1080cd04e Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Sat, 29 Jun 2024 08:10:43 -0700
Subject: [PATCH 1/2] [BugFix] Ensure worker model loop is always stopped at
 the right time

The model loop in the worker processes is meant to stop when there are
no more sequences to process, but the condition checked for this wasn't
sufficient, so the workers could incorrectly remain in an indefinite
broadcast loop.
---
 vllm/engine/async_llm_engine.py | 2 +-
 vllm/engine/llm_engine.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7db3bb28c6ee5..e335bc5d1bd90 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -248,7 +248,7 @@ async def step_async(
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not request_outputs:
+        if not self.has_unfinished_requests():
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
             # torch.distributed ops which may otherwise timeout, and unblocks
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index f7e38c0e6b948..5886ebc24253a 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -838,7 +838,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not request_outputs:
+        if not self.has_unfinished_requests():
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
             # torch.distributed ops which may otherwise timeout, and unblocks

From 87c5226b55567b3ca1451c1b75f7c3c09833abf7 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Mon, 1 Jul 2024 09:45:05 -0700
Subject: [PATCH 2/2] Revert AsyncLLMEngine change

---
 vllm/engine/async_llm_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index e335bc5d1bd90..7db3bb28c6ee5 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -248,7 +248,7 @@ async def step_async(
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not self.has_unfinished_requests():
+        if not request_outputs:
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
             # torch.distributed ops which may otherwise timeout, and unblocks
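
One way the old check can fail, inferred from the commit message rather
than stated in it: on the engine's final step, the just-finished requests
still produce entries in request_outputs, so `not request_outputs` is
false at exactly the moment the stop signal must be sent; and since no
unfinished requests remain, step() is never called again, leaving the
workers blocked in their broadcast loop. The toy sketch below is plain,
self-contained Python; every name in it is hypothetical except
has_unfinished_requests(), which mirrors the real engine method. It shows
the two conditions diverging on that final step:

    # Toy reproduction of the stop-condition bug; all names are
    # illustrative, not vLLM's actual classes.

    class ToyEngine:
        """Simulates an engine whose one request needs `total_steps` steps."""

        def __init__(self, total_steps: int) -> None:
            self.remaining_steps = total_steps
            self.worker_loop_running = True  # workers blocked in broadcast loop

        def has_unfinished_requests(self) -> bool:
            return self.remaining_steps > 0

        def step(self, use_fixed_condition: bool) -> list:
            self.remaining_steps -= 1
            # Even the final step returns the finished request's output, so
            # request_outputs is non-empty exactly when the stop is needed.
            request_outputs = ["output"]
            if use_fixed_condition:
                should_stop = not self.has_unfinished_requests()
            else:
                should_stop = not request_outputs  # buggy: never true here
            if should_stop:
                # Stands in for telling the workers to exit their loop.
                self.worker_loop_running = False
            return request_outputs

    for fixed in (False, True):
        engine = ToyEngine(total_steps=3)
        while engine.has_unfinished_requests():
            engine.step(use_fixed_condition=fixed)
        # Buggy condition: stepping stops but the workers were never told to.
        print(f"fixed={fixed}: worker loop still running = "
              f"{engine.worker_loop_running}")

Running it prints "worker loop still running = True" for the buggy
condition and "= False" for the fixed one. Note that patch 2 restores the
original condition only in async_llm_engine.py; the fix in llm_engine.py
is kept.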