From 5c80f7989e9de647ac5f9bb596e7d7e1080cd04e Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Sat, 29 Jun 2024 08:10:43 -0700
Subject: [PATCH 1/2] [BugFix] Ensure worker model loop is always stopped at
 the right time

The model loop in the worker processes is meant to stop when there are
no more sequences to process, but the condition checked for this wasn't
sufficient, so the workers could incorrectly remain in an indefinite
broadcast loop.
---
 vllm/engine/async_llm_engine.py | 2 +-
 vllm/engine/llm_engine.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7db3bb28c6ee5..e335bc5d1bd90 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -248,7 +248,7 @@ async def step_async(
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not request_outputs:
+        if not self.has_unfinished_requests():
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
             # torch.distributed ops which may otherwise timeout, and unblocks
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index f7e38c0e6b948..5886ebc24253a 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -838,7 +838,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not request_outputs:
+        if not self.has_unfinished_requests():
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
             # torch.distributed ops which may otherwise timeout, and unblocks

From 87c5226b55567b3ca1451c1b75f7c3c09833abf7 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Mon, 1 Jul 2024 09:45:05 -0700
Subject: [PATCH 2/2] Revert AsyncLLMEngine change

---
 vllm/engine/async_llm_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index e335bc5d1bd90..7db3bb28c6ee5 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -248,7 +248,7 @@ async def step_async(
         # Tracing
         self.do_tracing(scheduler_outputs)
 
-        if not self.has_unfinished_requests():
+        if not request_outputs:
             # Stop the execute model loop in parallel workers until there are
             # more requests to process. This avoids waiting indefinitely in
             # torch.distributed ops which may otherwise timeout, and unblocks
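
One way the old check can fail, inferred from the commit message rather
than stated in it: on the engine's final step, the just-finished requests
still produce entries in request_outputs, so `not request_outputs` is
false at exactly the moment the stop signal must be sent; and since no
unfinished requests remain, step() is never called again, leaving the
workers blocked in their broadcast loop. The toy sketch below is plain,
self-contained Python; every name in it is hypothetical except
has_unfinished_requests(), which mirrors the real engine method. It shows
the two conditions diverging on that final step:

    # Toy reproduction of the stop-condition bug; all names are
    # illustrative, not vLLM's actual classes.

    class ToyEngine:
        """Simulates an engine whose one request needs `total_steps` steps."""

        def __init__(self, total_steps: int) -> None:
            self.remaining_steps = total_steps
            self.worker_loop_running = True  # workers blocked in broadcast loop

        def has_unfinished_requests(self) -> bool:
            return self.remaining_steps > 0

        def step(self, use_fixed_condition: bool) -> list:
            self.remaining_steps -= 1
            # Even the final step returns the finished request's output, so
            # request_outputs is non-empty exactly when the stop is needed.
            request_outputs = ["output"]
            if use_fixed_condition:
                should_stop = not self.has_unfinished_requests()
            else:
                should_stop = not request_outputs  # buggy: never true here
            if should_stop:
                # Stands in for telling the workers to exit their loop.
                self.worker_loop_running = False
            return request_outputs

    for fixed in (False, True):
        engine = ToyEngine(total_steps=3)
        while engine.has_unfinished_requests():
            engine.step(use_fixed_condition=fixed)
        # Buggy condition: stepping stops but the workers were never told to.
        print(f"fixed={fixed}: worker loop still running = "
              f"{engine.worker_loop_running}")

Running it prints "worker loop still running = True" for the buggy
condition and "= False" for the fixed one. Note that patch 2 restores the
original condition only in async_llm_engine.py; the fix in llm_engine.py
is kept.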