From eb162391eec0f9ab3bae98ff900a836406aee598 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 23:11:03 +0000 Subject: [PATCH 001/130] checkpoint prototype Signed-off-by: rshaw@neuralmagic.com --- vllm/engine/multiprocessing/client.py | 4 -- vllm/engine/protocol.py | 5 -- vllm/entrypoints/launcher.py | 89 ++++++++++++++------------- vllm/v1/engine/async_llm.py | 87 ++++++++++++++++++-------- 4 files changed, 107 insertions(+), 78 deletions(-) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 0a046c71e86e..329bcc3d1ff6 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -402,10 +402,6 @@ async def check_health(self): def is_running(self) -> bool: return not self.errored - @property - def is_stopped(self) -> bool: - return self.errored - @property def errored(self) -> bool: return self._errored_with is not None diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index a066836b9270..b2a5cc17ead6 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,11 +29,6 @@ class EngineClient(ABC): def is_running(self) -> bool: ... - @property - @abstractmethod - def is_stopped(self) -> bool: - ... - @property @abstractmethod def errored(self) -> bool: diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 5dcf50bd1b0a..c928a9f4b8ce 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -7,10 +7,11 @@ from fastapi import FastAPI, Request, Response from vllm import envs -from vllm.engine.async_llm_engine import AsyncEngineDeadError -from vllm.engine.multiprocessing import MQEngineDeadError +# from vllm.engine.async_llm_engine import AsyncEngineDeadError +# from vllm.engine.multiprocessing import MQEngineDeadError from vllm.logger import init_logger from vllm.utils import find_process_using_port +from vllm.v1.engine.async_llm import EngineDeadError, EngineGenerateError logger = init_logger(__name__) @@ -58,46 +59,46 @@ async def dummy_shutdown() -> None: return server.shutdown() +def start_termination(server: uvicorn.Server): + # See discussions here on shutting down a uvicorn server + # https://github.com/encode/uvicorn/discussions/1103 + # In this case we cannot await the server shutdown here because + # this handler must first return to close the connection for + # this request. + logger.fatal("VLLM Engine failed, terminating server.") + server.should_exit = True + + +# NOTE(rob): VLLM V1 AsyncLLM catches exceptions and returns +# only two types: EngineGenerateError and EngineDeadError. +# +# EngineGenerateError is raised by the per request generate() +# method. This error could be request specific (and therefore +# recoverable - e.g. if there is an error in input processing). +# +# EngineDeadError is raised by the background output_handler +# method. This error is global and therefore not recoverable. +# +# We register these @app.exception_handlers to return nice +# responses to the end user if they occur and shut down if needed. +# See https://fastapi.tiangolo.com/tutorial/handling-errors/ +# for more details on how exception handlers work. def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: - """Adds handlers for fatal errors that should crash the server""" - - @app.exception_handler(RuntimeError) - async def runtime_error_handler(request: Request, __): - """On generic runtime error, check to see if the engine has died. 
- It probably has, in which case the server will no longer be able to - handle requests. Trigger a graceful shutdown with a SIGTERM.""" - engine = request.app.state.engine_client - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored - and not engine.is_running): - logger.fatal("AsyncLLMEngine has failed, terminating server " - "process") - # See discussions here on shutting down a uvicorn server - # https://github.com/encode/uvicorn/discussions/1103 - # In this case we cannot await the server shutdown here because - # this handler must first return to close the connection for - # this request. - server.should_exit = True - - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) - - @app.exception_handler(AsyncEngineDeadError) - async def async_engine_dead_handler(_, __): - """Kill the server if the async engine is already dead. It will - not handle any further requests.""" - if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: - logger.fatal("AsyncLLMEngine is already dead, terminating server " - "process") - server.should_exit = True - - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) - - @app.exception_handler(MQEngineDeadError) - async def mq_engine_dead_handler(_, __): - """Kill the server if the mq engine is already dead. It will - not handle any further requests.""" - if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: - logger.fatal("MQLLMEngine is already dead, terminating server " - "process") - server.should_exit = True - - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) + + if envs.VLLM_USE_V1: + + @app.exception_handler(EngineGenerateError) + async def generate_error_handler(request: Request, __): + engine = request.app.state.engine_client + if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored): + # Terminate if recoverable. + start_termination(server) + + return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) + + @app.exception_handler(EngineDeadError) + async def engine_dead_handler(_, __): + if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: + start_termination(server) + + return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ff7a0c28dd91..27db49a11ed4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,7 @@ import asyncio -import os +# import os import signal +from functools import partial from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -18,7 +19,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import kill_process_tree +# from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -27,6 +28,17 @@ logger = init_logger(__name__) +# NOTE(rob): raised when a generate() fails. +class EngineGenerateError(Exception): + pass + + +# NOTE(rob): raised when the engine dies, typically +# by the background output handler loop. Unrecoverable. +class EngineDeadError(Exception): + pass + + class AsyncLLM(EngineClient): def __init__( @@ -42,23 +54,17 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # The child processes will send SIGQUIT when unrecoverable - # errors happen. 
We kill the process tree here so that the - # stack trace is very evident. - # TODO: rather than killing the main process, we should - # figure out how to raise an AsyncEngineDeadError and - # handle at the API server level so we can return a better - # error code to the clients calling VLLM. + # NOTE(rob): EngineCore sends SIGQUIT on unrecoverable errors. def sigquit_handler(signum, frame): logger.fatal( "AsyncLLM got SIGQUIT from worker processes, shutting " "down. See stack trace above for root cause issue.") - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGQUIT, sigquit_handler) + self._propagate_error() - assert start_engine_loop + loop = asyncio.get_running_loop() + loop.add_signal_handler(signal.SIGQUIT, partial(sigquit_handler)) + self._errored = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -243,12 +249,17 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. while True: - # Note: drain queue without await if possible (avoids + # Note(rob): drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: both Detokenizer and EngineCore handle their - # own request cleanup based on finished. + # _run_output_handler() puts EngineDeadError into the queue + # if it encounters an unrecoverable issue in the EngineCore. + if isinstance(out, EngineDeadError): + raise out + + # NOTE(rob): both Detokenizer and EngineCore handle + # their own request cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out @@ -261,8 +272,18 @@ async def generate( # request if we end up here. except asyncio.CancelledError: await self.abort(request_id) + if self.log_requests: + logger.info("Request %s aborted.", request_id) raise + except Exception as e: + if self.log_requests: + logger.info("Request %s failed.", request_id) + + # NOTE(rob): EngineGenerateError is handed by FastAPI + # exception handlers in vllm/entrypoints/launcher.py. + raise EngineGenerateError() from e + def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request queues.""" @@ -292,9 +313,29 @@ async def _run_output_handler(self): # 4) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) + except asyncio.CancelledError: + raise + except Exception as e: - logger.exception("EngineCore output handler hit an error: %s", e) - kill_process_tree(os.getpid()) + self._propagate_error(e) + + def _propagate_error(self, exception: Optional[Exception] = None): + """Propagate to generate() tasks and raise EngineDeadError.""" + + # Set errored state and log if we have + self._errored = True + if exception: + logger.error("AsyncLLM run_output_handler failed", + exc_info=exception) + + # Put EngineDeadError() into + for _, q in self.rid_to_queue.items(): + q.put_nowait(EngineDeadError()) + + raise EngineDeadError( + "AsyncLLM finished unexpectedly. This should never happen! " + "Please open an issue on Github. 
See stack trace above for the " + "actual cause.") from exception async def abort(self, request_id: str) -> None: """Abort RequestId in self, detokenizer, and engine core.""" @@ -356,16 +397,12 @@ async def stop_profile(self) -> None: @property def is_running(self) -> bool: - return True - - @property - def is_stopped(self) -> bool: - return False + return not self.errored @property def errored(self) -> bool: - return False + return self._errored @property def dead_error(self) -> BaseException: - return Exception() # TODO: implement + return EngineDeadError() From 8549fdd6c0f27189e4b52170f3ccaff2fefb8f1a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 23:49:22 +0000 Subject: [PATCH 002/130] Issue currently is with streaming. The HTTP exception handlers do not handle properly Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 48 +++++++++++++++++++------------------ vllm/v1/engine/core.py | 4 ++++ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 27db49a11ed4..0f7fe0452252 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,7 +1,6 @@ import asyncio # import os import signal -from functools import partial from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -54,15 +53,16 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # NOTE(rob): EngineCore sends SIGQUIT on unrecoverable errors. - def sigquit_handler(signum, frame): + # EngineCore sends SIGQUIT on unrecoverable errors. + def sigquit_handler(): logger.fatal( "AsyncLLM got SIGQUIT from worker processes, shutting " "down. See stack trace above for root cause issue.") self._propagate_error() + self._errored = True loop = asyncio.get_running_loop() - loop.add_signal_handler(signal.SIGQUIT, partial(sigquit_handler)) + loop.add_signal_handler(signal.SIGQUIT, sigquit_handler) self._errored = False self.log_requests = log_requests @@ -141,13 +141,12 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if handler := getattr(self, "output_handler", None): + handler.cancel() if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() - @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] @@ -228,6 +227,10 @@ async def generate( returning the RequestOutput back to the caller. """ + if self.errored: + self._propagate_error() + raise EngineDeadError() + try: # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us @@ -249,12 +252,9 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. while True: - # Note(rob): drain queue without await if possible (avoids + # NOTE(rob): drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - - # _run_output_handler() puts EngineDeadError into the queue - # if it encounters an unrecoverable issue in the EngineCore. if isinstance(out, EngineDeadError): raise out @@ -268,20 +268,25 @@ async def generate( yield out # If the request is disconnected by the client, the - # generate() task will be canceled. 
So, we abort the - # request if we end up here. + # generate() task will be canceled so, we abort. except asyncio.CancelledError: await self.abort(request_id) if self.log_requests: logger.info("Request %s aborted.", request_id) raise - except Exception as e: + # EngineCore or output_handler pushed error. Raise so API Server + # can handle and shutdown in vllm/entrypoints/launcher.py. + except EngineDeadError: if self.log_requests: logger.info("Request %s failed.", request_id) + raise - # NOTE(rob): EngineGenerateError is handed by FastAPI - # exception handlers in vllm/entrypoints/launcher.py. + # Error in the generate() task (possibly recoverable). Raise so API + # Server can handle and maybe shutdown vllm/entrypoints/launcher.py. + except Exception as e: + if self.log_requests: + logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e def _process_request_outputs(self, request_outputs: List[RequestOutput]): @@ -298,7 +303,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - try: while True: # 1) Pull EngineCoreOutput from the EngineCore. @@ -314,10 +318,12 @@ async def _run_output_handler(self): await self.engine_core.abort_requests_async(reqs_to_abort) except asyncio.CancelledError: + logger.debug("Output handler interrupted.") raise except Exception as e: self._propagate_error(e) + raise EngineDeadError() from e def _propagate_error(self, exception: Optional[Exception] = None): """Propagate to generate() tasks and raise EngineDeadError.""" @@ -328,15 +334,11 @@ def _propagate_error(self, exception: Optional[Exception] = None): logger.error("AsyncLLM run_output_handler failed", exc_info=exception) - # Put EngineDeadError() into + # Put EngineDeadError() into each generate()'s queue, + # each of which will raise in their own context. for _, q in self.rid_to_queue.items(): q.put_nowait(EngineDeadError()) - raise EngineDeadError( - "AsyncLLM finished unexpectedly. This should never happen! " - "Please open an issue on Github. See stack trace above for the " - "actual cause.") from exception - async def abort(self, request_id: str) -> None: """Abort RequestId in self, detokenizer, and engine core.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 13a50a4f855e..428ab32d4095 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -208,7 +208,11 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM + i = 0 while True: + if i == 10: + raise ValueError("TEST RUN") + i += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: From 77801cddb5f7cba17d0a09845f9e1879350ddb94 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 14:46:09 +0000 Subject: [PATCH 003/130] switch from ValueError -> Exception. 
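Broaden the error handling in the OpenAI serving endpoints from ValueError to
Exception, so that errors surfaced by the V1 AsyncLLM (EngineGenerateError and
EngineDeadError) can also be turned into error responses instead of escaping
the route handlers.

A minimal, self-contained sketch of the pattern this commit applies; the class,
method body, and error below are stand-ins for illustration only, while the
real handlers live in vllm/entrypoints/openai/serving_*.py and use
create_error_response as shown in the diffs that follow:

    import asyncio


    class SketchServing:
        """Not the real vLLM serving class; a sketch of the handler shape."""

        def create_error_response(self, message: str) -> dict:
            # Stand-in for OpenAIServing.create_error_response.
            return {"object": "error", "message": message}

        async def create_completion(self, prompt: str) -> dict:
            try:
                if not prompt:
                    # Stand-in for input-processing / engine failures, which
                    # are no longer guaranteed to be ValueError in V1.
                    raise RuntimeError("empty prompt")
                return {"object": "text_completion", "text": prompt.upper()}
            except Exception as e:  # was: except ValueError as e
                return self.create_error_response(str(e))


    if __name__ == "__main__":
        # Prints an error response rather than raising.
        print(asyncio.run(SketchServing().create_completion("")))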
--- vllm/entrypoints/openai/serving_chat.py | 6 +++--- vllm/entrypoints/openai/serving_completion.py | 8 ++++---- vllm/entrypoints/openai/serving_embedding.py | 6 +++--- vllm/entrypoints/openai/serving_pooling.py | 6 +++--- vllm/entrypoints/openai/serving_score.py | 6 +++--- vllm/entrypoints/openai/serving_tokenization.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9ba5eeb7709c..7e3f8b56fd54 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -171,7 +171,7 @@ async def create_chat_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -228,7 +228,7 @@ async def create_chat_completion( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -245,7 +245,7 @@ async def create_chat_completion( return await self.chat_completion_full_generator( request, result_generator, request_id, model_name, conversation, tokenizer, request_metadata) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 17197dce8da2..53ae1b134590 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -106,7 +106,7 @@ async def create_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -158,7 +158,7 @@ async def create_completion( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -215,7 +215,7 @@ async def create_completion( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -371,7 +371,7 @@ async def completion_stream_generator( # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e7116a3d95d1..fe8ba5eb95b9 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -136,7 +136,7 @@ async def create_embedding( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -167,7 +167,7 @@ async def create_embedding( ) generators.append(generator) - except ValueError as e: + except Exception as e: # 
TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -196,7 +196,7 @@ async def create_embedding( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 5830322071e5..3441071344f4 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -132,7 +132,7 @@ async def create_pooling( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -163,7 +163,7 @@ async def create_pooling( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -192,7 +192,7 @@ async def create_pooling( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 5d3e7139d7a1..9b5aa13bda84 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,7 +101,7 @@ async def create_score( if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -155,7 +155,7 @@ async def create_score( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -184,7 +184,7 @@ async def create_score( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index b67ecfb01316..a3dc42ff8f02 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -86,7 +86,7 @@ async def create_tokenize( request.prompt, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) From 8eca8646ec74728d8b133a87d4b32cf78bc37ac6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 15:09:06 +0000 Subject: [PATCH 004/130] updated --- vllm/v1/engine/async_llm.py | 25 ++++++++++++++++++++----- vllm/v1/engine/core_client.py | 30 ++++++++++++++++-------------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5e76c6e6043b..83ed99a9fd64 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ import asyncio +import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union 
from vllm.config import ModelConfig, VllmConfig @@ -16,7 +17,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -# from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -58,6 +58,19 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config + # EngineCore and Worker processes send SIGUSR1 when + # unrecoverable errors occur. Start the shutdown + # process if this occurs. + def sigusr1_handler(): + logger.fatal( + "AsyncLLM got fatal signal from worker process, " + "shutting down. See stack trace for root cause.") + self._propagate_error() + self._errored = True + + asyncio.get_running_loop().add_signal_handler( + signal.SIGUSR1, sigusr1_handler) + # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -244,14 +257,14 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. while True: - # NOTE(rob): drain queue without await if possible (avoids + # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() if isinstance(out, EngineDeadError): raise out - # NOTE(rob): both Detokenizer and EngineCore handle - # their own request cleanup based on finished. + # Note: both Detokenizer and EngineCore handle their + # own request cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out @@ -260,7 +273,8 @@ async def generate( yield out # If the request is disconnected by the client, the - # generate() task will be canceled so, we abort. + # generate() task will be canceled. So, we abort the + # request if we end up here. except asyncio.CancelledError: await self.abort(request_id) if self.log_requests: @@ -295,6 +309,7 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + try: while True: # 1) Pull EngineCoreOutput from the EngineCore. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a4a45ae05ff9..5e907a3c5ec7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,4 +1,4 @@ -import os +import asyncio import signal import weakref from abc import ABC, abstractmethod @@ -135,19 +135,19 @@ def __init__( executor_class: Type[Executor], log_stats: bool = False, ): - # The child processes will send SIGUSR1 when unrecoverable - # errors happen. We kill the process tree here so that the - # stack trace is very evident. - # TODO(rob): rather than killing the main process, we should - # figure out how to raise an AsyncEngineDeadError and - # handle at the API server level so we can return a better - # error code to the clients calling VLLM. - def sigusr1_handler(signum, frame): - logger.fatal("Got fatal signal from worker processes, shutting " - "down. See stack trace above for root cause issue.") - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGUSR1, sigusr1_handler) + # # The child processes will send SIGUSR1 when unrecoverable + # # errors happen. 
We kill the process tree here so that the + # # stack trace is very evident. + # # TODO(rob): rather than killing the main process, we should + # # figure out how to raise an AsyncEngineDeadError and + # # handle at the API server level so we can return a better + # # error code to the clients calling VLLM. + # def sigusr1_handler(signum, frame): + # logger.fatal("Got fatal signal from worker processes, shutting " + # "down. See stack trace above for root cause issue.") + # kill_process_tree(os.getpid()) + + # signal.signal(signal.SIGUSR1, sigusr1_handler) # Serialization setup. self.encoder = PickleEncoder() @@ -198,6 +198,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -236,6 +237,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + super().__init__( asyncio_mode=True, vllm_config=vllm_config, From b8c77b37f4e40355819ad2cb6cfb310e2d1a7c61 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 15:09:56 +0000 Subject: [PATCH 005/130] stash --- vllm/v1/engine/async_llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 83ed99a9fd64..70ab5c2f3f77 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -30,13 +30,11 @@ class EngineGenerateError(Exception): pass - # NOTE(rob): raised when the engine dies, typically # by the background output handler loop. Unrecoverable. class EngineDeadError(Exception): pass - class AsyncLLM(EngineClient): def __init__( From ce9b8ef26d62db2092e2ac3fb18143d52c4f8e62 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 15:11:43 +0000 Subject: [PATCH 006/130] stash --- vllm/entrypoints/openai/serving_chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e3f8b56fd54..a20bf1efa08a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -591,7 +591,7 @@ async def chat_completion_stream_generator( completion_tokens=num_completion_tokens, total_tokens=num_prompt_tokens + num_completion_tokens) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") data = self.create_streaming_error_response(str(e)) @@ -618,7 +618,7 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) From 3a760a7598df8df13acbcbafb3213f3e69071f28 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:29:38 +0000 Subject: [PATCH 007/130] add watchdog --- vllm/entrypoints/launcher.py | 110 ++++++++++++++++---------- vllm/entrypoints/openai/api_server.py | 2 +- vllm/v1/engine/async_llm.py | 13 +-- vllm/v1/engine/core_client.py | 21 +---- 4 files changed, 79 insertions(+), 67 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index c928a9f4b8ce..bbb2271d7a46 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -7,8 +7,9 @@ from fastapi import FastAPI, Request, Response from vllm import envs -# from 
vllm.engine.async_llm_engine import AsyncEngineDeadError -# from vllm.engine.multiprocessing import MQEngineDeadError +from vllm.engine.async_llm_engine import AsyncEngineDeadError +from vllm.engine.multiprocessing import MQEngineDeadError +from vllm.engine.protocol import EngineClient from vllm.logger import init_logger from vllm.utils import find_process_using_port from vllm.v1.engine.async_llm import EngineDeadError, EngineGenerateError @@ -33,11 +34,14 @@ async def serve_http(app: FastAPI, **uvicorn_kwargs: Any): loop = asyncio.get_running_loop() + watchdog_task = loop.create_task( + watchdog_loop(server, app.state.engine_client)) server_task = loop.create_task(server.serve()) def signal_handler() -> None: # prevents the uvicorn signal handler to exit early server_task.cancel() + watchdog_task.cancel() async def dummy_shutdown() -> None: pass @@ -57,48 +61,72 @@ async def dummy_shutdown() -> None: port, process, " ".join(process.cmdline())) logger.info("Shutting down FastAPI HTTP server.") return server.shutdown() + finally: + watchdog_task.cancel() -def start_termination(server: uvicorn.Server): - # See discussions here on shutting down a uvicorn server - # https://github.com/encode/uvicorn/discussions/1103 - # In this case we cannot await the server shutdown here because - # this handler must first return to close the connection for - # this request. - logger.fatal("VLLM Engine failed, terminating server.") - server.should_exit = True - - -# NOTE(rob): VLLM V1 AsyncLLM catches exceptions and returns -# only two types: EngineGenerateError and EngineDeadError. -# -# EngineGenerateError is raised by the per request generate() -# method. This error could be request specific (and therefore -# recoverable - e.g. if there is an error in input processing). -# -# EngineDeadError is raised by the background output_handler -# method. This error is global and therefore not recoverable. -# -# We register these @app.exception_handlers to return nice -# responses to the end user if they occur and shut down if needed. -# See https://fastapi.tiangolo.com/tutorial/handling-errors/ -# for more details on how exception handlers work. -def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: - - if envs.VLLM_USE_V1: +async def watchdog_loop(server: uvicorn.Server, engine: EngineClient): + # Background task that runs in the background, checking + # for error state in the engine. This is needed for a + # clean shutdown since we cannot raise an Exception in + # a StreamingResponse generator() meaning we cannot use + # the exception handlers below. + VLLM_WATCHDOG_TIME_S = 3.0 + while True: + await asyncio.sleep(VLLM_WATCHDOG_TIME_S) + terminate_if_errored(server, engine) - @app.exception_handler(EngineGenerateError) - async def generate_error_handler(request: Request, __): - engine = request.app.state.engine_client - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored): - # Terminate if recoverable. - start_termination(server) - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) +def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): + # See discussions here on shutting down a uvicorn server + # https://github.com/encode/uvicorn/discussions/1103 + # In this case we cannot await the server shutdown here + # because handler must first return to close the connection + # for this request. 
+ engine_errored = engine.errored and not engine.is_running + is_already_exiting = server.should_exit + if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored + and not is_already_exiting): + # Avoid spamming the logs by only sending once. + logger.fatal("Engine failed, terminating server.") + server.should_exit = True - @app.exception_handler(EngineDeadError) - async def engine_dead_handler(_, __): - if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: - start_termination(server) - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) +def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: + """ + VLLM V1 AsyncLLM catches exceptions and returns + only two types: EngineGenerateError and EngineDeadError. + + EngineGenerateError is raised by the per request generate() + method. This error could be request specific (and therefore + recoverable - e.g. if there is an error in input processing). + + EngineDeadError is raised by the background output_handler + method. This error is global and therefore not recoverable. + + We register these @app.exception_handlers to return nice + responses to the end user if they occur and shut down if needed. + See https://fastapi.tiangolo.com/tutorial/handling-errors/ + for more details on how exception handlers work. + + NOTE(rob): if an exception is encountered in a StreamingResponse + generator, the exception is not raised, since we already sent + a 200 status. Rather, we send an error message as the next chunk. + Since the exception is not raised, this means that the server + will not automatically shut down. + """ + + # NOTE(rob): RuntimeError, AsyncEngineDeadError, + # MQEngineDeadError are all V0 errors. + @app.exception_handler(RuntimeError) + @app.exception_handler(AsyncEngineDeadError) + @app.exception_handler(MQEngineDeadError) + @app.exception_handler(EngineDeadError) + @app.exception_handler(EngineGenerateError) + async def runtime_exception_handler(request: Request, __): + terminate_if_errored( + server=server, + engine=request.app.state.engine_client, + ) + + return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e942b475535a..ea3a9cd08837 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -90,8 +90,8 @@ @asynccontextmanager async def lifespan(app: FastAPI): try: + engine_client: EngineClient = app.state.engine_client if app.state.log_stats: - engine_client: EngineClient = app.state.engine_client async def _force_log(): while True: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 70ab5c2f3f77..7112e88410a4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -30,11 +30,13 @@ class EngineGenerateError(Exception): pass + # NOTE(rob): raised when the engine dies, typically # by the background output handler loop. Unrecoverable. class EngineDeadError(Exception): pass + class AsyncLLM(EngineClient): def __init__( @@ -56,18 +58,17 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # EngineCore and Worker processes send SIGUSR1 when + # EngineCore and Worker processes send SIGUSR1 when # unrecoverable errors occur. Start the shutdown # process if this occurs. def sigusr1_handler(): - logger.fatal( - "AsyncLLM got fatal signal from worker process, " - "shutting down. 
See stack trace for root cause.") + logger.fatal("AsyncLLM got fatal signal from worker process, " + "shutting down. See stack trace for root cause.") self._propagate_error() self._errored = True - asyncio.get_running_loop().add_signal_handler( - signal.SIGUSR1, sigusr1_handler) + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler) # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5e907a3c5ec7..b665b1b7407b 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,3 @@ -import asyncio -import signal import weakref from abc import ABC, abstractmethod from typing import List, Type @@ -10,8 +8,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, - make_zmq_socket) +from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) @@ -135,20 +132,6 @@ def __init__( executor_class: Type[Executor], log_stats: bool = False, ): - # # The child processes will send SIGUSR1 when unrecoverable - # # errors happen. We kill the process tree here so that the - # # stack trace is very evident. - # # TODO(rob): rather than killing the main process, we should - # # figure out how to raise an AsyncEngineDeadError and - # # handle at the API server level so we can return a better - # # error code to the clients calling VLLM. - # def sigusr1_handler(signum, frame): - # logger.fatal("Got fatal signal from worker processes, shutting " - # "down. See stack trace above for root cause issue.") - # kill_process_tree(os.getpid()) - - # signal.signal(signal.SIGUSR1, sigusr1_handler) - # Serialization setup. self.encoder = PickleEncoder() self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) @@ -198,7 +181,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): - + super().__init__( asyncio_mode=False, vllm_config=vllm_config, From 3024da070a4aebdedf1c606843bb388541e03fcc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:41:22 +0000 Subject: [PATCH 008/130] updated --- vllm/entrypoints/launcher.py | 7 +++---- vllm/entrypoints/openai/api_server.py | 2 +- vllm/v1/engine/async_llm.py | 7 +++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index bbb2271d7a46..8512fe135c66 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -109,15 +109,14 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: See https://fastapi.tiangolo.com/tutorial/handling-errors/ for more details on how exception handlers work. - NOTE(rob): if an exception is encountered in a StreamingResponse + If an exception is encountered in a StreamingResponse generator, the exception is not raised, since we already sent a 200 status. Rather, we send an error message as the next chunk. Since the exception is not raised, this means that the server - will not automatically shut down. + will not automatically shut down. Instead, we use the watchdog + background task for check for errored state. """ - # NOTE(rob): RuntimeError, AsyncEngineDeadError, - # MQEngineDeadError are all V0 errors. 
@app.exception_handler(RuntimeError) @app.exception_handler(AsyncEngineDeadError) @app.exception_handler(MQEngineDeadError) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ea3a9cd08837..e942b475535a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -90,8 +90,8 @@ @asynccontextmanager async def lifespan(app: FastAPI): try: - engine_client: EngineClient = app.state.engine_client if app.state.log_stats: + engine_client: EngineClient = app.state.engine_client async def _force_log(): while True: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7112e88410a4..2a831ae751c5 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,13 +26,12 @@ logger = init_logger(__name__) -# NOTE(rob): raised when a generate() fails. +# Raised when a generate() fails. Possibly Recoverable. class EngineGenerateError(Exception): pass - -# NOTE(rob): raised when the engine dies, typically -# by the background output handler loop. Unrecoverable. +# Raised when the engine dies, typically by the +# background output handler loop. Unrecoverable. class EngineDeadError(Exception): pass From 5af8189792f54caf6a7171d7c3e421692494f9cb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:42:12 +0000 Subject: [PATCH 009/130] revert spurious changes --- vllm/v1/engine/async_llm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2a831ae751c5..5d016161ab93 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -140,12 +140,13 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" - if handler := getattr(self, "output_handler", None): - handler.cancel() if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() + if handler := getattr(self, "output_handler", None): + handler.cancel() + @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] From 3cb21bbb2b87a7d1c8a01f950d9c61f82deb2579 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:50:32 +0000 Subject: [PATCH 010/130] updated --- vllm/v1/engine/async_llm.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5d016161ab93..dc00d6f7ee35 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -30,6 +30,7 @@ class EngineGenerateError(Exception): pass + # Raised when the engine dies, typically by the # background output handler loop. Unrecoverable. class EngineDeadError(Exception): @@ -63,8 +64,8 @@ def __init__( def sigusr1_handler(): logger.fatal("AsyncLLM got fatal signal from worker process, " "shutting down. See stack trace for root cause.") - self._propagate_error() self._errored = True + self._propagate_error() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, sigusr1_handler) @@ -230,9 +231,7 @@ async def generate( The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. 
""" - if self.errored: - self._propagate_error() raise EngineDeadError() try: @@ -328,17 +327,13 @@ async def _run_output_handler(self): raise except Exception as e: - self._propagate_error(e) + logger.error("run_output_handler failed", e) + self._errored = True + self._propagate_error() raise EngineDeadError() from e - def _propagate_error(self, exception: Optional[Exception] = None): - """Propagate to generate() tasks and raise EngineDeadError.""" - - # Set errored state and log if we have - self._errored = True - if exception: - logger.error("AsyncLLM run_output_handler failed", - exc_info=exception) + def _propagate_error(self): + """Propagate to all generate() tasks.""" # Put EngineDeadError() into each generate()'s queue, # each of which will raise in their own context. From 7c973088503eaf528bb2509398d5a3f275b86f58 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:56:30 +0000 Subject: [PATCH 011/130] updated --- vllm/v1/engine/async_llm.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dc00d6f7ee35..9a0e5ec3f91a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -64,8 +64,7 @@ def __init__( def sigusr1_handler(): logger.fatal("AsyncLLM got fatal signal from worker process, " "shutting down. See stack trace for root cause.") - self._errored = True - self._propagate_error() + self._set_errored_and_propagate() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, sigusr1_handler) @@ -327,16 +326,16 @@ async def _run_output_handler(self): raise except Exception as e: - logger.error("run_output_handler failed", e) - self._errored = True - self._propagate_error() + logger.error("AsyncLLM._run_output_handler failed", e) + self._set_errored_and_propagate() raise EngineDeadError() from e - def _propagate_error(self): + def _set_errored_and_propagate(self): """Propagate to all generate() tasks.""" + self._errored = True - # Put EngineDeadError() into each generate()'s queue, - # each of which will raise in their own context. + # Put EngineDeadError() into each generate() task's queue, + # each of which will raise it in their own context. 
for _, q in self.rid_to_queue.items(): q.put_nowait(EngineDeadError()) From ea6824ae35fcee912f4abe42f246117c1bbe3f24 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:57:08 +0000 Subject: [PATCH 012/130] updated --- vllm/v1/engine/core_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b665b1b7407b..8412de226f4c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -181,7 +181,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): - super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -215,7 +214,6 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], From b278065ccf9d86bf35d3b0f2c7fde43a2d681af0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 17:00:01 +0000 Subject: [PATCH 013/130] remove cruft --- vllm/v1/engine/core_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 8412de226f4c..62c284577007 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -218,7 +218,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): - super().__init__( asyncio_mode=True, vllm_config=vllm_config, From c004bd47823a44d79b592f4da2ecbb0f6334f220 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 17:06:40 +0000 Subject: [PATCH 014/130] cruft --- vllm/v1/engine/core_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 62c284577007..bf3a6f60c0c5 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -214,6 +214,7 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], From 2556bc44823a95b2181de4ff152a41ea7d0bc9b2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 19:46:36 +0000 Subject: [PATCH 015/130] stash --- vllm/distributed/parallel_state.py | 5 +++++ vllm/entrypoints/launcher.py | 6 +----- vllm/model_executor/models/llama.py | 4 ++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 8 +++----- vllm/v1/executor/multiproc_executor.py | 16 +++++++++++++--- 6 files changed, 27 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a0d4235460f3..5d9549b1f74e 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -827,6 +827,7 @@ def recv(self, def destroy(self): if self.device_group is not None: + print(f"{self.device_group}") torch.distributed.destroy_process_group(self.device_group) self.device_group = None if self.cpu_group is not None: @@ -1160,13 +1161,16 @@ def get_tensor_model_parallel_rank(): def destroy_model_parallel(): """Set the groups to none and destroy them.""" + global _TP if _TP: + print("calling TP.destroy()") _TP.destroy() _TP = None global _PP if _PP: + print("calling PP.destroy()") _PP.destroy() _PP = None @@ -1174,6 +1178,7 @@ def destroy_model_parallel(): def destroy_distributed_environment(): global _WORLD if _WORLD: + 
print("calling WORLD.destroy()") _WORLD.destroy() _WORLD = None if torch.distributed.is_initialized(): diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 8512fe135c66..621012a800a4 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -84,11 +84,7 @@ def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): # because handler must first return to close the connection # for this request. engine_errored = engine.errored and not engine.is_running - is_already_exiting = server.should_exit - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored - and not is_already_exiting): - # Avoid spamming the logs by only sending once. - logger.fatal("Engine failed, terminating server.") + if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored): server.should_exit = True diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8623da99574b..704dd6aae60a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -509,6 +509,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + self.i = 0 config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -566,6 +567,9 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: + # if self.i == 100 and get_tensor_model_parallel_rank() == 0: + # raise RuntimeError("ERROR IN LLAMA!") + # self.i += 1 model_output = self.model(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, inputs_embeds) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9a0e5ec3f91a..a76918a8256f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -326,7 +326,7 @@ async def _run_output_handler(self): raise except Exception as e: - logger.error("AsyncLLM._run_output_handler failed", e) + logger.fatal("AsyncLLM._run_output_handler failed") self._set_errored_and_propagate() raise EngineDeadError() from e diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c3244c96e33a..4ac7bc1041fb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,6 +198,8 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() logger.error("EngineCore hit an exception: %s", traceback) + engine_core.shutdown() + engine_core = None parent_process.send_signal(signal.SIGUSR1) finally: @@ -207,12 +209,8 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until process is sent a SIGINT or SIGTERM - i = 0 + # Loop until process is sent a SIGINT or SIGTERM. while True: - if i == 10: - raise ValueError("TEST RUN") - i += 1 # 1) Poll the input queue until there is work to do. 
if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 41e6abbd6795..952705401934 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -193,7 +193,8 @@ def wait_for_termination(procs, timeout): active_procs = [w.proc for w in self.workers if w.proc.is_alive()] for p in active_procs: p.terminate() - if not wait_for_termination(active_procs, 4): + if not wait_for_termination(active_procs, 100): + # Send SIGKILL if still running active_procs = [p for p in active_procs if p.is_alive()] for p in active_procs: @@ -210,7 +211,7 @@ def _cleanup_sockets(self): def shutdown(self): """Properly shut down the executor and its workers""" - if getattr(self, 'shutting_down', False): + if not getattr(self, 'shutting_down', False): self.shutting_down = True for w in self.workers: w.worker_response_mq = None @@ -313,8 +314,11 @@ def make_worker_process( def shutdown(self): self.rpc_broadcast_mq = None self.worker_response_mq = None + print(f"destroy_model_parallel PID: {os.getpid()}") destroy_model_parallel() + print(f"destroy_distributed_environment PID: {os.getpid()}") destroy_distributed_environment() + print(f"done with shutdown PID: {os.getpid()}") @staticmethod def worker_main(*args, **kwargs): @@ -348,7 +352,7 @@ def signal_handler(signum, frame): worker.worker_busy_loop() except SystemExit: - logger.debug("Worker interrupted.") + logger.info("Worker interrupted.") except Exception: # worker_busy_loop sends exceptions exceptons to Executor @@ -358,10 +362,12 @@ def signal_handler(signum, frame): raise finally: + print(f"IN WORKER FINALLY. RANK: {kwargs["rank"]} PID: {os.getpid()}") # Clean up once worker exits busy loop if worker is not None: worker.shutdown() worker = None + print(f"DONE W WORKER FINALLY. 
RANK: {kwargs["rank"]} PID: {os.getpid()}") @staticmethod def wait_for_startup( @@ -390,10 +396,14 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" + i = 0 while True: method, args, kwargs = self.rpc_broadcast_mq.dequeue() try: + if i == 10 and self.rank == 0: + raise ValueError + i+=1 output = getattr(self.worker, method)(*args, **kwargs) except Exception as e: self.worker_response_mq.enqueue( From db0b9e673fbf3311885038d92a815cd45fd8f6cb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:13:27 +0000 Subject: [PATCH 016/130] fix llama --- vllm/distributed/parallel_state.py | 5 ----- vllm/model_executor/models/llama.py | 4 ---- vllm/v1/executor/multiproc_executor.py | 31 +++++++++++++------------- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5d9549b1f74e..a0d4235460f3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -827,7 +827,6 @@ def recv(self, def destroy(self): if self.device_group is not None: - print(f"{self.device_group}") torch.distributed.destroy_process_group(self.device_group) self.device_group = None if self.cpu_group is not None: @@ -1161,16 +1160,13 @@ def get_tensor_model_parallel_rank(): def destroy_model_parallel(): """Set the groups to none and destroy them.""" - global _TP if _TP: - print("calling TP.destroy()") _TP.destroy() _TP = None global _PP if _PP: - print("calling PP.destroy()") _PP.destroy() _PP = None @@ -1178,7 +1174,6 @@ def destroy_model_parallel(): def destroy_distributed_environment(): global _WORLD if _WORLD: - print("calling WORLD.destroy()") _WORLD.destroy() _WORLD = None if torch.distributed.is_initialized(): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 704dd6aae60a..8623da99574b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -509,7 +509,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.i = 0 config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -567,9 +566,6 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - # if self.i == 100 and get_tensor_model_parallel_rank() == 0: - # raise RuntimeError("ERROR IN LLAMA!") - # self.i += 1 model_output = self.model(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, inputs_embeds) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 952705401934..a7f702ab718d 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -42,9 +42,6 @@ def __init__(self, vllm_config: VllmConfig) -> None: # The child processes will send SIGUSR1 when unrecoverable # errors happen. def sigusr1_handler(signum, frame): - logger.fatal( - "MulitprocExecutor got fatal signal from worker processes, " - "shutting down. See stack trace above for root cause issue.") # Propagate error up to parent process. 
parent_process = psutil.Process().parent() parent_process.send_signal(signal.SIGUSR1) @@ -193,7 +190,7 @@ def wait_for_termination(procs, timeout): active_procs = [w.proc for w in self.workers if w.proc.is_alive()] for p in active_procs: p.terminate() - if not wait_for_termination(active_procs, 100): + if not wait_for_termination(active_procs, 4): # Send SIGKILL if still running active_procs = [p for p in active_procs if p.is_alive()] @@ -314,11 +311,8 @@ def make_worker_process( def shutdown(self): self.rpc_broadcast_mq = None self.worker_response_mq = None - print(f"destroy_model_parallel PID: {os.getpid()}") destroy_model_parallel() - print(f"destroy_distributed_environment PID: {os.getpid()}") destroy_distributed_environment() - print(f"done with shutdown PID: {os.getpid()}") @staticmethod def worker_main(*args, **kwargs): @@ -352,22 +346,27 @@ def signal_handler(signum, frame): worker.worker_busy_loop() except SystemExit: - logger.info("Worker interrupted.") + logger.debug("Worker interrupted.") + + except Exception as e: + # Log rather than raise so the stack trace is in order. + logger.exception("WorkerProc got an Exception:", exc_info=e) + + # The parent will send a SIGTERM to all worker processes + # after we send SIGUSR. Set this value so we don't re-throw + # SystemExit(), to avoid zmq Exceptions during shyt + shutdown_requested = True - except Exception: # worker_busy_loop sends exceptions exceptons to Executor # for shutdown, but if there is an error in startup or an # error with IPC itself, we need to alert the parent. psutil.Process().parent().send_signal(signal.SIGUSR1) - raise finally: - print(f"IN WORKER FINALLY. RANK: {kwargs["rank"]} PID: {os.getpid()}") # Clean up once worker exits busy loop if worker is not None: worker.shutdown() worker = None - print(f"DONE W WORKER FINALLY. RANK: {kwargs["rank"]} PID: {os.getpid()}") @staticmethod def wait_for_startup( @@ -401,15 +400,15 @@ def worker_busy_loop(self): method, args, kwargs = self.rpc_broadcast_mq.dequeue() try: - if i == 10 and self.rank == 0: - raise ValueError - i+=1 output = getattr(self.worker, method)(*args, **kwargs) except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue - + + if i == 10 and self.rank == 0: + raise ValueError + i+=1 self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.SUCCESS, output)) From f72258961fac00cf32a71a4eae008d26c65d92b2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:15:41 +0000 Subject: [PATCH 017/130] updated --- vllm/v1/engine/async_llm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a76918a8256f..5f7c14ae7b59 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -58,9 +58,8 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # EngineCore and Worker processes send SIGUSR1 when - # unrecoverable errors occur. Start the shutdown - # process if this occurs. + # Background processes send SIGUSR1 when unrecoverable + # errors occur. Start the shutdown process if this happens. def sigusr1_handler(): logger.fatal("AsyncLLM got fatal signal from worker process, " "shutting down. 
See stack trace for root cause.") From de75cc44a2a4162a2ef398ad0f729d139e78c96c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:16:35 +0000 Subject: [PATCH 018/130] cruft --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 4ac7bc1041fb..200c8184b0ba 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -209,7 +209,7 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until process is sent a SIGINT or SIGTERM. + # Loop until process is sent a SIGINT or SIGTERM while True: # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): From ba5ca87ccd1e4a2138f8de63d46c68648123be76 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:17:41 +0000 Subject: [PATCH 019/130] cruft --- vllm/v1/executor/multiproc_executor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index a7f702ab718d..0c7b0d792f58 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -191,7 +191,6 @@ def wait_for_termination(procs, timeout): for p in active_procs: p.terminate() if not wait_for_termination(active_procs, 4): - # Send SIGKILL if still running active_procs = [p for p in active_procs if p.is_alive()] for p in active_procs: From 4f6b68a3c03a28177770355cd261d7ca3aac5030 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:29:58 +0000 Subject: [PATCH 020/130] updated --- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/llm_engine.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 200c8184b0ba..5f5835f8cb6f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -221,7 +221,7 @@ def run_busy_loop(self): except queue.Empty: self._log_stats() logger.debug("EngineCore busy loop waiting.") - except BaseException: + except Exception: raise # 2) Handle any new client requests (Abort or Add). diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 0bd9b52c9be8..04a899e614ec 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,5 +1,5 @@ +import signal from typing import Dict, List, Mapping, Optional, Type, Union - from typing_extensions import TypeVar from vllm.config import VllmConfig @@ -44,6 +44,16 @@ def __init__( ) -> None: self.model_config = vllm_config.model_config + # Background processes send SIGUSR1 when unrecoverable + # errors occur. Start the shutdown process if this happens. + def sigusr1_handler(): + logger.fatal("LLMEngine got fatal signal from worker process, " + "shutting down. See stack trace for root cause.") + self._set_errored_and_propagate() + + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler) + # Tokenizer (+ ensure liveness if running in another process). 
self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -201,3 +211,9 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group + + def shutdown(self): + """Shutdown, cleaning up the background proc and IPC.""" + + if engine_core := getattr(self, "engine_core", None): + engine_core.shutdown() From 949d4253a7997596411336204ef1aefa3fc77fe4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:38:10 +0000 Subject: [PATCH 021/130] updated --- vllm/v1/engine/llm_engine.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 04a899e614ec..2693be7741d6 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -46,13 +46,15 @@ def __init__( # Background processes send SIGUSR1 when unrecoverable # errors occur. Start the shutdown process if this happens. + # NOTE: signal_handlers must be created and run in the main + # python thread, a workaround for this would be using polling + # rather than signal handling to detect a shutdown. Investigate. def sigusr1_handler(): logger.fatal("LLMEngine got fatal signal from worker process, " "shutting down. See stack trace for root cause.") - self._set_errored_and_propagate() + self.shutdown() - asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, - sigusr1_handler) + signal().add_signal_handler(signal.SIGUSR1, sigusr1_handler) # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( From f67398bd3f33ff2280cccf2d8afa47d22ee031e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:39:09 +0000 Subject: [PATCH 022/130] updated --- vllm/v1/engine/llm_engine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 2693be7741d6..98ddc6c77b80 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -49,12 +49,12 @@ def __init__( # NOTE: signal_handlers must be created and run in the main # python thread, a workaround for this would be using polling # rather than signal handling to detect a shutdown. Investigate. - def sigusr1_handler(): - logger.fatal("LLMEngine got fatal signal from worker process, " - "shutting down. See stack trace for root cause.") + def sigusr1_handler(signum, frame): + logger.fatal("LLMEngine go fatal signal from worker, shutting " + "down. See stack trace above for root cause issue.") self.shutdown() - signal().add_signal_handler(signal.SIGUSR1, sigusr1_handler) + signal.signal(signal.SIGUSR1, sigusr1_handler) # Tokenizer (+ ensure liveness if running in another process). 
self.tokenizer = init_tokenizer_from_configs( From b3d29946038315c297f470c32ffef88eed8eb75c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:16:28 +0000 Subject: [PATCH 023/130] updated --- vllm/v1/engine/async_llm.py | 12 +----------- vllm/v1/engine/core.py | 3 +-- vllm/v1/engine/core_client.py | 19 ++++++++++++++++++- vllm/v1/engine/llm_engine.py | 21 +-------------------- vllm/v1/executor/multiproc_executor.py | 19 ++++++++++++------- 5 files changed, 33 insertions(+), 41 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5f7c14ae7b59..eff6aa73736a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -19,6 +19,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor from vllm.v1.executor.ray_utils import initialize_ray_cluster @@ -26,17 +27,6 @@ logger = init_logger(__name__) -# Raised when a generate() fails. Possibly Recoverable. -class EngineGenerateError(Exception): - pass - - -# Raised when the engine dies, typically by the -# background output handler loop. Unrecoverable. -class EngineDeadError(Exception): - pass - - class AsyncLLM(EngineClient): def __init__( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5f5835f8cb6f..5761210b6a1a 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,13 +198,12 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() logger.error("EngineCore hit an exception: %s", traceback) - engine_core.shutdown() - engine_core = None parent_process.send_signal(signal.SIGUSR1) finally: if engine_core is not None: engine_core.shutdown() + engine_core = None def run_busy_loop(self): """Core busy loop of the EngineCore.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index bf3a6f60c0c5..2a027300c401 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,4 @@ +import signal import weakref from abc import ABC, abstractmethod from typing import List, Type @@ -13,6 +14,7 @@ EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.engine.exceptions import engine_dead_error_guard from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import BackgroundProcHandle @@ -181,6 +183,20 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + + # Background procs sent SIGUSR1 if they hit error. + # We handle this by setting the _errored state to True + # and shutting down. Once _errored, we convert any + # Exceptions into an EngineDeadError for UX. 
+ def sigusr1_handler(signum, frame): + logger.fatal("LLMEngine got fatal signal from background " + "process, starting shutting down.") + self._errored = True + self.shutdown() + + signal.signal(signal.SIGUSR1, sigusr1_handler) + self._errored = False + super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -188,12 +204,14 @@ def __init__(self, log_stats=log_stats, ) + @engine_dead_error_guard def get_output(self) -> List[EngineCoreOutput]: (frame, ) = self.output_socket.recv_multipart(copy=False) engine_core_outputs = self.decoder.decode(frame.buffer).outputs return engine_core_outputs + @engine_dead_error_guard def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -227,7 +245,6 @@ def __init__(self, ) async def get_output_async(self) -> List[EngineCoreOutput]: - frames = await self.output_socket.recv_multipart(copy=False) engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 98ddc6c77b80..119d1eeabfc9 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,5 +1,5 @@ -import signal from typing import Dict, List, Mapping, Optional, Type, Union + from typing_extensions import TypeVar from vllm.config import VllmConfig @@ -44,18 +44,6 @@ def __init__( ) -> None: self.model_config = vllm_config.model_config - # Background processes send SIGUSR1 when unrecoverable - # errors occur. Start the shutdown process if this happens. - # NOTE: signal_handlers must be created and run in the main - # python thread, a workaround for this would be using polling - # rather than signal handling to detect a shutdown. Investigate. - def sigusr1_handler(signum, frame): - logger.fatal("LLMEngine go fatal signal from worker, shutting " - "down. See stack trace above for root cause issue.") - self.shutdown() - - signal.signal(signal.SIGUSR1, sigusr1_handler) - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -160,7 +148,6 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: - # 1) Process raw inputs into the request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, @@ -213,9 +200,3 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group - - def shutdown(self): - """Shutdown, cleaning up the background proc and IPC.""" - - if engine_core := getattr(self, "engine_core", None): - engine_core.shutdown() diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 0c7b0d792f58..cbc025f2c41a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -17,6 +17,7 @@ destroy_model_parallel) from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) +from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING from vllm.executor.multiproc_worker_utils import ( _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger @@ -39,13 +40,17 @@ def __init__(self, vllm_config: VllmConfig) -> None: # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) - # The child processes will send SIGUSR1 when unrecoverable - # errors happen. + # WorkerProcs send SIGUSR1 if they get an Error. 
def sigusr1_handler(signum, frame): - # Propagate error up to parent process. - parent_process = psutil.Process().parent() - parent_process.send_signal(signal.SIGUSR1) + logger.fatal("MultiprocExecutor got fatal signal from " + "background process, starting shutdown.") + # Shutdown first (avoid SysExit exceptions in __del__). self.shutdown() + # TODO(rob): move this to the VLLMConfig. + if VLLM_ENABLE_V1_MULTIPROCESSING: + # Propagate up if using the mp engine. Note that + # sending in non-mp mode crashes caller process. + psutil.Process().parent().send_signal(signal.SIGUSR1) signal.signal(signal.SIGUSR1, sigusr1_handler) @@ -405,9 +410,9 @@ def worker_busy_loop(self): (WorkerProc.ResponseStatus.FAILURE, e)) logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue - + if i == 10 and self.rank == 0: raise ValueError - i+=1 + i += 1 self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.SUCCESS, output)) From 34a997a2cbf871a716783c2477a3e9779b53b58e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:21:47 +0000 Subject: [PATCH 024/130] update comment --- vllm/v1/engine/core_client.py | 6 +++++- vllm/v1/executor/multiproc_executor.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 2a027300c401..641e3c0d0284 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -184,6 +184,11 @@ def __init__(self, executor_class: Type[Executor], log_stats: bool = False): + # NOTE(rob): signal handler only needed for SyncMPClient + # because AsyncLLM needs to handle the signal rather + # than the AsyncMPClient. TODO(follow-up): move the defn of + # these functions to async_llm.py and llm_engine.py to make + # distinction clearer. # Background procs sent SIGUSR1 if they hit error. # We handle this by setting the _errored state to True # and shutting down. Once _errored, we convert any @@ -252,7 +257,6 @@ async def get_output_async(self) -> List[EngineCoreOutput]: async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: - msg = (request_type.value, self.encoder.encode(request)) await self.input_socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index cbc025f2c41a..f4cc73cb6376 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -353,12 +353,13 @@ def signal_handler(signum, frame): logger.debug("Worker interrupted.") except Exception as e: - # Log rather than raise so the stack trace is in order. + # Log rather than raise so the stack trace is in order of + # WorkerProc -> EngineCore -> AsyncLLM. logger.exception("WorkerProc got an Exception:", exc_info=e) # The parent will send a SIGTERM to all worker processes # after we send SIGUSR. Set this value so we don't re-throw - # SystemExit(), to avoid zmq Exceptions during shyt + # SystemExit(), to avoid zmq exceptions during __del__. 
shutdown_requested = True # worker_busy_loop sends exceptions exceptons to Executor From 32cf91b52169941b30a5c464f8f6d3d8af35984b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:23:52 +0000 Subject: [PATCH 025/130] update comment --- vllm/entrypoints/launcher.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 621012a800a4..d5d000e28016 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -66,23 +66,25 @@ async def dummy_shutdown() -> None: async def watchdog_loop(server: uvicorn.Server, engine: EngineClient): - # Background task that runs in the background, checking - # for error state in the engine. This is needed for a - # clean shutdown since we cannot raise an Exception in - # a StreamingResponse generator() meaning we cannot use - # the exception handlers below. - VLLM_WATCHDOG_TIME_S = 3.0 + """ + # Watchdog task that runs in the background, checking + # for error state in the engine. Needed to trigger shutdown + # if an exception arises is StreamingResponse() generator. + """ + VLLM_WATCHDOG_TIME_S = 5.0 while True: await asyncio.sleep(VLLM_WATCHDOG_TIME_S) terminate_if_errored(server, engine) def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): - # See discussions here on shutting down a uvicorn server - # https://github.com/encode/uvicorn/discussions/1103 - # In this case we cannot await the server shutdown here - # because handler must first return to close the connection - # for this request. + """ + See discussions here on shutting down a uvicorn server + https://github.com/encode/uvicorn/discussions/1103 + In this case we cannot await the server shutdown here + because handler must first return to close the connection + for this request. + """ engine_errored = engine.errored and not engine.is_running if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored): server.should_exit = True From c73801c703a51e881e7dceca60ab62451bbedb82 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:25:49 +0000 Subject: [PATCH 026/130] fix more --- vllm/v1/engine/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5761210b6a1a..391aa59beb73 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -203,7 +203,6 @@ def signal_handler(signum, frame): finally: if engine_core is not None: engine_core.shutdown() - engine_core = None def run_busy_loop(self): """Core busy loop of the EngineCore.""" From 11888451fb593a3daad23390fb03254189a080e7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:35:03 +0000 Subject: [PATCH 027/130] updated --- vllm/v1/engine/llm_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 119d1eeabfc9..0bd9b52c9be8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -148,6 +148,7 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: + # 1) Process raw inputs into the request. 
request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, From 706782c899f6bc75f99796efd0791382f8fe5e23 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:35:42 +0000 Subject: [PATCH 028/130] udpatd --- vllm/v1/engine/core_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 641e3c0d0284..a461f82e418a 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -250,6 +250,7 @@ def __init__(self, ) async def get_output_async(self) -> List[EngineCoreOutput]: + frames = await self.output_socket.recv_multipart(copy=False) engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs @@ -257,6 +258,7 @@ async def get_output_async(self) -> List[EngineCoreOutput]: async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: + msg = (request_type.value, self.encoder.encode(request)) await self.input_socket.send_multipart(msg, copy=False) From 1cc09156ebb4edf15db6281be82ea2ffce3fd77a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:36:26 +0000 Subject: [PATCH 029/130] added exception file --- vllm/v1/engine/exceptions.py | 44 ++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 vllm/v1/engine/exceptions.py diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py new file mode 100644 index 000000000000..d190613b2cf1 --- /dev/null +++ b/vllm/v1/engine/exceptions.py @@ -0,0 +1,44 @@ +# There exceptions are raised by the LLMEngine and AsyncLLM +# when errors occur. See vllm/entrypoints/launcher.py for the +# handlers of these exceptions in the API Server. + +# Raised when a AsyncLLM.generate() fails. Possibly recoverable. +class EngineGenerateError(Exception): + pass + +# Raised when the EngineCore dies. Unrecoverable. +class EngineDeadError(Exception): + pass + +def engine_dead_error_guard(func): + """ + Decorator to be used by functions that call engine_core. + engine_core runs in a background process and sends a fatal + signal to the LLMEngine if it encounters an error. The + LLMEngine handles this signal, sets self._errored, and then + calls self.shutdown(), which kills engine_core. + + After the signal is handled, we will get an exception if + we try to interact with the engine_core. This decorator + catches the exception and raises an a more accurate + EngineDeadError exception to make the fundamental issue + clearer to the end user. + """ + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + # NOTE: args[0] is self (EngineCoreMPClient) + if not args[0]._errored: + raise e + else: + new_e = EngineDeadError( + "Engine got error in background worker process. " + "See stack trace for root cause issue.") + # Convert the exception to EngineDeadError to give the + # user a clear failure reason, suppressing. 
+ # https://docs.python.org/3/library/exceptions.html#exception-context # noqa: E501 + new_e.__suppress_context__ = True + raise new_e from None + + return wrapper \ No newline at end of file From 8db0eee569ab4593954168c58e7404750ab0e5a5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:43:31 +0000 Subject: [PATCH 030/130] updated --- vllm/entrypoints/launcher.py | 2 +- vllm/v1/engine/async_llm.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index d5d000e28016..ad2b725b49bb 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -12,7 +12,7 @@ from vllm.engine.protocol import EngineClient from vllm.logger import init_logger from vllm.utils import find_process_using_port -from vllm.v1.engine.async_llm import EngineDeadError, EngineGenerateError +from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError logger = init_logger(__name__) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index eff6aa73736a..c29f1f6d6c23 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -51,8 +51,9 @@ def __init__( # Background processes send SIGUSR1 when unrecoverable # errors occur. Start the shutdown process if this happens. def sigusr1_handler(): - logger.fatal("AsyncLLM got fatal signal from worker process, " - "shutting down. See stack trace for root cause.") + logger.fatal( + "AsyncLLM got fatal signal from background process, " + "starting shutdown. See stack trace for root cause.") self._set_errored_and_propagate() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, From 2fc8af62b72fb0eff201e07f5f7fe63f02b3246f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:52:41 +0000 Subject: [PATCH 031/130] fixt --- vllm/v1/engine/async_llm.py | 5 ++--- vllm/v1/engine/exceptions.py | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c29f1f6d6c23..51bd7baf1d53 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -51,9 +51,8 @@ def __init__( # Background processes send SIGUSR1 when unrecoverable # errors occur. Start the shutdown process if this happens. def sigusr1_handler(): - logger.fatal( - "AsyncLLM got fatal signal from background process, " - "starting shutdown. See stack trace for root cause.") + logger.fatal("AsyncLLM got fatal signal from background process, " + "starting shutdown. See stack trace for root cause.") self._set_errored_and_propagate() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index d190613b2cf1..ac554aa4bc23 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,15 +1,13 @@ -# There exceptions are raised by the LLMEngine and AsyncLLM -# when errors occur. See vllm/entrypoints/launcher.py for the -# handlers of these exceptions in the API Server. - # Raised when a AsyncLLM.generate() fails. Possibly recoverable. class EngineGenerateError(Exception): pass + # Raised when the EngineCore dies. Unrecoverable. class EngineDeadError(Exception): pass + def engine_dead_error_guard(func): """ Decorator to be used by functions that call engine_core. @@ -24,6 +22,7 @@ def engine_dead_error_guard(func): EngineDeadError exception to make the fundamental issue clearer to the end user. 
""" + def wrapper(*args, **kwargs): try: return func(*args, **kwargs) @@ -41,4 +40,4 @@ def wrapper(*args, **kwargs): new_e.__suppress_context__ = True raise new_e from None - return wrapper \ No newline at end of file + return wrapper From de39af149f73eadcfdb784079ca4048260a4d7d8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:10:51 +0000 Subject: [PATCH 032/130] reduce cruft --- vllm/engine/multiprocessing/client.py | 4 ++++ vllm/engine/protocol.py | 4 ++++ vllm/v1/engine/async_llm.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 329bcc3d1ff6..0a046c71e86e 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -402,6 +402,10 @@ async def check_health(self): def is_running(self) -> bool: return not self.errored + @property + def is_stopped(self) -> bool: + return self.errored + @property def errored(self) -> bool: return self._errored_with is not None diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index b2a5cc17ead6..9f58d61708fb 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,6 +29,10 @@ class EngineClient(ABC): def is_running(self) -> bool: ... + @abstractmethod + def is_stopped(self) -> bool: + ... + @property @abstractmethod def errored(self) -> bool: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 51bd7baf1d53..ce41c4946d71 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -390,6 +390,10 @@ async def stop_profile(self) -> None: def is_running(self) -> bool: return not self.errored + @property + def is_stopped(self) -> bool: + return False + @property def errored(self) -> bool: return self._errored From 732ba645d76f0c15f8cb20f7c1ff76004f3ac63c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:11:04 +0000 Subject: [PATCH 033/130] reduce cruft --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ce41c4946d71..434e2a7945d3 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -392,7 +392,7 @@ def is_running(self) -> bool: @property def is_stopped(self) -> bool: - return False + return self.errored @property def errored(self) -> bool: From 437209430e5c94c79e97e6d5dd29ec5eadc7caba Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:23:58 +0000 Subject: [PATCH 034/130] cleanup --- vllm/engine/protocol.py | 1 + vllm/v1/engine/core_client.py | 13 ++++++--- vllm/v1/engine/exceptions.py | 50 +++++++++-------------------------- 3 files changed, 24 insertions(+), 40 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 9f58d61708fb..a066836b9270 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,6 +29,7 @@ class EngineClient(ABC): def is_running(self) -> bool: ... + @property @abstractmethod def is_stopped(self) -> bool: ... 
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a461f82e418a..5337237b26d2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -208,13 +208,20 @@ def sigusr1_handler(signum, frame): executor_class=executor_class, log_stats=log_stats, ) + + def _handle_exception(self, e: Exception): + + @engine_dead_error_guard def get_output(self) -> List[EngineCoreOutput]: - (frame, ) = self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frame.buffer).outputs - return engine_core_outputs + try: + (frame, ) = self.output_socket.recv_multipart(copy=False) + return self.decoder.decode(frame.buffer).outputs + except Exception as e: + if self._errored + @engine_dead_error_guard def _send_input(self, request_type: EngineCoreRequestType, diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index ac554aa4bc23..965c8441373e 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,43 +1,19 @@ -# Raised when a AsyncLLM.generate() fails. Possibly recoverable. class EngineGenerateError(Exception): + """Raised when a AsyncLLM.generate() fails. Maybe recoverable.""" pass -# Raised when the EngineCore dies. Unrecoverable. class EngineDeadError(Exception): - pass - - -def engine_dead_error_guard(func): - """ - Decorator to be used by functions that call engine_core. - engine_core runs in a background process and sends a fatal - signal to the LLMEngine if it encounters an error. The - LLMEngine handles this signal, sets self._errored, and then - calls self.shutdown(), which kills engine_core. - - After the signal is handled, we will get an exception if - we try to interact with the engine_core. This decorator - catches the exception and raises an a more accurate - EngineDeadError exception to make the fundamental issue - clearer to the end user. - """ - - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - # NOTE: args[0] is self (EngineCoreMPClient) - if not args[0]._errored: - raise e - else: - new_e = EngineDeadError( - "Engine got error in background worker process. " - "See stack trace for root cause issue.") - # Convert the exception to EngineDeadError to give the - # user a clear failure reason, suppressing. - # https://docs.python.org/3/library/exceptions.html#exception-context # noqa: E501 - new_e.__suppress_context__ = True - raise new_e from None + """Raised when the EngineCore dies. Unrecoverable.""" + def __init__(self, + *args, + suppress_context: bool = False, + **kwargs): + super().__init__(args, kwargs) - return wrapper + # If we get an EngineDead signal when using LLMEngine, + # we often shutdown the EngineCore while the main + # process is still using ZMQ. This makes the root + # cause clear in the stack trace. 
+ if suppress_context: + self.__suppress_context__ = True \ No newline at end of file From b9144a34e02b91d62cd6c20d4bf60c7ba9eb7fad Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:42:22 +0000 Subject: [PATCH 035/130] updated --- vllm/v1/engine/core_client.py | 40 +++++++++++++++++------------------ vllm/v1/engine/exceptions.py | 8 +++---- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5337237b26d2..10138c37e310 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -14,7 +14,7 @@ EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc -from vllm.v1.engine.exceptions import engine_dead_error_guard +from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import BackgroundProcHandle @@ -184,15 +184,13 @@ def __init__(self, executor_class: Type[Executor], log_stats: bool = False): - # NOTE(rob): signal handler only needed for SyncMPClient + # TODO(rob): signal handler only needed for SyncMPClient # because AsyncLLM needs to handle the signal rather - # than the AsyncMPClient. TODO(follow-up): move the defn of - # these functions to async_llm.py and llm_engine.py to make - # distinction clearer. + # than the AsyncMPClient. TODO(rob): move the Client def + # to async_llm and llm_engine to make this clearer. # Background procs sent SIGUSR1 if they hit error. - # We handle this by setting the _errored state to True - # and shutting down. Once _errored, we convert any - # Exceptions into an EngineDeadError for UX. + # Handle by setting _errored=True and shutting down. + # Next action taken will raise EngineDeadError. def sigusr1_handler(signum, frame): logger.fatal("LLMEngine got fatal signal from background " "process, starting shutting down.") @@ -208,28 +206,30 @@ def sigusr1_handler(signum, frame): executor_class=executor_class, log_stats=log_stats, ) - - def _handle_exception(self, e: Exception): - + def _format_exception(self, e: Exception) -> Exception: + # If we are in the _errored state, raise EngineDeadError + # so the root cause is clear in the stack trace. + return (EngineDeadError( + "EngineCore encountered an issue. 
See stack trace " + "for the root cause.", + suppress_context=True) if self._errored else e) - @engine_dead_error_guard def get_output(self) -> List[EngineCoreOutput]: - try: (frame, ) = self.output_socket.recv_multipart(copy=False) return self.decoder.decode(frame.buffer).outputs except Exception as e: - if self._errored + raise self._format_exception(e) from None - - @engine_dead_error_guard def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: - - # (RequestType, SerializedRequest) - msg = (request_type.value, self.encoder.encode(request)) - self.input_socket.send_multipart(msg, copy=False) + try: + # (RequestType, SerializedRequest) + msg = (request_type.value, self.encoder.encode(request)) + self.input_socket.send_multipart(msg, copy=False) + except Exception as e: + raise self._format_exception(e) from None def add_request(self, request: EngineCoreRequest) -> None: self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 965c8441373e..523416ca384d 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -5,10 +5,8 @@ class EngineGenerateError(Exception): class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" - def __init__(self, - *args, - suppress_context: bool = False, - **kwargs): + + def __init__(self, *args, suppress_context: bool = False, **kwargs): super().__init__(args, kwargs) # If we get an EngineDead signal when using LLMEngine, @@ -16,4 +14,4 @@ def __init__(self, # process is still using ZMQ. This makes the root # cause clear in the stack trace. if suppress_context: - self.__suppress_context__ = True \ No newline at end of file + self.__suppress_context__ = True From d90e122e2047228847f171c3ce7a519649a0a48f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:43:46 +0000 Subject: [PATCH 036/130] cruft --- vllm/v1/executor/multiproc_executor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index f4cc73cb6376..ab96842210fa 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -40,10 +40,12 @@ def __init__(self, vllm_config: VllmConfig) -> None: # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) - # WorkerProcs send SIGUSR1 if they get an Error. + # The child processes will send SIGUSR1 when unrecoverable + # errors happen. def sigusr1_handler(signum, frame): - logger.fatal("MultiprocExecutor got fatal signal from " - "background process, starting shutdown.") + logger.fatal( + "MulitprocExecutor got fatal signal from worker processes, " + "shutting down. See stack trace above for root cause issue.") # Shutdown first (avoid SysExit exceptions in __del__). self.shutdown() # TODO(rob): move this to the VLLMConfig. 
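Editorial aside: the _format_exception / EngineDeadError conversion added in PATCH 035 above is easier to follow in isolation. Below is a minimal, runnable sketch of the same pattern; FlakyTransport and ClientSketch are made-up stand-ins for the ZMQ socket and the MPClient (not vLLM code). Once _errored has been set by the signal handler, any later IPC failure is re-raised as EngineDeadError, with the unrelated transport traceback suppressed, so the user sees the real failure reason.

class EngineDeadError(Exception):
    """Illustrative copy of the idea: raised once the engine has died."""

    def __init__(self, *args, suppress_context: bool = False):
        super().__init__(*args)
        # Hide the irrelevant transport error from the traceback.
        self.__suppress_context__ = suppress_context


class FlakyTransport:
    """Hypothetical stand-in for the ZMQ socket used by the client."""

    def recv(self) -> bytes:
        raise ConnectionError("socket closed")


class ClientSketch:
    def __init__(self) -> None:
        self._errored = False
        self._socket = FlakyTransport()

    def _format_exception(self, e: Exception) -> Exception:
        # If the background process already died, surface that fact
        # instead of whatever IPC error was raised afterwards.
        return (EngineDeadError(
            "EngineCore died; see earlier logs for the root cause.",
            suppress_context=True) if self._errored else e)

    def get_output(self) -> bytes:
        try:
            return self._socket.recv()
        except Exception as e:
            raise self._format_exception(e) from None


client = ClientSketch()
client._errored = True  # pretend the SIGUSR1 handler already ran
try:
    client.get_output()
except EngineDeadError as err:
    print(type(err).__name__, err)

Running the sketch prints an EngineDeadError rather than the ConnectionError, which is the user-experience goal of the patch.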
From 2bbac313c2e71d500fed6dbfd576ef861e6f7bca Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:48:32 +0000 Subject: [PATCH 037/130] updated --- vllm/v1/engine/core_client.py | 3 +-- vllm/v1/engine/exceptions.py | 10 +++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 10138c37e310..ddedac0be1c1 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -208,8 +208,7 @@ def sigusr1_handler(signum, frame): ) def _format_exception(self, e: Exception) -> Exception: - # If we are in the _errored state, raise EngineDeadError - # so the root cause is clear in the stack trace. + """If _errored, use EngineDeadError so root cause is clear.""" return (EngineDeadError( "EngineCore encountered an issue. See stack trace " "for the root cause.", diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 523416ca384d..5313c1e0943c 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,10 +8,6 @@ class EngineDeadError(Exception): def __init__(self, *args, suppress_context: bool = False, **kwargs): super().__init__(args, kwargs) - - # If we get an EngineDead signal when using LLMEngine, - # we often shutdown the EngineCore while the main - # process is still using ZMQ. This makes the root - # cause clear in the stack trace. - if suppress_context: - self.__suppress_context__ = True + # Make stack trace clearer when using with LLMEngine by + # silencing irrelevant ZMQError. + self.__suppress_context__ = suppress_context From c40542abb83ea1b5f40093e8710e2801ada0cf81 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 19:11:11 +0000 Subject: [PATCH 038/130] revert changes to server --- vllm/entrypoints/openai/serving_chat.py | 10 +++++----- vllm/entrypoints/openai/serving_completion.py | 8 ++++---- vllm/entrypoints/openai/serving_embedding.py | 6 +++--- vllm/entrypoints/openai/serving_pooling.py | 6 +++--- vllm/entrypoints/openai/serving_score.py | 6 +++--- vllm/entrypoints/openai/serving_tokenization.py | 2 +- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a20bf1efa08a..9ba5eeb7709c 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -171,7 +171,7 @@ async def create_chat_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -228,7 +228,7 @@ async def create_chat_completion( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -245,7 +245,7 @@ async def create_chat_completion( return await self.chat_completion_full_generator( request, result_generator, request_id, model_name, conversation, tokenizer, request_metadata) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -591,7 +591,7 @@ async def chat_completion_stream_generator( completion_tokens=num_completion_tokens, total_tokens=num_prompt_tokens + num_completion_tokens) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error 
logger.exception("Error in chat completion stream generator.") data = self.create_streaming_error_response(str(e)) @@ -618,7 +618,7 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 53ae1b134590..17197dce8da2 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -106,7 +106,7 @@ async def create_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -158,7 +158,7 @@ async def create_completion( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -215,7 +215,7 @@ async def create_completion( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -371,7 +371,7 @@ async def completion_stream_generator( # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index fe8ba5eb95b9..e7116a3d95d1 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -136,7 +136,7 @@ async def create_embedding( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -167,7 +167,7 @@ async def create_embedding( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -196,7 +196,7 @@ async def create_embedding( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 3441071344f4..5830322071e5 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -132,7 +132,7 @@ async def create_pooling( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -163,7 +163,7 @@ async def create_pooling( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation 
Error return self.create_error_response(str(e)) @@ -192,7 +192,7 @@ async def create_pooling( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 9b5aa13bda84..5d3e7139d7a1 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,7 +101,7 @@ async def create_score( if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -155,7 +155,7 @@ async def create_score( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -184,7 +184,7 @@ async def create_score( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index a3dc42ff8f02..b67ecfb01316 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -86,7 +86,7 @@ async def create_tokenize( request.prompt, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) From 46734ebbe3dcbc5bcec4347c950d0d3704d11b49 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 19:13:44 +0000 Subject: [PATCH 039/130] revert debug cruft --- vllm/v1/executor/multiproc_executor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ab96842210fa..9464744e184b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -402,7 +402,6 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" - i = 0 while True: method, args, kwargs = self.rpc_broadcast_mq.dequeue() @@ -414,8 +413,5 @@ def worker_busy_loop(self): logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue - if i == 10 and self.rank == 0: - raise ValueError - i += 1 self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.SUCCESS, output)) From f0baffbe8d07d02a1b3930b84db1b9151679d960 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 20:19:04 +0000 Subject: [PATCH 040/130] fix error --- vllm/v1/engine/core_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ddedac0be1c1..4a069b0edba9 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -193,7 +193,7 @@ def __init__(self, # Next action taken will raise EngineDeadError. 
def sigusr1_handler(signum, frame): logger.fatal("LLMEngine got fatal signal from background " - "process, starting shutting down.") + "process, shutting down.") self._errored = True self.shutdown() From 8a7f18e7234f2075778994a0d5a3daafc8da176b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:34:34 +0000 Subject: [PATCH 041/130] added tests --- tests/v1/shutdown/forward_error.py | 116 +++++++++++++++++++++++++++++ tests/v1/utils.py | 28 +++++++ 2 files changed, 144 insertions(+) create mode 100644 tests/v1/shutdown/forward_error.py create mode 100644 tests/v1/utils.py diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/forward_error.py new file mode 100644 index 000000000000..824447ce4da0 --- /dev/null +++ b/tests/v1/shutdown/forward_error.py @@ -0,0 +1,116 @@ +"""Test that we handle an Error in model forward and shutdown.""" + +import asyncio +import pytest + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.utils import cuda_device_count_stateless +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.exceptions import EngineDeadError + +from tests.utils import wait_for_gpu_memory_to_clear + +def evil_forward(self, *args, **kwargs): + """Evil forward method that raise an exception after 5 calls.""" + NUMBER_OF_GOOD_PASSES = 10 + + if not hasattr(self, "num_calls"): + self.num_calls = 0 + + if (self.num_calls == NUMBER_OF_GOOD_PASSES and + get_tensor_model_parallel_rank() == 0): + raise Exception("Simulated illegal memory access on Rank 0!") + self.num_calls += 1 + + return self.model(*args, **kwargs, intermediate_tensors=None) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs( + model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + generator = async_llm.generate("Hello my name is", + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should have get an EngineDeadError. + for output in outputs: + assert isinstance(output, EngineDeadError) + + # AsyncLLM should be errored. + assert async_llm.errored + + # We should not be able to make another request. + with pytest.raises(EngineDeadError): + async for _ in async_llm.generate( + "Hello my name is", request_id="abc", + sampling_params=SamplingParams()): + raise Exception("We should not get here.") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + # NOTE: shutdown is handled by the API Server. If an exception + # occurs, so it is expected that we would need to call this. 
+ async_llm.shutdown() + + +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +def test_llm_model_error(monkeypatch, tensor_parallel_size, + enable_multiprocessing): + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + llm = LLM(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + with pytest.raises(EngineDeadError): + llm.generate("Hello my name is Robert and I") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/tests/v1/utils.py b/tests/v1/utils.py new file mode 100644 index 000000000000..b06c775220cc --- /dev/null +++ b/tests/v1/utils.py @@ -0,0 +1,28 @@ +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.model_executor.models.llama import LlamaModel + +NUMBER_OF_GOOD_PASSES = 10 + +class ModelForwardError(Exception): + pass + +class EvilLlamaModel(LlamaModel): + """Evil Llama Class For Simulating Model Issue.""" + + def __init__(self, *args, **kwargs): + super.__init__(*args, **kwargs) + self.good_passes = 0 + + def forward(self, *args, **kwargs): + """Raise an after N iterations""" + + if (self.good_passes == NUMBER_OF_GOOD_PASSES and + get_tensor_model_parallel_rank() == 0): + raise ModelForwardError( + "Simulated illegal memory access on rank 0!") + self.good_passes += 1 + return self.forward(*args, **kwargs) + + + + From a66294091094dfbcdc3f7506624f94ce5ee0ba9b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:34:44 +0000 Subject: [PATCH 042/130] revert --- tests/v1/utils.py | 28 ---------------------------- vllm/v1/engine/async_llm.py | 32 +++++++++++++++++--------------- 2 files changed, 17 insertions(+), 43 deletions(-) delete mode 100644 tests/v1/utils.py diff --git a/tests/v1/utils.py b/tests/v1/utils.py deleted file mode 100644 index b06c775220cc..000000000000 --- a/tests/v1/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -from vllm.distributed import get_tensor_model_parallel_rank -from vllm.model_executor.models.llama import LlamaModel - -NUMBER_OF_GOOD_PASSES = 10 - -class ModelForwardError(Exception): - pass - -class EvilLlamaModel(LlamaModel): - """Evil Llama Class For Simulating Model Issue.""" - - def __init__(self, *args, **kwargs): - super.__init__(*args, **kwargs) - self.good_passes = 0 - - def forward(self, *args, **kwargs): - """Raise an after N iterations""" - - if (self.good_passes == NUMBER_OF_GOOD_PASSES and - get_tensor_model_parallel_rank() == 0): - raise ModelForwardError( - "Simulated illegal memory access on rank 0!") - self.good_passes += 1 - return self.forward(*args, **kwargs) - - - - diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 434e2a7945d3..978740a6bd60 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -42,7 +42,7 @@ def __init__( start_engine_loop: bool = True, ) -> None: - self._errored = False + self.engine_core_errored = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -129,7 +129,7 @@ def 
from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" - + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() @@ -167,6 +167,9 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" + if self.engine_core_errored: + raise EngineDeadError() + # 1) Create a new output queue for the request. if request_id in self.rid_to_queue: raise ValueError(f"Request id {request_id} already running.") @@ -219,8 +222,6 @@ async def generate( The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. """ - if self.errored: - raise EngineDeadError() try: # We start the output_handler on the first call to generate() so @@ -258,25 +259,25 @@ async def generate( yield out - # If the request is disconnected by the client, the - # generate() task will be canceled. So, we abort the - # request if we end up here. + # If the request is disconnected by the client, generate() + # is cancelled. So, we abort the request if we end up here. except asyncio.CancelledError: await self.abort(request_id) if self.log_requests: logger.info("Request %s aborted.", request_id) raise - # EngineCore or output_handler pushed error. Raise so API Server - # can handle and shutdown in vllm/entrypoints/launcher.py. + # EngineCore or output_handler pushed error. except EngineDeadError: + # NOTE: we do not abort, since the EngineCore is dead + # and we will shut down anyways (unrecoverable). if self.log_requests: logger.info("Request %s failed.", request_id) raise - # Error in the generate() task (possibly recoverable). Raise so API - # Server can handle and maybe shutdown vllm/entrypoints/launcher.py. + # Error in the generate() task (possibly recoverable). except Exception as e: + await self.abort(request_id) if self.log_requests: logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e @@ -315,13 +316,14 @@ async def _run_output_handler(self): raise except Exception as e: - logger.fatal("AsyncLLM._run_output_handler failed") + logger.error( + "AsyncLLM output_handler got an exception, shutting down", + exec_info=e) self._set_errored_and_propagate() - raise EngineDeadError() from e def _set_errored_and_propagate(self): """Propagate to all generate() tasks.""" - self._errored = True + self.engine_core_errored = True # Put EngineDeadError() into each generate() task's queue, # each of which will raise it in their own context. 
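The fan-out in _set_errored_and_propagate() above is the usual asyncio pattern of pushing a sentinel exception into every consumer's queue so that each generate()-style task re-raises it in its own context instead of hanging. A minimal, self-contained sketch of that pattern follows; the names (per_request_consumer, queues) are illustrative only and are not the actual AsyncLLM attributes:

    import asyncio

    class EngineDeadError(Exception):
        """Stand-in for the unrecoverable engine error."""

    async def per_request_consumer(q: asyncio.Queue):
        # Mirrors a generate() task: pull from its own queue and
        # re-raise any exception that was pushed into it.
        item = await q.get()
        if isinstance(item, Exception):
            raise item
        return item

    async def main():
        queues = {f"request-{i}": asyncio.Queue() for i in range(3)}
        tasks = [asyncio.create_task(per_request_consumer(q))
                 for q in queues.values()]
        # The background output handler hits a fatal error: propagate
        # it to every per-request queue so no task waits forever.
        for q in queues.values():
            q.put_nowait(EngineDeadError("engine core died"))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        assert all(isinstance(r, EngineDeadError) for r in results)

    asyncio.run(main())

Because the error is delivered through the same queues that normal RequestOutputs travel through, the per-request tasks need no extra signalling machinery to observe an engine failure.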
@@ -396,7 +398,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self._errored + return self.engine_core_errored @property def dead_error(self) -> BaseException: From 4ee6390b0563f100ab6fdc9a0222442323b3cf35 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:55:00 +0000 Subject: [PATCH 043/130] fixed --- tests/v1/shutdown/forward_error.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/forward_error.py index 824447ce4da0..83bc16a54fa7 100644 --- a/tests/v1/shutdown/forward_error.py +++ b/tests/v1/shutdown/forward_error.py @@ -32,6 +32,9 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) async def test_async_llm_model_error(monkeypatch, tensor_parallel_size): + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") From 3e23ee2bfebf233645c581cd8bd895d5a1c31774 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:55:16 +0000 Subject: [PATCH 044/130] updated --- tests/v1/shutdown/processor_error.py | 54 ++++++++++++++++++++++++++++ tests/v1/shutdown/startup_error.py | 0 2 files changed, 54 insertions(+) create mode 100644 tests/v1/shutdown/processor_error.py create mode 100644 tests/v1/shutdown/startup_error.py diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/processor_error.py new file mode 100644 index 000000000000..49cba6e9a674 --- /dev/null +++ b/tests/v1/shutdown/processor_error.py @@ -0,0 +1,54 @@ +"""Test error handling in Processor.""" + +import asyncio +import pytest + +from vllm import SamplingParams +from vllm.inputs.data import TokensPrompt +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.exceptions import EngineGenerateError + + +@pytest.mark.asyncio +async def test_async_llm_processor_error(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + enforce_eager=True) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + # [] is not allowed and will raise a ValueError in Processor. + generator = async_llm.generate(TokensPrompt([]), + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should have get an EngineGenerateError. + for output in outputs: + with pytest.raises(EngineGenerateError): + raise output + + # AsyncLLM should be errored. + assert not async_llm.errored + + # This should be no problem. 
+ outputs = [] + async for out in async_llm.generate( + "Hello my name is", request_id="abc", + sampling_params=SamplingParams(max_tokens=5)): + outputs.append(out) + assert len(outputs) == 5 + + async_llm.shutdown() diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/startup_error.py new file mode 100644 index 000000000000..e69de29bb2d1 From 45456f921be3c332a0b053e67a54fc2accccca5a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 22:31:59 +0000 Subject: [PATCH 045/130] fixed error --- tests/v1/shutdown/forward_error.py | 22 ++++---- tests/v1/shutdown/processor_error.py | 10 ++-- tests/v1/shutdown/startup_error.py | 80 ++++++++++++++++++++++++++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 40 +++++++------- vllm/v1/utils.py | 4 +- 6 files changed, 123 insertions(+), 35 deletions(-) diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/forward_error.py index 83bc16a54fa7..5017bc21ac71 100644 --- a/tests/v1/shutdown/forward_error.py +++ b/tests/v1/shutdown/forward_error.py @@ -1,17 +1,18 @@ """Test that we handle an Error in model forward and shutdown.""" import asyncio + import pytest +from tests.utils import wait_for_gpu_memory_to_clear from vllm import LLM, SamplingParams +from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.utils import cuda_device_count_stateless from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.distributed import get_tensor_model_parallel_rank +from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError -from tests.utils import wait_for_gpu_memory_to_clear def evil_forward(self, *args, **kwargs): """Evil forward method that raise an exception after 5 calls.""" @@ -19,9 +20,9 @@ def evil_forward(self, *args, **kwargs): if not hasattr(self, "num_calls"): self.num_calls = 0 - - if (self.num_calls == NUMBER_OF_GOOD_PASSES and - get_tensor_model_parallel_rank() == 0): + + if (self.num_calls == NUMBER_OF_GOOD_PASSES + and get_tensor_model_parallel_rank() == 0): raise Exception("Simulated illegal memory access on Rank 0!") self.num_calls += 1 @@ -56,7 +57,7 @@ async def generate(request_id: str): pass except Exception as e: return e - + NUM_REQS = 3 tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] outputs = await asyncio.gather(*tasks) @@ -71,8 +72,9 @@ async def generate(request_id: str): # We should not be able to make another request. with pytest.raises(EngineDeadError): async for _ in async_llm.generate( - "Hello my name is", request_id="abc", - sampling_params=SamplingParams()): + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams()): raise Exception("We should not get here.") # Confirm all the processes are cleaned up. @@ -110,7 +112,7 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, with pytest.raises(EngineDeadError): llm.generate("Hello my name is Robert and I") - + # Confirm all the processes are cleaned up. 
wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/processor_error.py index 49cba6e9a674..0b7a40bdbd97 100644 --- a/tests/v1/shutdown/processor_error.py +++ b/tests/v1/shutdown/processor_error.py @@ -1,11 +1,12 @@ """Test error handling in Processor.""" import asyncio + import pytest from vllm import SamplingParams -from vllm.inputs.data import TokensPrompt from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.inputs.data import TokensPrompt from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineGenerateError @@ -30,7 +31,7 @@ async def generate(request_id: str): pass except Exception as e: return e - + NUM_REQS = 3 tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] outputs = await asyncio.gather(*tasks) @@ -46,8 +47,9 @@ async def generate(request_id: str): # This should be no problem. outputs = [] async for out in async_llm.generate( - "Hello my name is", request_id="abc", - sampling_params=SamplingParams(max_tokens=5)): + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams(max_tokens=5)): outputs.append(out) assert len(outputs) == 5 diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/startup_error.py index e69de29bb2d1..4a3119e29f47 100644 --- a/tests/v1/shutdown/startup_error.py +++ b/tests/v1/shutdown/startup_error.py @@ -0,0 +1,80 @@ +"""Test that we handle a startup Error and shutdown.""" + +import pytest + +from tests.utils import wait_for_gpu_memory_to_clear +from vllm import LLM +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.utils import cuda_device_count_stateless +from vllm.v1.engine.async_llm import AsyncLLM + + +def evil_forward(self, *args, **kwargs): + """Evil forward method that raise an exception.""" + + if get_tensor_model_parallel_rank() == 0: + raise Exception("Simulated Error in startup!") + + return self.model(*args, **kwargs, intermediate_tensors=None) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs( + model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + # Confirm we get an exception. + with pytest.raises(Exception, match="initialization failed"): + _ = AsyncLLM.from_engine_args(engine_args) + + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +def test_llm_startup_error(monkeypatch, tensor_parallel_size, + enable_multiprocessing): + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + with pytest.raises(Exception, match="initialization failed"): + _ = LLM(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 978740a6bd60..947a186cd0d6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -129,7 +129,7 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" - + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 391aa59beb73..e65195c79c67 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -145,24 +145,28 @@ def __init__( executor_class: Type[Executor], log_stats: bool = False, ): - super().__init__(vllm_config, executor_class, log_stats) - - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() - self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() - threading.Thread(target=self.process_input_socket, - args=(input_path, ), - daemon=True).start() - threading.Thread(target=self.process_output_socket, - args=(output_path, ), - daemon=True).start() - - # Send Readiness signal to EngineClient. - ready_pipe.send({"status": "READY"}) + try: + super().__init__(vllm_config, executor_class, log_stats) + + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + self.input_queue: queue.Queue[ + EngineCoreRequestUnion] = queue.Queue() + self.output_queue: queue.Queue[ + List[EngineCoreOutput]] = queue.Queue() + threading.Thread(target=self.process_input_socket, + args=(input_path, ), + daemon=True).start() + threading.Thread(target=self.process_output_socket, + args=(output_path, ), + daemon=True).start() + # Send Readiness signal to EngineClient. 
+ ready_pipe.send({"status": "READY"}) + except Exception: + ready_pipe.send({"status": "FAILED"}) @staticmethod def run_engine_core(*args, **kwargs): diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b0a7affbebb7..9eb81f104a26 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -110,8 +110,8 @@ def __init__( # Wait for startup. if reader.recv()["status"] != "READY": - raise RuntimeError(f"{process_name} initialization failed. " - "See root cause above.") + raise Exception(f"{process_name} initialization failed. " + "See stack trace for root cause.") def shutdown(self): self._finalizer() From 6128b1acb64ac1b79e91dad73ec06ac07c513843 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 22:58:11 +0000 Subject: [PATCH 046/130] update test coverage --- tests/v1/shutdown/startup_error.py | 17 ++++++++++++----- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 4 +++- vllm/v1/executor/multiproc_executor.py | 2 +- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/startup_error.py index 4a3119e29f47..25f2b77b2f3d 100644 --- a/tests/v1/shutdown/startup_error.py +++ b/tests/v1/shutdown/startup_error.py @@ -20,9 +20,15 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs, intermediate_tensors=None) -@pytest.mark.asyncio +MODELS = [ + "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. + "mistralai/Mixtral-8x22B-Instruct-v0.1" # Causes OOM. +] + + +@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): +def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size): if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -34,7 +40,7 @@ async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) engine_args = AsyncEngineArgs( - model="meta-llama/Llama-3.2-1B", + model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) @@ -50,9 +56,10 @@ async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): ) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_llm_startup_error(monkeypatch, tensor_parallel_size, +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, enable_multiprocessing): if cuda_device_count_stateless() < tensor_parallel_size: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 947a186cd0d6..1e0cf6d5a810 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -318,7 +318,7 @@ async def _run_output_handler(self): except Exception as e: logger.error( "AsyncLLM output_handler got an exception, shutting down", - exec_info=e) + exc_info=e) self._set_errored_and_propagate() def _set_errored_and_propagate(self): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e65195c79c67..b0b7cf3eecfe 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -165,7 +165,9 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. 
ready_pipe.send({"status": "READY"}) - except Exception: + + except Exception as e: + logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) @staticmethod diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 9464744e184b..140fc8293134 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -385,7 +385,7 @@ def wait_for_startup( # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.debug("Waiting for WorkerProc to startup.") + logger.info("Waiting for WorkerProc to startup.") if not proc.is_alive(): raise RuntimeError("WorkerProc failed to start.") From de2455930fdc47316767c15e56fd8925171d028a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 23:25:44 +0000 Subject: [PATCH 047/130] stash --- tests/v1/shutdown/processor_error.py | 2 +- vllm/v1/engine/core_client.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/processor_error.py index 0b7a40bdbd97..056851025eca 100644 --- a/tests/v1/shutdown/processor_error.py +++ b/tests/v1/shutdown/processor_error.py @@ -1,4 +1,4 @@ -"""Test error handling in Processor.""" +"""Test error handling in Processor. Should not impact other reqs.""" import asyncio diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4a069b0edba9..4f009cbff166 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -170,6 +170,7 @@ def __init__( def shutdown(self): """Clean up background resources.""" + print("IN MPCLIENT.shutdown.") if hasattr(self, "proc_handle"): self.proc_handle.shutdown() From 7adf26ec2c1f3b276dbb4e4d767f48c08c86dfc5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 6 Jan 2025 03:15:23 +0000 Subject: [PATCH 048/130] added tests --- ...forward_error.py => test_forward_error.py} | 0 ...essor_error.py => test_processor_error.py} | 0 ...startup_error.py => test_startup_error.py} | 0 vllm/v1/engine/async_llm.py | 37 ++------- vllm/v1/engine/core_client.py | 83 ++++++++++++------- vllm/v1/executor/multiproc_executor.py | 2 +- vllm/v1/utils.py | 31 +++++-- 7 files changed, 87 insertions(+), 66 deletions(-) rename tests/v1/shutdown/{forward_error.py => test_forward_error.py} (100%) rename tests/v1/shutdown/{processor_error.py => test_processor_error.py} (100%) rename tests/v1/shutdown/{startup_error.py => test_startup_error.py} (100%) diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/test_forward_error.py similarity index 100% rename from tests/v1/shutdown/forward_error.py rename to tests/v1/shutdown/test_forward_error.py diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/test_processor_error.py similarity index 100% rename from tests/v1/shutdown/processor_error.py rename to tests/v1/shutdown/test_processor_error.py diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/test_startup_error.py similarity index 100% rename from tests/v1/shutdown/startup_error.py rename to tests/v1/shutdown/test_startup_error.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1e0cf6d5a810..aaf4ca6ccaed 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,4 @@ import asyncio -import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -17,7 +16,7 @@ 
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.core_client import AsyncMPClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.processor import Processor @@ -48,16 +47,6 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # Background processes send SIGUSR1 when unrecoverable - # errors occur. Start the shutdown process if this happens. - def sigusr1_handler(): - logger.fatal("AsyncLLM got fatal signal from background process, " - "starting shutdown. See stack trace for root cause.") - self._set_errored_and_propagate() - - asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, - sigusr1_handler) - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -87,15 +76,14 @@ def sigusr1_handler(): ) # EngineCore (starts the engine in background process). - self.engine_core = EngineCoreClient.make_client( - multiprocess_mode=True, - asyncio_mode=True, + self.engine_core = AsyncMPClient( vllm_config=vllm_config, executor_class=executor_class, log_stats=self.log_stats, ) - self.output_handler: Optional[asyncio.Task] = None + # Output handler background task. + self.output_handler = asyncio.create_task(self._run_output_handler()) @classmethod def from_engine_args( @@ -129,13 +117,12 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if handler := getattr(self, "output_handler", None): + handler.cancel() if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() - @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] @@ -224,13 +211,6 @@ async def generate( """ try: - # We start the output_handler on the first call to generate() so - # we can call __init__ before the event loop, which enables us - # to handle startup failure gracefully in the OpenAI server. - if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) - q = await self.add_request( request_id, prompt, @@ -316,9 +296,8 @@ async def _run_output_handler(self): raise except Exception as e: - logger.error( - "AsyncLLM output_handler got an exception, shutting down", - exc_info=e) + logger.error("AsyncLLM output_handler got an Exception:", + exc_info=e) self._set_errored_and_propagate() def _set_errored_and_propagate(self): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4f009cbff166..b446bac2e701 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,4 @@ +import asyncio import signal import weakref from abc import ABC, abstractmethod @@ -157,6 +158,7 @@ def __init__( zmq.constants.PUSH) # Start EngineCore in background process. 
+ self.engine_core_errored = False self.proc_handle = BackgroundProcHandle( input_path=input_path, output_path=output_path, @@ -167,15 +169,31 @@ def __init__( "executor_class": executor_class, "log_stats": log_stats, }) + self.proc_handle.wait_for_startup() def shutdown(self): """Clean up background resources.""" - print("IN MPCLIENT.shutdown.") - if hasattr(self, "proc_handle"): - self.proc_handle.shutdown() + self.proc_handle.shutdown() self._finalizer() + def _sigusr1_handler(self): + """ + EngineCoreProc sends SIGUSR1 if it encounters an Exception. + Set self in errored state and begin shutdown. + """ + logger.fatal("Got fatal signal from EngineCore, shutting down.") + self.engine_core_errored = True + self.shutdown() + + def _format_exception(self, e: Exception) -> Exception: + """If errored, use EngineDeadError so root cause is clear.""" + + return (EngineDeadError( + "EngineCore encountered an issue. See stack trace " + "for the root cause.", + suppress_context=True) if self.engine_core_errored else e) + class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" @@ -185,21 +203,11 @@ def __init__(self, executor_class: Type[Executor], log_stats: bool = False): - # TODO(rob): signal handler only needed for SyncMPClient - # because AsyncLLM needs to handle the signal rather - # than the AsyncMPClient. TODO(rob): move the Client def - # to async_llm and llm_engine to make this clearer. - # Background procs sent SIGUSR1 if they hit error. - # Handle by setting _errored=True and shutting down. - # Next action taken will raise EngineDeadError. + # Setup EngineCore signal handler. def sigusr1_handler(signum, frame): - logger.fatal("LLMEngine got fatal signal from background " - "process, shutting down.") - self._errored = True - self.shutdown() + self._sigusr1_handler() signal.signal(signal.SIGUSR1, sigusr1_handler) - self._errored = False super().__init__( asyncio_mode=False, @@ -208,13 +216,6 @@ def sigusr1_handler(signum, frame): log_stats=log_stats, ) - def _format_exception(self, e: Exception) -> Exception: - """If _errored, use EngineDeadError so root cause is clear.""" - return (EngineDeadError( - "EngineCore encountered an issue. See stack trace " - "for the root cause.", - suppress_context=True) if self._errored else e) - def get_output(self) -> List[EngineCoreOutput]: try: (frame, ) = self.output_socket.recv_multipart(copy=False) @@ -249,6 +250,23 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + + # EngineCore sends SIGUSR1 when it gets an Exception. + def sigusr1_handler_asyncio(): + self._sigusr1_handler() + + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler_asyncio) + + # super().__init__ blocks the event loop until background + # procs are setup. This handler allows us to catch issues + # during startup. + def sigusr1_handler(signum, frame): + self._sigusr1_handler() + + signal.signal(signal.SIGUSR1, sigusr1_handler) + + # Initialize EngineCore + all background processes. super().__init__( asyncio_mode=True, vllm_config=vllm_config, @@ -256,18 +274,23 @@ def __init__(self, log_stats=log_stats, ) - async def get_output_async(self) -> List[EngineCoreOutput]: - - frames = await self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs + # Remove the non-asyncio handler. 
+ signal.signal(signal.SIGUSR1, signal.SIG_DFL) - return engine_core_outputs + async def get_output_async(self) -> List[EngineCoreOutput]: + try: + frames = await self.output_socket.recv_multipart(copy=False) + return self.decoder.decode(frames[0].buffer).outputs + except Exception as e: + raise self._format_exception(e) from None async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: - - msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False) + try: + msg = (request_type.value, self.encoder.encode(request)) + await self.input_socket.send_multipart(msg, copy=False) + except Exception as e: + raise self._format_exception(e) from None async def add_request_async(self, request: EngineCoreRequest) -> None: await self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 140fc8293134..36eca453307b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -48,8 +48,8 @@ def sigusr1_handler(signum, frame): "shutting down. See stack trace above for root cause issue.") # Shutdown first (avoid SysExit exceptions in __del__). self.shutdown() - # TODO(rob): move this to the VLLMConfig. if VLLM_ENABLE_V1_MULTIPROCESSING: + # TODO(rob): move this to the VLLMConfig. # Propagate up if using the mp engine. Note that # sending in non-mp mode crashes caller process. psutil.Process().parent().send_signal(signal.SIGUSR1) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 9eb81f104a26..1ebd71544d9a 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -93,7 +93,8 @@ def __init__( process_kwargs: Dict[Any, Any], ): context = get_mp_context() - reader, writer = context.Pipe(duplex=False) + self.reader, writer = context.Pipe(duplex=False) + self.process_name = process_name assert ("ready_pipe" not in process_kwargs and "input_path" not in process_kwargs @@ -102,20 +103,38 @@ def __init__( process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path + # Flag for shutdown state. BackgroundProcs send signals + # when errors occur which calls shutdown(). If we are in + # startup loop when signaled, this flag breaks us out. + self.shutting_down = False + # Run busy loop in background process. self.proc = context.Process(target=target_fn, kwargs=process_kwargs) self._finalizer = weakref.finalize(self, shutdown, self.proc, input_path, output_path) self.proc.start() - # Wait for startup. - if reader.recv()["status"] != "READY": - raise Exception(f"{process_name} initialization failed. " - "See stack trace for root cause.") - def shutdown(self): + self.shutting_down = True self._finalizer() + def wait_for_startup(self): + """Wait until the background process is ready.""" + + e = Exception(f"{self.process_name} initialization failed due to " + "an exception in a background process. See stack trace " + "for root cause.") + + while not self.reader.poll(timeout=1): + if self.shutting_down: + raise e + try: + if self.reader.recv()["status"] != "READY": + raise e + except EOFError: + e.__suppress_context__ = True + raise e from None + # Note(rob): shutdown function cannot be a bound method, # else the gc cannot collect the object. 
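The READY/FAILED handshake added to BackgroundProcHandle and EngineCoreProc in the patch above is a generic parent/child startup pattern: the child wraps all of its initialization in try/except and reports a status over a one-way pipe, while the parent polls so it can also notice a child that dies before reporting anything. A standalone sketch of that pattern using only the standard library (all names here are illustrative, not vLLM APIs):

    import multiprocessing as mp

    def child_main(ready_pipe):
        try:
            # ... heavyweight initialization (device setup, model load) ...
            ready_pipe.send({"status": "READY"})
        except Exception:
            ready_pipe.send({"status": "FAILED"})
            raise

    def start_background_proc():
        ctx = mp.get_context("spawn")
        reader, writer = ctx.Pipe(duplex=False)
        proc = ctx.Process(target=child_main, args=(writer,), daemon=True)
        proc.start()
        # Close the parent's copy of the write end so recv() raises
        # EOFError if the child exits without ever sending a status.
        writer.close()
        try:
            if not reader.poll(timeout=60):
                raise RuntimeError("background process startup timed out")
            if reader.recv()["status"] != "READY":
                raise RuntimeError("background process initialization failed")
        except EOFError:
            raise RuntimeError(
                "background process died during startup") from None
        return proc

    if __name__ == "__main__":
        start_background_proc()

Keeping the status report on a plain Pipe (rather than a signal) lets the parent distinguish a clean FAILED report, a silent crash (EOFError), and a hang (poll timeout), which is the behaviour the wait_for_startup() helpers above rely on.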
From bf928540d16e13c7dde02a758031f9a14902d78a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 7 Jan 2025 01:07:58 +0000 Subject: [PATCH 049/130] stash --- vllm/v1/engine/async_llm.py | 10 ++++------ vllm/v1/engine/core_client.py | 28 ++++++++++++++++------------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index aaf4ca6ccaed..f15317498a20 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -41,7 +41,6 @@ def __init__( start_engine_loop: bool = True, ) -> None: - self.engine_core_errored = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -154,7 +153,7 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - if self.engine_core_errored: + if self.errored: raise EngineDeadError() # 1) Create a new output queue for the request. @@ -298,11 +297,10 @@ async def _run_output_handler(self): except Exception as e: logger.error("AsyncLLM output_handler got an Exception:", exc_info=e) - self._set_errored_and_propagate() + self._propagate_error() - def _set_errored_and_propagate(self): + def _propagate_error(self): """Propagate to all generate() tasks.""" - self.engine_core_errored = True # Put EngineDeadError() into each generate() task's queue, # each of which will raise it in their own context. @@ -377,7 +375,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core_errored + return self.engine_core.engine_core_errored @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b446bac2e701..dcbb8eb80fbf 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -252,19 +252,14 @@ def __init__(self, log_stats: bool = False): # EngineCore sends SIGUSR1 when it gets an Exception. - def sigusr1_handler_asyncio(): - self._sigusr1_handler() - - asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, - sigusr1_handler_asyncio) - - # super().__init__ blocks the event loop until background - # procs are setup. This handler allows us to catch issues - # during startup. + # NOTE: super().__init__ blocks the event loop until + # background procs are setup. This handler allows us + # to catch issues during startup (e.g. OOM). We switch + # to a signal handler in the event loop __init__. def sigusr1_handler(signum, frame): self._sigusr1_handler() - signal.signal(signal.SIGUSR1, sigusr1_handler) + # signal.signal(signal.SIGUSR1, sigusr1_handler) # Initialize EngineCore + all background processes. super().__init__( @@ -274,8 +269,17 @@ def sigusr1_handler(signum, frame): log_stats=log_stats, ) - # Remove the non-asyncio handler. - signal.signal(signal.SIGUSR1, signal.SIG_DFL) + # Unregister the hander in the main trhead, + # signal.signal(signal.SIGUSR1, signal.SIG_DFL) + + # NOTE TO SELF: putting this in AsyncMPClient is causing issues + # where the AsyncLLM is not triggering shutdown since the Excpections + # are not being raised. TODO: move it back to AsyncLLM. 
+ def sigusr1_handler_asyncio(): + self._sigusr1_handler() + + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler_asyncio) async def get_output_async(self) -> List[EngineCoreOutput]: try: From 6b4fe88fa065ccd997b4d4ceb3689c054fe948a6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:43:42 +0000 Subject: [PATCH 050/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 14 ++----------- vllm/v1/engine/core_client.py | 33 +++++++++++++++++++++++++----- vllm/v1/engine/output_processor.py | 7 +++++++ 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5307df870f65..f318af5fa833 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -236,10 +236,8 @@ async def generate( logger.info("Request %s aborted.", request_id) raise - # EngineCore or output_handler pushed error. + # Engine is dead. Do not abort since we shut down. except EngineDeadError: - # NOTE: we do not abort, since the EngineCore is dead - # and we will shut down anyways (unrecoverable). if self.log_requests: logger.info("Request %s failed.", request_id) raise @@ -299,15 +297,7 @@ async def _run_output_handler(self): except Exception as e: logger.error("AsyncLLM output_handler got an Exception:", exc_info=e) - self._propagate_error() - - def _propagate_error(self): - """Propagate to all generate() tasks.""" - - # Put EngineDeadError() into each generate() task's queue, - # each of which will raise it in their own context. - for _, q in self.rid_to_queue.items(): - q.put_nowait(EngineDeadError()) + self.output_processor.propagate_error(EngineDeadError()) async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 342dd4bbd33c..e28696e150fa 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -4,7 +4,7 @@ import signal import weakref from abc import ABC, abstractmethod -from typing import List, Optional, Type +from typing import List, Optional, Type, Union import zmq import zmq.asyncio @@ -286,13 +286,36 @@ def sigusr1_handler(signum, frame): ) self.queue_task: Optional[asyncio.Task] = None + self.outputs_queue: Optional[asyncio.Queue[Union[EngineCoreOutputs, + Exception]]] = None - async def get_output_async(self) -> EngineCoreOutputs: + async def _process_outputs_socket_loop(self): + """ + ZMQ IO background loop. This helps performance because + ZMQ IO releases the GIL so we can overlap with output_handler_loop. 
+ """ + + assert self.outputs_queue is not None try: - (frame, ) = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frame.buffer) + while True: + (frame, ) = await self.output_socket.recv_multipart(copy=False) + outputs = self.decoder.decode(frame.buffer) + self.outputs_queue.put_nowait(outputs) except Exception as e: - raise self._format_exception(e) from None + self.outputs_queue.put_nowait(e) + + async def get_output_async(self) -> EngineCoreOutputs: + + if self.outputs_queue is None: + self.outputs_queue = asyncio.Queue() + self.queue_task = asyncio.create_task( + self._process_outputs_socket_loop()) + + outputs = await self.outputs_queue.get() + if isinstance(outputs, Exception): + raise self._format_exception(outputs) from None + + return outputs async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 5dbf530caa17..69ff2fa2a802 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -93,6 +93,13 @@ def get_num_unfinished_requests(self): def has_unfinished_requests(self) -> bool: return len(self.request_states) > 0 + def propagate_error(self, e: Exception): + """Propagate error to all generate() tasks.""" + + for _, state in self.request_states.items(): + assert state.queue is not None + state.queue.put_nowait(e) + def abort_requests( self, request_ids: List[str], From efe85ee8ed687d0d7631a08f2ea4edfc30a02eac Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:45:03 +0000 Subject: [PATCH 051/130] updared Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/exceptions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 5313c1e0943c..34ec1f6b0cd0 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,5 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 class EngineGenerateError(Exception): - """Raised when a AsyncLLM.generate() fails. Maybe recoverable.""" + """Raised when a AsyncLLM.generate() fails. Recoverable.""" pass From 619579554c5f4daa27ca17b95de4d22c59669aec Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:47:58 +0000 Subject: [PATCH 052/130] fix typo Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 11 ++++++++--- vllm/v1/engine/core_client.py | 1 - 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f318af5fa833..70cdefc31196 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - import asyncio from typing import AsyncGenerator, List, Mapping, Optional, Type, Union @@ -83,8 +82,7 @@ def __init__( executor_class=executor_class, ) - # Output handler background task. - self.output_handler = asyncio.create_task(self._run_output_handler()) + self.output_handler: Optional[asyncio.Task] = None @classmethod def from_engine_args( @@ -193,6 +191,13 @@ async def generate( """ try: + # We start the output_handler on the first call to generate() so + # we can call __init__ before the event loop, which enables us + # to handle startup failure gracefully in the OpenAI server. 
+ if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) + q = await self.add_request( request_id, prompt, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e28696e150fa..40c06d024f3b 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - import asyncio import signal import weakref From 0b2558695973886a45d6e17ebb755169a3e01fe1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:58:03 +0000 Subject: [PATCH 053/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 40c06d024f3b..2fe1873b6440 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -284,17 +284,19 @@ def sigusr1_handler(signum, frame): log_stats=True, ) + # ZMQ IO. Run it in background task so that we can + # overlap with AsyncLLM.output_handler_loop. This + # works because ZMQ IO releases the GIL. self.queue_task: Optional[asyncio.Task] = None - self.outputs_queue: Optional[asyncio.Queue[Union[EngineCoreOutputs, - Exception]]] = None + self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, + Exception]] = asyncio.Queue() - async def _process_outputs_socket_loop(self): - """ - ZMQ IO background loop. This helps performance because - ZMQ IO releases the GIL so we can overlap with output_handler_loop. - """ + def shutdown(self): + super().shutdown() + if queue_task := getattr(self, "queue_task", None): + queue_task.cancel() - assert self.outputs_queue is not None + async def _process_outputs_socket_loop(self): try: while True: (frame, ) = await self.output_socket.recv_multipart(copy=False) @@ -305,11 +307,13 @@ async def _process_outputs_socket_loop(self): async def get_output_async(self) -> EngineCoreOutputs: - if self.outputs_queue is None: - self.outputs_queue = asyncio.Queue() + # Start output loop on the first call. + if self.queue_task is None: self.queue_task = asyncio.create_task( self._process_outputs_socket_loop()) + # NOTE: if an exception arises processing the socket, + # the exception is forwarded to the queue. 
outputs = await self.outputs_queue.get() if isinstance(outputs, Exception): raise self._format_exception(outputs) from None From 0b77b795bf9abc1d9035715f8cbee0473fe37000 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 13:50:23 +0000 Subject: [PATCH 054/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 116 ++++++++++++------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 2bf094e9d726..ed02e52474ad 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -9,12 +9,12 @@ from dataclasses import dataclass from enum import Enum, auto from functools import partial +from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess from typing import Any, Callable, Dict, List, Optional, Tuple, Union import cloudpickle import psutil -import zmq from vllm.config import VllmConfig from vllm.distributed import (destroy_distributed_environment, @@ -26,7 +26,7 @@ _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger from vllm.utils import (get_distributed_init_method, get_mp_context, - get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) + get_open_port) from vllm.v1.executor.abstract import Executor from vllm.worker.worker_base import WorkerWrapperBase @@ -191,7 +191,7 @@ def check_health(self) -> None: class WorkerProcHandle: proc: BaseProcess rank: int - ready_path: str + ready_pipe: Connection worker_response_mq: MessageQueue # The worker process writes to this MQ @@ -207,44 +207,45 @@ def __init__( rank: int, distributed_init_method: str, input_shm_handle: Handle, - ready_path: str, + ready_pipe: Connection, ): - self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) - # TODO: move `init_worker` to executor level as a collective rpc call - all_kwargs: List[Dict] = [ - {} for _ in range(vllm_config.parallel_config.world_size) - ] - all_kwargs[rank] = { - "vllm_config": vllm_config, - "local_rank": local_rank, - "rank": rank, - "distributed_init_method": distributed_init_method, - } - wrapper.init_worker(all_kwargs) - self.worker = wrapper.worker - - pid = os.getpid() - _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) - _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) - - # Initialize MessageQueue for receiving SchedulerOutput - self.rpc_broadcast_mq = MessageQueue.create_from_handle( - input_shm_handle, self.worker.rank) - - # Initializes a message queue for sending the model output - self.worker_response_mq = MessageQueue(1, 1) - worker_response_mq_handle = self.worker_response_mq.export_handle() - - # Send Readiness signal to EngineCore process. 
- with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: - payload = pickle.dumps(worker_response_mq_handle, - protocol=pickle.HIGHEST_PROTOCOL) - ready_socket.send_string(WorkerProc.READY_STR) - ready_socket.send(payload) + try: + self.rank = rank + wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) + # TODO: move `init_worker` to executor as a collective rpc call + all_kwargs: List[Dict] = [ + {} for _ in range(vllm_config.parallel_config.world_size) + ] + all_kwargs[rank] = { + "vllm_config": vllm_config, + "local_rank": local_rank, + "rank": rank, + "distributed_init_method": distributed_init_method, + } + wrapper.init_worker(all_kwargs) + self.worker = wrapper.worker + + pid = os.getpid() + _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) + _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) + + # Initialize MessageQueue for receiving SchedulerOutput + self.rpc_broadcast_mq = MessageQueue.create_from_handle( + input_shm_handle, self.worker.rank) + + # Initializes a message queue for sending the model output + self.worker_response_mq = MessageQueue(1, 1) + worker_response_mq_handle = self.worker_response_mq.export_handle() + + self.worker.init_device() + self.worker.load_model() + + # Send Readiness signal to Executor. + ready_pipe.send({"status": "READY"}) - self.worker.init_device() - self.worker.load_model() + except Exception as e: + logger.exception("WorkerProc got error at startup:", exc_info=e) + ready_pipe.send({"status": "FAILED"}) @staticmethod def make_worker_process( @@ -255,10 +256,7 @@ def make_worker_process( input_shm_handle, # Receive SchedulerOutput ) -> WorkerProcHandle: context = get_mp_context() - - # ZMQ path for worker to send ready message and shm_broadcast handle - # back to core process. - ready_path = get_open_zmq_ipc_path() + reader, writer = context.Pipe(duplex=False) process_kwargs = { "vllm_config": vllm_config, @@ -266,7 +264,7 @@ def make_worker_process( "rank": rank, "distributed_init_method": distributed_init_method, "input_shm_handle": input_shm_handle, - "ready_path": ready_path, + "ready_pipe": writer, } # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, @@ -275,13 +273,12 @@ def make_worker_process( proc.start() # Wait for startup - worker_response_mq_handle = WorkerProc.wait_for_startup( - proc, ready_path) + worker_response_mq_handle = WorkerProc.wait_for_startup(proc, reader) worker_response_mq = MessageQueue.create_from_handle( worker_response_mq_handle, 0) - return WorkerProcHandle(proc, rank, ready_path, worker_response_mq) + return WorkerProcHandle(proc, rank, reader, worker_response_mq) def shutdown(self): self.rpc_broadcast_mq = None @@ -346,24 +343,21 @@ def signal_handler(signum, frame): @staticmethod def wait_for_startup( - proc: BaseProcess, - ready_path: str, + process_name: str, + reader: Connection, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: - - # Wait for Worker to send READY. - while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.info("Waiting for WorkerProc to startup.") - if not proc.is_alive(): - raise RuntimeError("WorkerProc failed to start.") + e = Exception(f"{process_name} initialization failed due to " + "an exception in a background process. 
See stack trace " + "for root cause.") - message = socket.recv_string() - assert message == WorkerProc.READY_STR - handle_frame = socket.recv(copy=False) - handle = pickle.loads(handle_frame.buffer) - return handle + try: + if reader.recv()["status"] != "READY": + raise e + except EOFError: + e.__suppress_context__ = True + raise e from None class ResponseStatus(Enum): SUCCESS = auto() From 61f3dd7c2bd8549448a53e5128f65c87200803e5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 14:15:50 +0000 Subject: [PATCH 055/130] stash Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 30 +++++++++++++++----------- vllm/v1/worker/gpu_worker.py | 5 +++++ vllm/worker/worker.py | 3 +++ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ed02e52474ad..e8078f8de96a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -163,15 +163,6 @@ def wait_for_termination(procs, timeout): for p in active_procs: p.kill() - self._cleanup_sockets() - - def _cleanup_sockets(self): - for w in self.workers: - # Remove the zmq ipc socket file - socket_path = w.ready_path.replace("ipc://", "") - if os and os.path.exists(socket_path): - os.remove(socket_path) - def shutdown(self): """Properly shut down the executor and its workers""" if not getattr(self, 'shutting_down', False): @@ -237,11 +228,20 @@ def __init__( self.worker_response_mq = MessageQueue(1, 1) worker_response_mq_handle = self.worker_response_mq.export_handle() + # Load model before we send readiness signal, such that + # we can catch any errors. + print("ABOUT TO INIT DEVICE") self.worker.init_device() + print("ABOUT TO LOAD MODEL") self.worker.load_model() + print("SENDING TO READINESS PIPE") # Send Readiness signal to Executor. - ready_pipe.send({"status": "READY"}) + ready_pipe.send({ + "status": "READY", + "handle": pickle.dumps(worker_response_mq_handle) + }) + print("SENT TO READINESS PIPE") except Exception as e: logger.exception("WorkerProc got error at startup:", exc_info=e) @@ -345,7 +345,7 @@ def signal_handler(signum, frame): def wait_for_startup( process_name: str, reader: Connection, - ) -> Optional[Handle]: + ) -> WorkerProcHandle: """Wait until the Worker is ready.""" e = Exception(f"{process_name} initialization failed due to " @@ -353,8 +353,14 @@ def wait_for_startup( "for root cause.") try: - if reader.recv()["status"] != "READY": + response = reader.recv() + if getattr(response, "status", None) != "READY": raise e + assert hasattr(response, "handle") + handle = pickle.loads(response["handle"]) + assert isinstance(handle, WorkerProcHandle) + return handle + except EOFError: e.__suppress_context__ = True raise e from None diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0adb69073397..985f86bd3e26 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -98,6 +98,7 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self): + print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -119,15 +120,19 @@ def init_device(self): else: raise RuntimeError( f"Not support device type: {self.device_config.device}") + print("init_worker_distributed_environment") # Initialize the distributed environment. 
init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. + print("set_random_seed") set_random_seed(self.model_config.seed) # Construct the model runner + print("model_runner") self.model_runner = GPUModelRunner(self.vllm_config, self.device) + print("done") def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 582aa460eb4f..c1f06175901a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -140,6 +140,7 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self) -> None: + print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -162,11 +163,13 @@ def init_device(self) -> None: else: raise RuntimeError( f"Not support device type: {self.device_config.device}") + print("init_worker_dist_enviornment") # Initialize the distributed environment. init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. + print("set_random_seed") set_random_seed(self.model_config.seed) def load_model(self): From fbf19ad656900551e41c00d2f0f2a044aa27dad6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 14:16:43 +0000 Subject: [PATCH 056/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/worker/worker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c1f06175901a..582aa460eb4f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -140,7 +140,6 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self) -> None: - print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -163,13 +162,11 @@ def init_device(self) -> None: else: raise RuntimeError( f"Not support device type: {self.device_config.device}") - print("init_worker_dist_enviornment") # Initialize the distributed environment. init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. - print("set_random_seed") set_random_seed(self.model_config.seed) def load_model(self): From d25ce5ce58670183e2cd4347d76030bb5ce85219 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:03:21 +0000 Subject: [PATCH 057/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 134 +++++++++++++------------ 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index e8078f8de96a..d36a4d57153f 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -21,7 +21,6 @@ destroy_model_parallel) from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) -from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING from vllm.executor.multiproc_worker_utils import ( _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger @@ -43,22 +42,6 @@ def _init_executor(self) -> None: # and ensure workers will be terminated. 
self._finalizer = weakref.finalize(self, self.shutdown) - # The child processes will send SIGUSR1 when unrecoverable - # errors happen. - def sigusr1_handler(signum, frame): - logger.fatal( - "MulitprocExecutor got fatal signal from worker processes, " - "shutting down. See stack trace above for root cause issue.") - # Shutdown first (avoid SysExit exceptions in __del__). - self.shutdown() - if VLLM_ENABLE_V1_MULTIPROCESSING: - # TODO(rob): move this to the VLLMConfig. - # Propagate up if using the mp engine. Note that - # sending in non-mp mode crashes caller process. - psutil.Process().parent().send_signal(signal.SIGUSR1) - - signal.signal(signal.SIGUSR1, sigusr1_handler) - self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size assert self.world_size == tensor_parallel_size, ( @@ -81,12 +64,25 @@ def sigusr1_handler(signum, frame): scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers - self.workers: List[WorkerProcHandle] = [] + unready_workers: List[UnreadyWorkerProcHandle] = [] for rank in range(self.world_size): - worker = WorkerProc.make_worker_process(self.vllm_config, rank, - rank, - distributed_init_method, - scheduler_output_handle) + unready_worker = WorkerProc.make_worker_process( + vllm_config=self.vllm_config, + local_rank=rank, + rank=rank, + distributed_init_method=distributed_init_method, + input_shm_handle=scheduler_output_handle, + ) + unready_workers.append(unready_worker) + + # All workers are created before wait_for_ready, since + # initialization calls self.init_device(), which does a sync. + self.workers: List[WorkerProcHandle] = [] + for unready_worker in unready_workers: + # NOTE: the WorkerProc wraps startup in a try ... catch + # so if there are any issues in loading in a WorkerProcess + # (e.g. OOM), an Exception will be raised here. + worker = WorkerProc.wait_for_ready(unready_worker) self.workers.append(worker) # Ensure message queues are ready. Will deadlock if re-ordered @@ -178,13 +174,30 @@ def check_health(self) -> None: return +@dataclass +class UnreadyWorkerProcHandle: + """WorkerProcess handle before READY.""" + proc: BaseProcess + rank: int + ready_pipe: Tuple[Connection, Connection] + + @dataclass class WorkerProcHandle: proc: BaseProcess rank: int - ready_pipe: Connection worker_response_mq: MessageQueue # The worker process writes to this MQ + @classmethod + def from_unready_handle( + cls, unready_handle: UnreadyWorkerProcHandle, + worker_response_mq: MessageQueue) -> "WorkerProcHandle": + return cls( + proc=unready_handle.proc, + rank=unready_handle.rank, + worker_response_mq=worker_response_mq, + ) + class WorkerProc: """Wrapper that runs one Worker in a separate process.""" @@ -228,20 +241,15 @@ def __init__( self.worker_response_mq = MessageQueue(1, 1) worker_response_mq_handle = self.worker_response_mq.export_handle() - # Load model before we send readiness signal, such that - # we can catch any errors. - print("ABOUT TO INIT DEVICE") + # Initialize device and loads weights self.worker.init_device() - print("ABOUT TO LOAD MODEL") self.worker.load_model() - print("SENDING TO READINESS PIPE") - # Send Readiness signal to Executor. 
+ # Send READY once we know everything is loaded ready_pipe.send({ "status": "READY", "handle": pickle.dumps(worker_response_mq_handle) }) - print("SENT TO READINESS PIPE") except Exception as e: logger.exception("WorkerProc got error at startup:", exc_info=e) @@ -254,9 +262,10 @@ def make_worker_process( rank: int, distributed_init_method: str, input_shm_handle, # Receive SchedulerOutput - ) -> WorkerProcHandle: + ) -> UnreadyWorkerProcHandle: context = get_mp_context() - reader, writer = context.Pipe(duplex=False) + # (reader, writer) + pipe_tuple = context.Pipe(duplex=False) process_kwargs = { "vllm_config": vllm_config, @@ -264,7 +273,7 @@ def make_worker_process( "rank": rank, "distributed_init_method": distributed_init_method, "input_shm_handle": input_shm_handle, - "ready_pipe": writer, + "ready_pipe": pipe_tuple[1], } # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, @@ -272,13 +281,38 @@ def make_worker_process( daemon=True) proc.start() - # Wait for startup - worker_response_mq_handle = WorkerProc.wait_for_startup(proc, reader) + return UnreadyWorkerProcHandle(proc, rank, pipe_tuple) - worker_response_mq = MessageQueue.create_from_handle( - worker_response_mq_handle, 0) + @staticmethod + def wait_for_ready( + unready_proc_handle: UnreadyWorkerProcHandle) -> WorkerProcHandle: + + e = Exception("WorkerProc initialization failed due to " + "an exception in a background process. " + "See stack trace for root cause.") + + ready_pipe = unready_proc_handle.ready_pipe[0] + try: + response = ready_pipe.recv() + if getattr(response, "status", None) != "READY_TO_LOAD": + raise e + + assert hasattr(response, "handle") + mq_handle = pickle.loads(response["handle"]) + assert isinstance(mq_handle, Handle) - return WorkerProcHandle(proc, rank, reader, worker_response_mq) + worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) + return WorkerProcHandle.from_unready_handle( + unready_proc_handle, worker_response_mq) + + except EOFError: + e.__suppress_context__ = True + raise e from None + + finally: + # Close connection. + unready_proc_handle.ready_pipe[0].close() + unready_proc_handle.ready_pipe[1].close() def shutdown(self): self.rpc_broadcast_mq = None @@ -341,30 +375,6 @@ def signal_handler(signum, frame): worker.shutdown() worker = None - @staticmethod - def wait_for_startup( - process_name: str, - reader: Connection, - ) -> WorkerProcHandle: - """Wait until the Worker is ready.""" - - e = Exception(f"{process_name} initialization failed due to " - "an exception in a background process. 
See stack trace " - "for root cause.") - - try: - response = reader.recv() - if getattr(response, "status", None) != "READY": - raise e - assert hasattr(response, "handle") - handle = pickle.loads(response["handle"]) - assert isinstance(handle, WorkerProcHandle) - return handle - - except EOFError: - e.__suppress_context__ = True - raise e from None - class ResponseStatus(Enum): SUCCESS = auto() FAILURE = auto() From 23342d76fc77e4f2d46eb58bdac13c8d5d9cec65 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:14:19 +0000 Subject: [PATCH 058/130] remove signal handler Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 24 +++++++++++------------- vllm/v1/worker/gpu_worker.py | 5 ----- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d36a4d57153f..02b503fae7d5 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import cloudpickle -import psutil from vllm.config import VllmConfig from vllm.distributed import (destroy_distributed_environment, @@ -355,20 +354,19 @@ def signal_handler(signum, frame): logger.debug("Worker interrupted.") except Exception as e: - # Log rather than raise so the stack trace is in order of - # WorkerProc -> EngineCore -> AsyncLLM. + # NOTE: if an Exception arises in busy_loop, we send + # a FAILURE message over the MQ RPC to notify the Executor, + # which triggers system shutdown. + # TODO(rob): handle case where the MQ itself breaks. + + # Log so stack trace order is: Worker -> EngineCore -> AsyncLLM logger.exception("WorkerProc got an Exception:", exc_info=e) - # The parent will send a SIGTERM to all worker processes - # after we send SIGUSR. Set this value so we don't re-throw - # SystemExit(), to avoid zmq exceptions during __del__. + # The parent sends a SIGTERM to all worker processes if + # any worker dies. Set this value so we don't re-throw + # SystemExit() to avoid zmq exceptions in __del__. shutdown_requested = True - # worker_busy_loop sends exceptions exceptons to Executor - # for shutdown, but if there is an error in startup or an - # error with IPC itself, we need to alert the parent. - psutil.Process().parent().send_signal(signal.SIGUSR1) - finally: # Clean up once worker exits busy loop if worker is not None: @@ -381,9 +379,9 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" - while True: - method, args, kwargs = self.rpc_broadcast_mq.dequeue() + method, args, kwargs = self.rpc_broadcast_mq.dequeue() + while True: try: if isinstance(method, str): func = getattr(self.worker, method) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 985f86bd3e26..0adb69073397 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -98,7 +98,6 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self): - print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -120,19 +119,15 @@ def init_device(self): else: raise RuntimeError( f"Not support device type: {self.device_config.device}") - print("init_worker_distributed_environment") # Initialize the distributed environment. 
init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. - print("set_random_seed") set_random_seed(self.model_config.seed) # Construct the model runner - print("model_runner") self.model_runner = GPUModelRunner(self.vllm_config, self.device) - print("done") def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: From ebdf8f90b85d5b5577d87c0336484b6e7b1d169f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:18:23 +0000 Subject: [PATCH 059/130] remove signal handler Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 02b503fae7d5..d6edab688a28 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -254,6 +254,9 @@ def __init__( logger.exception("WorkerProc got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) + finally: + ready_pipe.close() + @staticmethod def make_worker_process( vllm_config: VllmConfig, From 6a37020fcb275bd64da7ed009cf6e7bd43b7a1d2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:20:02 +0000 Subject: [PATCH 060/130] update comment Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d6edab688a28..5a5c29a3eb82 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -74,8 +74,8 @@ def _init_executor(self) -> None: ) unready_workers.append(unready_worker) - # All workers are created before wait_for_ready, since - # initialization calls self.init_device(), which does a sync. + # Workers must be created before wait_for_ready to avoid + # deadlock, since worker.init_device() does a device sync. self.workers: List[WorkerProcHandle] = [] for unready_worker in unready_workers: # NOTE: the WorkerProc wraps startup in a try ... 
catch From 2ed3349de296e22f57da8e2af5137f4f0c0da42d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 16:53:51 +0000 Subject: [PATCH 061/130] avoid sigusr1 Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/__init__.py | 7 ++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 54 +++++++++++++++++--------- vllm/v1/engine/core_client.py | 43 ++++++-------------- vllm/v1/executor/multiproc_executor.py | 1 - 5 files changed, 55 insertions(+), 52 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index b05ef3cc8c74..0b44baf4828d 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -93,6 +93,13 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] scheduler_stats: SchedulerStats + @classmethod + def make_empty(cls): + cls(outputs=[], scheduler_stats=SchedulerStats()) + + +ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' + @dataclass class EngineCoreProfile: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 70cdefc31196..d80dedae5d85 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -384,7 +384,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core.engine_core_errored + return self.engine_core.is_engine_dead @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0bf0ecb9112d..170645249d73 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -6,9 +6,8 @@ import threading import time from multiprocessing.connection import Connection -from typing import List, Tuple, Type +from typing import List, Tuple, Type, Union -import psutil import zmq import zmq.asyncio @@ -16,10 +15,10 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import get_exception_traceback, zmq_socket_ctx +from vllm.utils import zmq_socket_ctx from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion, EngineCoreResetPrefixCache) @@ -132,7 +131,8 @@ def step(self) -> EngineCoreOutputs: return engine_core_outputs def shutdown(self): - self.model_executor.shutdown() + if model_executor := getattr(self, "model_executor", None): + model_executor.shutdown() def profile(self, is_start: bool = True): self.model_executor.profile(is_start) @@ -165,8 +165,8 @@ def __init__( # Threads handle Socket <-> Queues and core_busy_loop uses Queue. self.input_queue: queue.Queue[ EngineCoreRequestUnion] = queue.Queue() - self.output_queue: queue.Queue[ - List[EngineCoreOutput]] = queue.Queue() + self.output_queue: queue.Queue[Union[ + bytes, EngineCoreOutputs]] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -174,12 +174,16 @@ def __init__( args=(output_path, ), daemon=True).start() + self.errored_sent_event = threading.Event() + # Send Readiness signal to EngineClient. 
ready_pipe.send({"status": "READY"}) except Exception as e: logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) + finally: + ready_pipe.close() @staticmethod def run_engine_core(*args, **kwargs): @@ -203,20 +207,12 @@ def signal_handler(signum, frame): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) - parent_process = psutil.Process().parent() - engine_core = None + engine_core = EngineCoreProc(*args, **kwargs) try: - engine_core = EngineCoreProc(*args, **kwargs) engine_core.run_busy_loop() - - except SystemExit: - logger.debug("EngineCore interrupted.") - - except Exception: - traceback = get_exception_traceback() - logger.error("EngineCore hit an exception: %s", traceback) - parent_process.send_signal(signal.SIGUSR1) - + except Exception as e: + logger.exception("EngineCore got an Exception:", exc_info=e) + engine_core._send_engine_dead() finally: if engine_core is not None: engine_core.shutdown() @@ -266,6 +262,19 @@ def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: assert isinstance(request, list) self.abort_requests(request) + def _send_engine_dead(self): + """Send EngineDead status to the EngineCoreClient.""" + + # Put ENGINE_CORE_DEAD to the front of the queue. + with self.output_queue.mutex: + self.output_queue.queue.clear() + self.output_queue.put_nowait(ENGINE_CORE_DEAD) + + # Wait until msg sent by the daemon before shutdown. + if not self.errored_sent_event.wait(timeout=10): + logger.fatal("vLLM shutdown signal from EngineCore failed " + "to send. Please report this issue.") + def process_input_socket(self, input_path: str): """Input socket IO thread.""" @@ -306,5 +315,12 @@ def process_output_socket(self, output_path: str): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: outputs = self.output_queue.get() + if outputs == ENGINE_CORE_DEAD: + socket.send_multipart((ENGINE_CORE_DEAD, ), copy=False) + break + encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) + + # Signal to main thread that ENGINE_CORE_DEAD was sent. + self.errored_sent_event.set() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 2fe1873b6440..51b27082dfc4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -import signal import weakref from abc import ABC, abstractmethod from typing import List, Optional, Type, Union @@ -11,9 +10,10 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket -from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, - EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion, EngineCoreResetPrefixCache) +from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, + EngineCoreProfile, EngineCoreRequest, + EngineCoreRequestType, EngineCoreRequestUnion, + EngineCoreResetPrefixCache) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor @@ -167,7 +167,7 @@ def __init__( zmq.constants.PUSH) # Start EngineCore in background process. 
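# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): the
# in-band "dead" sentinel that replaces the earlier SIGUSR1 signalling. The
# busy loop catches any exception and enqueues a sentinel instead of
# signalling the parent; the consumer turns the sentinel into a dedicated
# exception type.
import queue
import threading

TOY_DEAD_SENTINEL = b"ENGINE_CORE_DEAD"


class ToyEngineDeadError(Exception):
    pass


def _toy_busy_loop(out_q: queue.Queue) -> None:
    try:
        for step in range(3):
            out_q.put(f"outputs for step {step}")
        raise RuntimeError("simulated failure inside the core loop")
    except Exception:
        out_q.put(TOY_DEAD_SENTINEL)  # report death in-band, no signals


if __name__ == "__main__":
    q: queue.Queue = queue.Queue()
    threading.Thread(target=_toy_busy_loop, args=(q, ), daemon=True).start()
    try:
        while True:
            item = q.get()
            if item == TOY_DEAD_SENTINEL:
                raise ToyEngineDeadError("core reported that it is dead")
            print(item)
    except ToyEngineDeadError as err:
        print(f"client observed: {err!r}")
# ---------------------------------------------------------------------------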
- self.engine_core_errored = False + self.is_engine_dead = False self.proc_handle = BackgroundProcHandle( input_path=input_path, output_path=output_path, @@ -186,22 +186,13 @@ def shutdown(self): self.proc_handle.shutdown() self._finalizer() - def _sigusr1_handler(self): - """ - EngineCoreProc sends SIGUSR1 if it encounters an Exception. - Set self in errored state and begin shutdown. - """ - logger.fatal("Got fatal signal from EngineCore, shutting down.") - self.engine_core_errored = True - self.shutdown() - def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" return (EngineDeadError( "EngineCore encountered an issue. See stack trace " "for the root cause.", - suppress_context=True) if self.engine_core_errored else e) + suppress_context=True) if self.is_engine_dead else e) class SyncMPClient(MPClient): @@ -210,12 +201,6 @@ class SyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - # Setup EngineCore signal handler. - def sigusr1_handler(signum, frame): - self._sigusr1_handler() - - signal.signal(signal.SIGUSR1, sigusr1_handler) - super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -227,6 +212,9 @@ def get_output(self) -> EngineCoreOutputs: try: (frame, ) = self.output_socket.recv_multipart(copy=False) + if frame == ENGINE_CORE_DEAD: + self.is_engine_dead = True + raise EngineDeadError engine_core_outputs = self.decoder.decode(frame.buffer) return engine_core_outputs except Exception as e: @@ -266,16 +254,6 @@ class AsyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - # EngineCore sends SIGUSR1 when it gets an Exception. - # NOTE: super().__init__ blocks the event loop until - # background procs are setup. This handler allows us - # to catch issues during startup (e.g. OOM). We switch - # to a signal handler in the event loop __init__. - def sigusr1_handler(signum, frame): - self._sigusr1_handler() - - # signal.signal(signal.SIGUSR1, sigusr1_handler) - # Initialize EngineCore + all background processes. super().__init__( asyncio_mode=True, @@ -300,6 +278,9 @@ async def _process_outputs_socket_loop(self): try: while True: (frame, ) = await self.output_socket.recv_multipart(copy=False) + if frame == ENGINE_CORE_DEAD: + self.is_engine_dead = True + raise EngineDeadError outputs = self.decoder.decode(frame.buffer) self.outputs_queue.put_nowait(outputs) except Exception as e: diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 5a5c29a3eb82..1daf71830c00 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -362,7 +362,6 @@ def signal_handler(signum, frame): # which triggers system shutdown. # TODO(rob): handle case where the MQ itself breaks. 
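# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): the
# (status, payload) reply convention used by the worker busy loop, so that a
# per-call failure travels back over the response queue rather than killing
# the worker or signalling the parent process.
import queue
from enum import Enum, auto


class ToyStatus(Enum):
    SUCCESS = auto()
    FAILURE = auto()


def toy_worker_step(method, response_q: queue.Queue) -> None:
    try:
        result = method()
    except Exception as err:
        response_q.put((ToyStatus.FAILURE, err))  # parent decides what to do
        return
    response_q.put((ToyStatus.SUCCESS, result))


if __name__ == "__main__":
    q: queue.Queue = queue.Queue()
    toy_worker_step(lambda: 40 + 2, q)
    toy_worker_step(lambda: 1 / 0, q)
    for _ in range(2):
        status, payload = q.get()
        print(status.name, payload)
# ---------------------------------------------------------------------------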
- # Log so stack trace order is: Worker -> EngineCore -> AsyncLLM logger.exception("WorkerProc got an Exception:", exc_info=e) # The parent sends a SIGTERM to all worker processes if From f9ef3d811b6a9a3411bee188f7bbe72002156fca Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:24:52 +0000 Subject: [PATCH 062/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/__init__.py | 4 -- vllm/v1/engine/core.py | 6 +++ vllm/v1/engine/core_client.py | 54 ++++++++++++-------------- vllm/v1/executor/multiproc_executor.py | 11 ++---- 4 files changed, 35 insertions(+), 40 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0b44baf4828d..bfba0c7f6a0a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -93,10 +93,6 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] scheduler_stats: SchedulerStats - @classmethod - def make_empty(cls): - cls(outputs=[], scheduler_stats=SchedulerStats()) - ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 170645249d73..2f0351ca5688 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -174,6 +174,10 @@ def __init__( args=(output_path, ), daemon=True).start() + # Signal from process_output_socket that EngineDead + # message was sent. Since process_output_socket is a + # daemon thread, we need to ensure this message is + # sent before we exit from the main thread. self.errored_sent_event = threading.Event() # Send Readiness signal to EngineClient. @@ -182,6 +186,8 @@ def __init__( except Exception as e: logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) + raise e + finally: ready_pipe.close() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 51b27082dfc4..8a813463761a 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,7 +2,7 @@ import asyncio import weakref from abc import ABC, abstractmethod -from typing import List, Optional, Type, Union +from typing import Any, List, Optional, Type, Union import zmq import zmq.asyncio @@ -186,6 +186,11 @@ def shutdown(self): self.proc_handle.shutdown() self._finalizer() + def _validate_alive(self, frame: Any): + if frame == ENGINE_CORE_DEAD: + self.is_engine_dead = True + raise EngineDeadError + def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" @@ -212,11 +217,8 @@ def get_output(self) -> EngineCoreOutputs: try: (frame, ) = self.output_socket.recv_multipart(copy=False) - if frame == ENGINE_CORE_DEAD: - self.is_engine_dead = True - raise EngineDeadError - engine_core_outputs = self.decoder.decode(frame.buffer) - return engine_core_outputs + self._validate_alive(frame) + return self.decoder.decode(frame.buffer) except Exception as e: raise self._format_exception(e) from None @@ -253,8 +255,6 @@ class AsyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - - # Initialize EngineCore + all background processes. super().__init__( asyncio_mode=True, vllm_config=vllm_config, @@ -262,39 +262,35 @@ def __init__(self, vllm_config: VllmConfig, log_stats=True, ) - # ZMQ IO. Run it in background task so that we can - # overlap with AsyncLLM.output_handler_loop. This - # works because ZMQ IO releases the GIL. 
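# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, no ZMQ dependency): the
# shape used by the async client here -- a background task reads from the
# transport and forwards both results and exceptions through an asyncio.Queue,
# so the caller re-raises failures in its own task.
import asyncio


async def _toy_main() -> None:
    out_q: asyncio.Queue = asyncio.Queue()

    async def reader() -> None:
        try:
            for i in range(3):
                await asyncio.sleep(0)  # stand-in for awaiting a socket recv
                out_q.put_nowait(f"outputs {i}")
            raise ConnectionError("simulated transport failure")
        except Exception as err:
            out_q.put_nowait(err)  # forward instead of swallowing

    reader_task = asyncio.create_task(reader())
    while True:
        item = await out_q.get()
        if isinstance(item, Exception):
            print(f"consumer re-raises: {item!r}")
            break
        print(item)
    await reader_task  # already finished; its exception was handled inside


if __name__ == "__main__":
    asyncio.run(_toy_main())
# ---------------------------------------------------------------------------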
- self.queue_task: Optional[asyncio.Task] = None self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, Exception]] = asyncio.Queue() + self.queue_task: Optional[asyncio.Task] = None def shutdown(self): super().shutdown() if queue_task := getattr(self, "queue_task", None): queue_task.cancel() - async def _process_outputs_socket_loop(self): - try: - while True: - (frame, ) = await self.output_socket.recv_multipart(copy=False) - if frame == ENGINE_CORE_DEAD: - self.is_engine_dead = True - raise EngineDeadError - outputs = self.decoder.decode(frame.buffer) - self.outputs_queue.put_nowait(outputs) - except Exception as e: - self.outputs_queue.put_nowait(e) - async def get_output_async(self) -> EngineCoreOutputs: - # Start output loop on the first call. if self.queue_task is None: - self.queue_task = asyncio.create_task( - self._process_outputs_socket_loop()) - # NOTE: if an exception arises processing the socket, - # the exception is forwarded to the queue. + async def process_outputs_socket(): + try: + (frame, ) = await self.output_socket.recv_multipart( + copy=False) + self._validate_alive(frame) + self.outputs_queue.put_nowait(frame.buffer) + except Exception as e: + self.outputs_queue.put_nowait(e) + + # Run ZMQ IO (which releases the GIL) in a background task + # to overlap with this task (run_output_handler). + self.queue_task = asyncio.create_task(process_outputs_socket()) + + # If an exception arises in process_outputs_socket task, + # it is forwarded to the outputs_queue so we can raise it + # from this (run_output_handler) task to shut down the server. outputs = await self.outputs_queue.get() if isinstance(outputs, Exception): raise self._format_exception(outputs) from None diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 1daf71830c00..ce0522b81137 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -295,14 +295,14 @@ def wait_for_ready( ready_pipe = unready_proc_handle.ready_pipe[0] try: + # Wait until the WorkerProc is ready. response = ready_pipe.recv() - if getattr(response, "status", None) != "READY_TO_LOAD": + if response["status"] != "READY": raise e - assert hasattr(response, "handle") + # Extract the message queue handle. 
mq_handle = pickle.loads(response["handle"]) - assert isinstance(mq_handle, Handle) - + print(f"{mq_handle=}") worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) return WorkerProcHandle.from_unready_handle( unready_proc_handle, worker_response_mq) @@ -353,9 +353,6 @@ def signal_handler(signum, frame): worker.worker_busy_loop() - except SystemExit: - logger.debug("Worker interrupted.") - except Exception as e: # NOTE: if an Exception arises in busy_loop, we send # a FAILURE message over the MQ RPC to notify the Executor, From 95c249f76a516684a7ef3b5bb6114ef6aeb198f6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:25:37 +0000 Subject: [PATCH 063/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 8a813463761a..48afca8de536 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -272,7 +272,6 @@ def shutdown(self): queue_task.cancel() async def get_output_async(self) -> EngineCoreOutputs: - if self.queue_task is None: async def process_outputs_socket(): From 030c671efb8076ec28307e99c0cae893f7385040 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:25:58 +0000 Subject: [PATCH 064/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 48afca8de536..644c32213458 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -273,7 +273,8 @@ def shutdown(self): async def get_output_async(self) -> EngineCoreOutputs: if self.queue_task is None: - + # Run ZMQ IO (which releases the GIL) in a background task + # to overlap with this task (run_output_handler). async def process_outputs_socket(): try: (frame, ) = await self.output_socket.recv_multipart( @@ -283,8 +284,6 @@ async def process_outputs_socket(): except Exception as e: self.outputs_queue.put_nowait(e) - # Run ZMQ IO (which releases the GIL) in a background task - # to overlap with this task (run_output_handler). 
self.queue_task = asyncio.create_task(process_outputs_socket()) # If an exception arises in process_outputs_socket task, From 1bdb212d1ac80856f9fe31572f3d6b7665a6d058 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:26:50 +0000 Subject: [PATCH 065/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 644c32213458..fdd3e28b724e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -205,7 +205,6 @@ class SyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - super().__init__( asyncio_mode=False, vllm_config=vllm_config, From 25412a0a3f08716c91cff6f2d72a5026cd496f55 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:33:12 +0000 Subject: [PATCH 066/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 4 +--- vllm/v1/engine/exceptions.py | 5 ++++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index fdd3e28b724e..7e0dba5c4ec4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -189,14 +189,12 @@ def shutdown(self): def _validate_alive(self, frame: Any): if frame == ENGINE_CORE_DEAD: self.is_engine_dead = True - raise EngineDeadError + raise EngineDeadError() def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" return (EngineDeadError( - "EngineCore encountered an issue. See stack trace " - "for the root cause.", suppress_context=True) if self.is_engine_dead else e) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 34ec1f6b0cd0..ff74556cc160 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,7 +8,10 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - super().__init__(args, kwargs) + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. \ + See stack trace for the root cause issue." + + super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by # silencing irrelevant ZMQError. 
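# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): giving a
# custom exception a fixed default message and optionally suppressing the
# implicit "During handling of the above exception..." chaining, which is the
# purpose of the suppress_context flag in the diff above.
class ToyDeadError(Exception):

    def __init__(self, *args, suppress_context: bool = False, **kwargs):
        message = "Toy engine died. See stack trace for the root cause."
        super().__init__(message, *args, **kwargs)
        self.__suppress_context__ = suppress_context


if __name__ == "__main__":
    try:
        try:
            raise ValueError("irrelevant transport error")
        except ValueError:
            # If this propagated, the ValueError context would be hidden
            # from the printed traceback.
            raise ToyDeadError(suppress_context=True)
    except ToyDeadError as err:
        print(err, "| context suppressed:", err.__suppress_context__)
# ---------------------------------------------------------------------------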
self.__suppress_context__ = suppress_context From 7cf0647ab0352578c9ac08a265db4ea8e7ff1680 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:38:35 +0000 Subject: [PATCH 067/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/__init__.py | 3 --- vllm/v1/engine/core.py | 15 ++++++++------- vllm/v1/engine/core_client.py | 9 ++++----- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index bfba0c7f6a0a..b05ef3cc8c74 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -94,9 +94,6 @@ class EngineCoreOutputs( scheduler_stats: SchedulerStats -ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' - - @dataclass class EngineCoreProfile: is_start: bool diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2f0351ca5688..9dadc0eb5c19 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -18,10 +18,9 @@ from vllm.utils import zmq_socket_ctx from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion, - EngineCoreResetPrefixCache) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -144,6 +143,8 @@ def reset_prefix_cache(self): class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" + ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' + def __init__( self, input_path: str, @@ -274,7 +275,7 @@ def _send_engine_dead(self): # Put ENGINE_CORE_DEAD to the front of the queue. with self.output_queue.mutex: self.output_queue.queue.clear() - self.output_queue.put_nowait(ENGINE_CORE_DEAD) + self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. 
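# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): the role
# of the "sent" Event used here. The socket writer runs in a daemon thread, so
# the main thread waits for an explicit confirmation before exiting, otherwise
# the final dead-engine message could be dropped during process teardown.
import queue
import threading

_SENTINEL = b"DEAD"


def _toy_writer(out_q: queue.Queue, sent_event: threading.Event) -> None:
    while True:
        msg = out_q.get()
        if msg == _SENTINEL:
            print("writer: flushed dead-engine message")
            sent_event.set()  # confirm delivery before the process exits
            return
        print(f"writer: sent {msg!r}")


if __name__ == "__main__":
    q: queue.Queue = queue.Queue()
    sent = threading.Event()
    threading.Thread(target=_toy_writer, args=(q, sent), daemon=True).start()
    q.put("normal outputs")
    q.put(_SENTINEL)
    if not sent.wait(timeout=10.0):
        print("dead-engine message may not have been delivered")
# ---------------------------------------------------------------------------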
if not self.errored_sent_event.wait(timeout=10): @@ -321,8 +322,8 @@ def process_output_socket(self, output_path: str): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: outputs = self.output_queue.get() - if outputs == ENGINE_CORE_DEAD: - socket.send_multipart((ENGINE_CORE_DEAD, ), copy=False) + if outputs == EngineCoreProc.ENGINE_CORE_DEAD: + socket.send_multipart((outputs, ), copy=False) break encoder.encode_into(outputs, buffer) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7e0dba5c4ec4..b52a5400c5b1 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -10,10 +10,9 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket -from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion, - EngineCoreResetPrefixCache) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor @@ -187,7 +186,7 @@ def shutdown(self): self._finalizer() def _validate_alive(self, frame: Any): - if frame == ENGINE_CORE_DEAD: + if frame == EngineCoreProc.ENGINE_CORE_DEAD: self.is_engine_dead = True raise EngineDeadError() From 352da94dc7cfcbf8d2d0a5a860ab40910c98312c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:27:35 +0000 Subject: [PATCH 068/130] it starts? Signed-off-by: rshaw@neuralmagic.com --- examples/online_serving/openai_completion_client.py | 7 ++++--- vllm/v1/engine/async_llm.py | 8 ++++++-- vllm/v1/engine/core.py | 8 ++------ vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/exceptions.py | 3 +-- vllm/v1/executor/multiproc_executor.py | 7 +++---- vllm/v1/utils.py | 7 +++++-- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 06b93d7d1931..20fa043d9670 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -16,14 +16,15 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - n=2, + # n=2, stream=stream, - logprobs=3) +) +# logprobs=3) print("Completion results:") if stream: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d80dedae5d85..22e882d49976 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -376,7 +376,11 @@ async def reset_prefix_cache(self) -> None: @property def is_running(self) -> bool: - return not self.errored + # Have not started the loop yet. 
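# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): deriving
# liveness from the background task handle, in the spirit of the properties
# here. Before the handler has started, the client optimistically reports
# "running"; once the task finishes (normally or with an error) it reports
# errored. The real properties also consult the engine-core client.
import asyncio
from typing import Optional


class ToyClient:

    def __init__(self):
        self.output_handler: Optional[asyncio.Task] = None

    @property
    def is_running(self) -> bool:
        if self.output_handler is None:  # loop not started yet
            return True
        return not self.output_handler.done()

    @property
    def errored(self) -> bool:
        return not self.is_running


async def _toy_main() -> None:
    client = ToyClient()
    print("before start:", client.is_running, client.errored)

    async def handler() -> None:
        raise RuntimeError("simulated output-handler crash")

    client.output_handler = asyncio.create_task(handler())
    await asyncio.sleep(0)  # let the handler run and fail
    print("after crash:", client.is_running, client.errored)
    client.output_handler.exception()  # retrieve it to avoid a gc warning


if __name__ == "__main__":
    asyncio.run(_toy_main())
# ---------------------------------------------------------------------------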
+ if self.output_handler is None: + return True + + return not self.output_handler.done() @property def is_stopped(self) -> bool: @@ -384,7 +388,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core.is_engine_dead + return (self.engine_core.is_engine_dead or not self.is_running) @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9dadc0eb5c19..dacd95f96fd5 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -47,6 +47,7 @@ def __init__( # Setup Model. self.model_executor = executor_class(vllm_config) + print("EXECUTOR_READY") # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( @@ -168,6 +169,7 @@ def __init__( EngineCoreRequestUnion] = queue.Queue() self.output_queue: queue.Queue[Union[ bytes, EngineCoreOutputs]] = queue.Queue() + self.errored_sent_event = threading.Event() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -175,12 +177,6 @@ def __init__( args=(output_path, ), daemon=True).start() - # Signal from process_output_socket that EngineDead - # message was sent. Since process_output_socket is a - # daemon thread, we need to ensure this message is - # sent before we exit from the main thread. - self.errored_sent_event = threading.Event() - # Send Readiness signal to EngineClient. ready_pipe.send({"status": "READY"}) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b52a5400c5b1..e1b9d65e3340 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -289,7 +289,7 @@ async def process_outputs_socket(): if isinstance(outputs, Exception): raise self._format_exception(outputs) from None - return outputs + return self.decoder.decode(outputs) async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index ff74556cc160..266745124f37 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,8 +8,7 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. \ - See stack trace for the root cause issue." + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause issue." # noqa: E501 super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ce0522b81137..00dbb510b382 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -80,7 +80,7 @@ def _init_executor(self) -> None: for unready_worker in unready_workers: # NOTE: the WorkerProc wraps startup in a try ... catch # so if there are any issues in loading in a WorkerProcess - # (e.g. OOM), an Exception will be raised here. + # (e.g. OOM), an Exception will be caught here. worker = WorkerProc.wait_for_ready(unready_worker) self.workers.append(worker) @@ -302,7 +302,6 @@ def wait_for_ready( # Extract the message queue handle. 
mq_handle = pickle.loads(response["handle"]) - print(f"{mq_handle=}") worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) return WorkerProcHandle.from_unready_handle( unready_proc_handle, worker_response_mq) @@ -378,9 +377,9 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" - method, args, kwargs = self.rpc_broadcast_mq.dequeue() - while True: + method, args, kwargs = self.rpc_broadcast_mq.dequeue() + try: if isinstance(method, str): func = getattr(self.worker, method) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 2f5168296e67..396f4bac75a2 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -102,13 +102,13 @@ def __init__( process_kwargs: Dict[Any, Any], ): context = get_mp_context() - self.reader, writer = context.Pipe(duplex=False) + self.reader, self.writer = context.Pipe(duplex=False) self.process_name = process_name assert ("ready_pipe" not in process_kwargs and "input_path" not in process_kwargs and "output_path" not in process_kwargs) - process_kwargs["ready_pipe"] = writer + process_kwargs["ready_pipe"] = self.writer process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path @@ -143,6 +143,9 @@ def wait_for_startup(self): except EOFError: e.__suppress_context__ = True raise e from None + finally: + self.reader.close() + self.writer.close() # Note(rob): shutdown function cannot be a bound method, From a69e04096a7114b7f4293482cce7acf014c60734 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:45:44 +0000 Subject: [PATCH 069/130] updated Signed-off-by: rshaw@neuralmagic.com --- .../online_serving/openai_completion_client.py | 7 +++---- vllm/v1/engine/core.py | 3 +-- vllm/v1/engine/core_client.py | 15 ++++++++------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 20fa043d9670..06b93d7d1931 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -16,15 +16,14 @@ model = models.data[0].id # Completion API -stream = True +stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - # n=2, + n=2, stream=stream, -) -# logprobs=3) + logprobs=3) print("Completion results:") if stream: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index dacd95f96fd5..ae9a33110216 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -47,7 +47,6 @@ def __init__( # Setup Model. self.model_executor = executor_class(vllm_config) - print("EXECUTOR_READY") # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( @@ -248,7 +247,7 @@ def run_busy_loop(self): # 3) Step the engine core. outputs = self.step() - # 5) Put EngineCoreOutputs into the output queue. + # 4) Put EngineCoreOutputs into the output queue. 
self.output_queue.put_nowait(outputs) def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e1b9d65e3340..33eaa2c211ca 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -185,8 +185,8 @@ def shutdown(self): self.proc_handle.shutdown() self._finalizer() - def _validate_alive(self, frame: Any): - if frame == EngineCoreProc.ENGINE_CORE_DEAD: + def _validate_alive(self, buffer: Any): + if buffer == EngineCoreProc.ENGINE_CORE_DEAD: self.is_engine_dead = True raise EngineDeadError() @@ -213,7 +213,7 @@ def get_output(self) -> EngineCoreOutputs: try: (frame, ) = self.output_socket.recv_multipart(copy=False) - self._validate_alive(frame) + self._validate_alive(frame.buffer) return self.decoder.decode(frame.buffer) except Exception as e: raise self._format_exception(e) from None @@ -273,10 +273,11 @@ async def get_output_async(self) -> EngineCoreOutputs: # to overlap with this task (run_output_handler). async def process_outputs_socket(): try: - (frame, ) = await self.output_socket.recv_multipart( - copy=False) - self._validate_alive(frame) - self.outputs_queue.put_nowait(frame.buffer) + while True: + (frame, ) = await self.output_socket.recv_multipart( + copy=False) + self._validate_alive(frame.buffer) + self.outputs_queue.put_nowait(frame.buffer) except Exception as e: self.outputs_queue.put_nowait(e) From 8dddc206a5ff2c1262b9d84885c57f4f782cebb0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:47:47 +0000 Subject: [PATCH 070/130] updated Signed-off-by: rshaw@neuralmagic.com --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7ef40564c5bd..3a94511e3992 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -187,6 +187,7 @@ steps: commands: # split the test to avoid interference - VLLM_USE_V1=1 pytest -v -s v1/core + - VLLM_USE_V1=1 pytest -v -s v1/shutdown - VLLM_USE_V1=1 pytest -v -s v1/engine - VLLM_USE_V1=1 pytest -v -s v1/sample - VLLM_USE_V1=1 pytest -v -s v1/worker From 7b48b87a2ffb52c2f0e352235ebadbf4ad22d58b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:58:14 +0000 Subject: [PATCH 071/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core.py | 6 ++---- vllm/v1/engine/exceptions.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ae9a33110216..93b433e67f36 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -268,12 +268,10 @@ def _send_engine_dead(self): """Send EngineDead status to the EngineCoreClient.""" # Put ENGINE_CORE_DEAD to the front of the queue. - with self.output_queue.mutex: - self.output_queue.queue.clear() - self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) + self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. - if not self.errored_sent_event.wait(timeout=10): + if not self.errored_sent_event.wait(timeout=10.): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. Please report this issue.") diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 266745124f37..aa8a1227420e 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,7 +8,7 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. 
Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause issue." # noqa: E501 + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause." # noqa: E501 super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by From 74008529a3307414ea059309423981557e50bf54 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 19:12:26 +0000 Subject: [PATCH 072/130] updated Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_forward_error.py | 3 ++- tests/v1/shutdown/test_processor_error.py | 1 + tests/v1/shutdown/test_startup_error.py | 1 + vllm/v1/engine/async_llm.py | 2 ++ vllm/v1/executor/multiproc_executor.py | 2 +- 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 5017bc21ac71..6cc143bfaddd 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that we handle an Error in model forward and shutdown.""" import asyncio @@ -84,7 +85,7 @@ async def generate(request_id: str): timeout_s=60, ) - # NOTE: shutdown is handled by the API Server. If an exception + # NOTE: shutdown is handled by the API Server if an exception # occurs, so it is expected that we would need to call this. async_llm.shutdown() diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 056851025eca..a98ed6f12324 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test error handling in Processor. 
Should not impact other reqs.""" import asyncio diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 25f2b77b2f3d..0516e22b31f0 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that we handle a startup Error and shutdown.""" import pytest diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 22e882d49976..dcf6c78930a4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -388,6 +388,8 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: + print(f"{self.engine_core.is_engine_dead=}") + print(f"{self.is_running=}") return (self.engine_core.is_engine_dead or not self.is_running) @property diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 00dbb510b382..8b6777f769db 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -389,7 +389,7 @@ def worker_busy_loop(self): except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) - logger.exception("WorkerProc hit an exception: %s", exc_info=e) + logger.exception("WorkerProc hit an exception:", exc_info=e) continue self.worker_response_mq.enqueue( From 80317a0fb62ed9a01ea00b36b03773dc8d855965 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:18:11 +0000 Subject: [PATCH 073/130] updated Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_processor_error.py | 6 ++++-- vllm/v1/engine/core_client.py | 15 ++++++--------- vllm/v1/utils.py | 15 +++++---------- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index a98ed6f12324..2d0a382ba99b 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -46,12 +46,14 @@ async def generate(request_id: str): assert not async_llm.errored # This should be no problem. + EXPECTED_TOKENS = 5 outputs = [] async for out in async_llm.generate( "Hello my name is", request_id="abc", - sampling_params=SamplingParams(max_tokens=5)): + sampling_params=SamplingParams(max_tokens=EXPECTED_TOKENS)): outputs.append(out) - assert len(outputs) == 5 + print(f"{outputs=}") + assert len(outputs) == EXPECTED_TOKENS async_llm.shutdown() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 33eaa2c211ca..b2911abcfd1d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -import weakref from abc import ABC, abstractmethod from typing import Any, List, Optional, Type, Union @@ -152,11 +151,6 @@ def __init__( zmq.asyncio.Context() # type: ignore[attr-defined] if asyncio_mode else zmq.Context()) # type: ignore[attr-defined] - # Note(rob): shutdown function cannot be a bound method, - # else the gc cannot collect the object. - self._finalizer = weakref.finalize(self, lambda x: x.destroy(linger=0), - self.ctx) - # Paths and sockets for IPC. 
output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() @@ -177,13 +171,16 @@ def __init__( "executor_class": executor_class, "log_stats": log_stats, }) - self.proc_handle.wait_for_startup() + self.proc_handle.wait_for_startup(self.shutdown) def shutdown(self): """Clean up background resources.""" - self.proc_handle.shutdown() - self._finalizer() + if ctx := getattr(self, "ctx", None): + ctx.destroy(linger=0) + + if proc_handle := getattr(self, "proc_handle", None): + proc_handle.shutdown() def _validate_alive(self, buffer: Any): if buffer == EngineCoreProc.ENGINE_CORE_DEAD: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 396f4bac75a2..b40d36858f03 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -112,11 +112,6 @@ def __init__( process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path - # Flag for shutdown state. BackgroundProcs send signals - # when errors occur which calls shutdown(). If we are in - # startup loop when signaled, this flag breaks us out. - self.shutting_down = False - # Run busy loop in background process. self.proc = context.Process(target=target_fn, kwargs=process_kwargs) self._finalizer = weakref.finalize(self, shutdown, self.proc, @@ -124,24 +119,24 @@ def __init__( self.proc.start() def shutdown(self): - self.shutting_down = True self._finalizer() - def wait_for_startup(self): + def wait_for_startup(self, shutdown_callback: Callable): """Wait until the background process is ready.""" e = Exception(f"{self.process_name} initialization failed due to " "an exception in a background process. See stack trace " "for root cause.") - while not self.reader.poll(timeout=1): - if self.shutting_down: - raise e try: if self.reader.recv()["status"] != "READY": raise e except EOFError: e.__suppress_context__ = True + shutdown_callback() + raise e from None + except Exception: + shutdown_callback() raise e from None finally: self.reader.close() From ca3796021d0ea6810940e4d1ea49c6816b7fc640 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:28:48 +0000 Subject: [PATCH 074/130] nits Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_processor_error.py | 6 ++++-- vllm/v1/engine/async_llm.py | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 2d0a382ba99b..7f387b6693ad 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -8,6 +8,7 @@ from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt +from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineGenerateError @@ -51,9 +52,10 @@ async def generate(request_id: str): async for out in async_llm.generate( "Hello my name is", request_id="abc", - sampling_params=SamplingParams(max_tokens=EXPECTED_TOKENS)): + sampling_params=SamplingParams( + max_tokens=EXPECTED_TOKENS, + output_kind=RequestOutputKind.DELTA)): outputs.append(out) - print(f"{outputs=}") assert len(outputs) == EXPECTED_TOKENS async_llm.shutdown() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dcf6c78930a4..22e882d49976 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -388,8 +388,6 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - print(f"{self.engine_core.is_engine_dead=}") - 
print(f"{self.is_running=}") return (self.engine_core.is_engine_dead or not self.is_running) @property From 2d41499119b6085b051225fb50a57b1d3fad9129 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:35:28 +0000 Subject: [PATCH 075/130] fix test for bunched streaming Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_processor_error.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 7f387b6693ad..681b9bb19162 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -56,6 +56,10 @@ async def generate(request_id: str): max_tokens=EXPECTED_TOKENS, output_kind=RequestOutputKind.DELTA)): outputs.append(out) - assert len(outputs) == EXPECTED_TOKENS + + generated_tokens = [] + for out in outputs: + generated_tokens.extend(out.outputs[0].token_ids) + assert len(generated_tokens) == EXPECTED_TOKENS async_llm.shutdown() From 4a39d39dac7a831e01d46dd3268367c3014c8946 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:37:15 +0000 Subject: [PATCH 076/130] tweak typing Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 22e882d49976..f7d7232da5e2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -215,13 +215,13 @@ async def generate( # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - if isinstance(out, EngineDeadError): + if isinstance(out, Exception): raise out # Coalesce any additional queued outputs while not q.empty(): next_out = q.get_nowait() - if isinstance(next_out, EngineDeadError): + if isinstance(next_out, Exception): raise out if sampling_params.output_kind == RequestOutputKind.DELTA: out.add(next_out) From 43360f032d9d7861cd124ec1b83f292c53844557 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 10:31:35 -0500 Subject: [PATCH 077/130] Update tests/v1/shutdown/test_forward_error.py Co-authored-by: Cyrus Leung --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 6cc143bfaddd..a8f4b6097c90 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -16,7 +16,7 @@ def evil_forward(self, *args, **kwargs): - """Evil forward method that raise an exception after 5 calls.""" + """Evil forward method that raise an exception after 10 calls.""" NUMBER_OF_GOOD_PASSES = 10 if not hasattr(self, "num_calls"): From 218d095af6a7462439b51c084810449bf0ef20fd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 10 Feb 2025 16:30:37 +0000 Subject: [PATCH 078/130] pre commit Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f9bd1dfe93ec..beb994a37456 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -164,7 +164,8 @@ def __init__( # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
self.input_queue: queue.Queue[Tuple[EngineCoreRequestType, Any]] = queue.Queue() - self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() + self.output_queue: queue.Queue[Union[EngineCoreOutputs, + bytes]] = queue.Queue() self.errored_sent_event = threading.Event() threading.Thread(target=self.process_input_socket, args=(input_path, ), @@ -175,7 +176,7 @@ def __init__( # Send Readiness signal to EngineClient. ready_pipe.send({"status": "READY"}) - + except Exception as e: logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) From c395634dd1a86e2f93bc6dc0a1ae9475617b3f1b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:38:14 -0500 Subject: [PATCH 079/130] Update tests/v1/shutdown/test_forward_error.py Co-authored-by: Russell Bryant --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index a8f4b6097c90..e0d27c47a8ec 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -105,7 +105,7 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + m.setattr(LlamaForCausalLM, "forward", evil_forward) llm = LLM(model="meta-llama/Llama-3.2-1B", enforce_eager=True, From 042c486b27b065d6d493f1ca78f2ba4ac4dee1fe Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:42:55 -0500 Subject: [PATCH 080/130] Update vllm/v1/engine/core.py Co-authored-by: Russell Bryant --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index beb994a37456..e012f6a00a67 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -128,7 +128,7 @@ def step(self) -> EngineCoreOutputs: return engine_core_outputs def shutdown(self): - if model_executor := getattr(self, "model_executor", None): + if self.model_executor: model_executor.shutdown() def profile(self, is_start: bool = True): From b5a7b6f26ff03a384d532fad164db5cd86737698 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:45:34 -0500 Subject: [PATCH 081/130] Update vllm/v1/engine/core.py Co-authored-by: Russell Bryant --- vllm/v1/engine/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e012f6a00a67..c53144f0bd18 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -234,8 +234,6 @@ def run_busy_loop(self): # Break out the loop so we can log_stats in step(). if self.log_stats: break - except Exception: - raise # 2) Handle any new client requests. 
while not self.input_queue.empty(): From dab77cf6fceae25d2a74249a1457e31daa1fbc6f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:47:50 -0500 Subject: [PATCH 082/130] Update tests/v1/shutdown/test_forward_error.py Co-authored-by: Russell Bryant --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index e0d27c47a8ec..217d21957e24 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -63,7 +63,7 @@ async def generate(request_id: str): tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] outputs = await asyncio.gather(*tasks) - # Every request should have get an EngineDeadError. + # Every request should get an EngineDeadError. for output in outputs: assert isinstance(output, EngineDeadError) From a0102812625a4038201bc3db54db496c791a718e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 00:48:41 +0000 Subject: [PATCH 083/130] intermed tensors Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 217d21957e24..5204764cdf44 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -27,7 +27,7 @@ def evil_forward(self, *args, **kwargs): raise Exception("Simulated illegal memory access on Rank 0!") self.num_calls += 1 - return self.model(*args, **kwargs, intermediate_tensors=None) + return self.model(*args, **kwargs) @pytest.mark.asyncio @@ -115,8 +115,8 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, llm.generate("Hello my name is Robert and I") # Confirm all the processes are cleaned up. 
- wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # wait_for_gpu_memory_to_clear( + # devices=list(range(tensor_parallel_size)), + # threshold_bytes=2 * 2**30, + # timeout_s=60, + # ) From adebbe3bdb0863f327053b08b3a49c57c3e66f31 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 20:25:19 +0000 Subject: [PATCH 084/130] added multiproc on/off tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 20 +++++++++++--------- tests/v1/shutdown/test_processor_error.py | 6 ++++-- tests/v1/shutdown/test_startup_error.py | 8 +++++--- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 5204764cdf44..e3bdca0b0356 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -32,13 +32,16 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.asyncio @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_model_error(monkeypatch, tensor_parallel_size): +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size, + enable_multiprocessing): if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) @@ -99,7 +102,6 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") MP_VALUE = "1" if enable_multiprocessing else "0" m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) @@ -114,9 +116,9 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, with pytest.raises(EngineDeadError): llm.generate("Hello my name is Robert and I") - # Confirm all the processes are cleaned up. - # wait_for_gpu_memory_to_clear( - # devices=list(range(tensor_parallel_size)), - # threshold_bytes=2 * 2**30, - # timeout_s=60, - # ) + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 681b9bb19162..8fde7e619585 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -14,10 +14,12 @@ @pytest.mark.asyncio -async def test_async_llm_processor_error(monkeypatch): +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +async def test_async_llm_processor_error(monkeypatch, enable_multiprocessing): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", enforce_eager=True) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 0516e22b31f0..2756a68d36e0 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -29,13 +29,16 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size): +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size, + enable_multiprocessing): if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) @@ -67,7 +70,6 @@ def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") MP_VALUE = "1" if enable_multiprocessing else "0" m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) From f23bc2557d8bebb773de5cfa021062870ddea039 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 21:06:40 +0000 Subject: [PATCH 085/130] wip sync Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index efeda75d8113..99f9fbe41ab9 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -329,7 +329,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats=log_stats, ) - self.outputs_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() + self.outputs_queue: queue.Queue[Union[EngineCoreOutputs, + Exception]] = queue.Queue() # Ensure that the outputs socket processing thread does not have # a ref to the client which prevents gc. 
@@ -363,7 +364,7 @@ def process_outputs_socket(): self._validate_alive(frame.buffer) outputs = decoder.decode(frame.buffer) except Exception as e: - raise self._format_exception(e) from None + self.outputs_queue.put_nowait(e) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) @@ -381,7 +382,13 @@ def process_outputs_socket(): self.output_queue_thread.start() def get_output(self) -> EngineCoreOutputs: - return self.outputs_queue.get() + # If an exception arises in process_outputs_socket task, + # it is forwarded to the outputs_queue so we can raise it + # from this (run_output_handler) task to shut down the server. + outputs = self.outputs_queue.get() + if isinstance(outputs, Exception): + raise self._format_exception(outputs) from None + return outputs def _send_input(self, request_type: EngineCoreRequestType, request: Any) -> None: From ae1dc32f4b9eabc1334022cf3ae68e0eafd07605 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 21:41:03 +0000 Subject: [PATCH 086/130] check for correct exception Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index e3bdca0b0356..801a99d1bb02 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -113,12 +113,13 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - with pytest.raises(EngineDeadError): + with pytest.raises( + EngineDeadError if enable_multiprocessing else Exception): llm.generate("Hello my name is Robert and I") - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) From c2afedc980eaa10c7695816a904d95c360eb6b67 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 21:52:48 +0000 Subject: [PATCH 087/130] wip llm tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_startup_error.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 2756a68d36e0..0c373ffc3cde 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -77,7 +77,10 @@ def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, # Monkeypatch an error in the model. 
monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - with pytest.raises(Exception, match="initialization failed"): + with pytest.raises( + Exception, + match="initialization failed" + if enable_multiprocessing else "Simulated Error in startup!"): _ = LLM(model="meta-llama/Llama-3.2-1B", enforce_eager=True, tensor_parallel_size=tensor_parallel_size) From 89a5461676137dfade0ec8db678ddb23c7f1b54e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 27 Mar 2025 23:54:34 +0000 Subject: [PATCH 088/130] removed tests of LLM engine without MP Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 139 +++++++++++----------- tests/v1/shutdown/test_processor_error.py | 88 +++++++------- tests/v1/shutdown/test_startup_error.py | 57 ++++----- 3 files changed, 144 insertions(+), 140 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 801a99d1bb02..b8e7f76c32b3 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -14,6 +14,8 @@ from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError +MODELS = ["meta-llama/Llama-3.2-1B"] + def evil_forward(self, *args, **kwargs): """Evil forward method that raise an exception after 10 calls.""" @@ -32,72 +34,73 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.asyncio @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -async def test_async_llm_model_error(monkeypatch, tensor_parallel_size, - enable_multiprocessing): +@pytest.mark.parametrize("model", MODELS) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, + model: str) -> None: + """Test that AsyncLLM propagates a forward pass error and frees memory. + + AsyncLLM always uses an MP client. + """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") - with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - - # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - - engine_args = AsyncEngineArgs( - model="meta-llama/Llama-3.2-1B", - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) - async_llm = AsyncLLM.from_engine_args(engine_args) - - async def generate(request_id: str): - generator = async_llm.generate("Hello my name is", - request_id=request_id, - sampling_params=SamplingParams()) - try: - async for _ in generator: - pass - except Exception as e: - return e - - NUM_REQS = 3 - tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] - outputs = await asyncio.gather(*tasks) - - # Every request should get an EngineDeadError. - for output in outputs: - assert isinstance(output, EngineDeadError) - - # AsyncLLM should be errored. - assert async_llm.errored - - # We should not be able to make another request. - with pytest.raises(EngineDeadError): - async for _ in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams()): - raise Exception("We should not get here.") - - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - # NOTE: shutdown is handled by the API Server if an exception - # occurs, so it is expected that we would need to call this. 
- async_llm.shutdown() - - -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + generator = async_llm.generate("Hello my name is", + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should get an EngineDeadError. + for output in outputs: + assert isinstance(output, EngineDeadError) + + # AsyncLLM should be errored. + assert async_llm.errored + + # We should not be able to make another request. + with pytest.raises(EngineDeadError): + async for _ in async_llm.generate("Hello my name is", + request_id="abc", + sampling_params=SamplingParams()): + raise Exception("We should not get here.") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + # NOTE: shutdown is handled by the API Server if an exception + # occurs, so it is expected that we would need to call this. + async_llm.shutdown() + + +@pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_llm_model_error(monkeypatch, tensor_parallel_size, - enable_multiprocessing): - +@pytest.mark.parametrize("model", MODELS) +def test_llm_model_error(monkeypatch, tensor_parallel_size: int, + enable_multiprocessing: bool, model: str) -> None: + """Test that LLM propagates a forward pass error and frees memory. + TODO(andy) - LLM without multiprocessing. + """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -109,7 +112,7 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, # Monkeypatch an error in the model. m.setattr(LlamaForCausalLM, "forward", evil_forward) - llm = LLM(model="meta-llama/Llama-3.2-1B", + llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) @@ -117,9 +120,9 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, EngineDeadError if enable_multiprocessing else Exception): llm.generate("Hello my name is Robert and I") - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 8fde7e619585..eb1341cbddf3 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -12,56 +12,56 @@ from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineGenerateError +MODELS = ["meta-llama/Llama-3.2-1B"] -@pytest.mark.asyncio -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -async def test_async_llm_processor_error(monkeypatch, enable_multiprocessing): - - with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", - enforce_eager=True) - async_llm = AsyncLLM.from_engine_args(engine_args) +@pytest.mark.asyncio +@pytest.mark.parametrize("model", MODELS) +async def test_async_llm_processor_error(model: str) -> None: + """Test that AsyncLLM propagates a processor error. + Test empty tokens prompt (failure) and non-empty prompt (no failure.) + AsyncLLM always uses an MP client. + """ + engine_args = AsyncEngineArgs(model=model, enforce_eager=True) + async_llm = AsyncLLM.from_engine_args(engine_args) - async def generate(request_id: str): - # [] is not allowed and will raise a ValueError in Processor. - generator = async_llm.generate(TokensPrompt([]), - request_id=request_id, - sampling_params=SamplingParams()) - try: - async for _ in generator: - pass - except Exception as e: - return e + async def generate(request_id: str): + # [] is not allowed and will raise a ValueError in Processor. + generator = async_llm.generate(TokensPrompt([]), + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e - NUM_REQS = 3 - tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] - outputs = await asyncio.gather(*tasks) + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) - # Every request should have get an EngineGenerateError. - for output in outputs: - with pytest.raises(EngineGenerateError): - raise output + # Every request should have get an EngineGenerateError. + for output in outputs: + with pytest.raises(EngineGenerateError): + raise output - # AsyncLLM should be errored. - assert not async_llm.errored + # AsyncLLM should be errored. + assert not async_llm.errored - # This should be no problem. - EXPECTED_TOKENS = 5 - outputs = [] - async for out in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams( - max_tokens=EXPECTED_TOKENS, - output_kind=RequestOutputKind.DELTA)): - outputs.append(out) + # This should be no problem. 
+ EXPECTED_TOKENS = 5 + outputs = [] + async for out in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=EXPECTED_TOKENS, + output_kind=RequestOutputKind.DELTA)): + outputs.append(out) - generated_tokens = [] - for out in outputs: - generated_tokens.extend(out.outputs[0].token_ids) - assert len(generated_tokens) == EXPECTED_TOKENS + generated_tokens = [] + for out in outputs: + generated_tokens.extend(out.outputs[0].token_ids) + assert len(generated_tokens) == EXPECTED_TOKENS - async_llm.shutdown() + async_llm.shutdown() diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 0c373ffc3cde..62f6c3186339 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -29,43 +29,44 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size, - enable_multiprocessing): - +def test_async_llm_startup_error(monkeypatch, model: str, + tensor_parallel_size: int) -> None: + """Test that AsyncLLM propagates an __init__ error & frees memory. + + AsyncLLM always uses an MP client. + """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") - with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - - # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - engine_args = AsyncEngineArgs( - model=model, - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) - # Confirm we get an exception. - with pytest.raises(Exception, match="initialization failed"): - _ = AsyncLLM.from_engine_args(engine_args) + # Confirm we get an exception. + with pytest.raises(Exception, match="initialization failed"): + _ = AsyncLLM.from_engine_args(engine_args) - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, - enable_multiprocessing): - +@pytest.mark.parametrize("enable_multiprocessing", [True]) +def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, + enable_multiprocessing: bool) -> None: + """Test that LLM propagates an __init__ error and frees memory. + TODO(andy) - LLM without multiprocessing. 
+ """ + if model != "meta-llama/Llama-3.2-1B": + pytest.skip(reason="Only test meta-llama/Llama-3.2-1B") if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -81,7 +82,7 @@ def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, Exception, match="initialization failed" if enable_multiprocessing else "Simulated Error in startup!"): - _ = LLM(model="meta-llama/Llama-3.2-1B", + _ = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) From f60c8b5f73609aea746fd8d5fbdbda140f94776e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 28 Mar 2025 06:13:47 +0000 Subject: [PATCH 089/130] SyncMPClient & MPClient finalizers works Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 69 ++++++++++++++++++++++++++++++++ vllm/v1/engine/core_client.py | 19 ++++++--- 2 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 tests/v1/shutdown/test_delete.py diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py new file mode 100644 index 000000000000..3e45b8736188 --- /dev/null +++ b/tests/v1/shutdown/test_delete.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Test that we handle a startup Error and shutdown.""" + +import pytest + +from tests.utils import wait_for_gpu_memory_to_clear +from vllm import LLM +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.utils import cuda_device_count_stateless +from vllm.v1.engine.async_llm import AsyncLLM + +MODELS = [ + "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: + """Test that AsyncLLM frees GPU memory upon deletion. + AsyncLLM always uses an MP client. + """ + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + # Instantiate & delete AsyncLLM + inst = AsyncLLM.from_engine_args(engine_args) + del inst + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +@pytest.mark.parametrize("enable_multiprocessing", [True]) +def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, + enable_multiprocessing: bool) -> None: + """Test that LLM frees GPU memory upon deletion. + TODO(andy) - LLM without multiprocessing. + """ + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) + + # Instantiate and delete LLM + inst = LLM(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + del inst + + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 99f9fbe41ab9..62acb2ea4303 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,6 +2,7 @@ import asyncio import queue import uuid +import weakref from abc import ABC, abstractmethod from concurrent.futures import Future from dataclasses import dataclass @@ -265,7 +266,9 @@ def __init__( # This will ensure resources created so far are closed # when the client is garbage collected, even if an # exception is raised mid-construction. - self.resources = BackgroundResources(ctx=sync_ctx) + resources = BackgroundResources(ctx=sync_ctx) + self.resources = resources + self._finalizer = weakref.finalize(self, resources) # Paths for IPC. self.output_path = get_open_zmq_ipc_path() @@ -293,7 +296,7 @@ def __init__( def shutdown(self): # Terminate background resources - self.resources() + self._finalizer() def _validate_alive(self, buffer: Any): if buffer == EngineCoreProc.ENGINE_CORE_DEAD: @@ -343,10 +346,13 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], shutdown_path = get_open_zmq_inproc_path() self.resources.shutdown_path = shutdown_path + self_weakref = weakref.ref(self) + def process_outputs_socket(): shutdown_socket = ctx.socket(zmq.PAIR) shutdown_socket.bind(shutdown_path) out_socket = make_zmq_socket(ctx, output_path, zmq.constants.PULL) + local_self = None try: poller = zmq.Poller() poller.register(shutdown_socket) @@ -358,13 +364,16 @@ def process_outputs_socket(): if len(socks) == 2 or socks[0][0] == shutdown_socket: # shutdown signal, exit thread. break - + local_self = self_weakref() + if local_self is None: + # Instance is being gc'd, exit loop + break try: (frame, ) = out_socket.recv_multipart(copy=False) - self._validate_alive(frame.buffer) + local_self._validate_alive(frame.buffer) outputs = decoder.decode(frame.buffer) except Exception as e: - self.outputs_queue.put_nowait(e) + local_self.outputs_queue.put_nowait(e) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) From 9aed319944a1337f430eba4328a36eb2c6b7b498 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 31 Mar 2025 12:47:21 +0000 Subject: [PATCH 090/130] wip delete tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 35 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 3e45b8736188..2c9cb374a4cc 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,8 +4,9 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from vllm import LLM +from vllm import LLM, SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.sampling_params import RequestOutputKind from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM @@ -14,9 +15,10 @@ ] +@pytest.mark.asyncio @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. 
""" @@ -27,9 +29,16 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - # Instantiate & delete AsyncLLM - inst = AsyncLLM.from_engine_args(engine_args) - del inst + # Instantiate AsyncLLM; make request to complete any deferred + # initialization; then delete instance + async_llm = AsyncLLM.from_engine_args(engine_args) + async for _ in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=1, output_kind=RequestOutputKind.DELTA)): + pass + del async_llm # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( @@ -51,15 +60,19 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - # Instantiate and delete LLM - inst = LLM(model=model, - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) - del inst + # Instantiate LLM; make request to complete any deferred + # initialization; then delete instance + llm = LLM(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + # llm.generate( + # "Hello my name is", + # sampling_params=SamplingParams( + # max_tokens=1)) + del llm # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( From be1a23df492cdbfd25fac8e004ca7a0ef54835ab Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 31 Mar 2025 13:40:31 +0000 Subject: [PATCH 091/130] rollback Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 2c9cb374a4cc..5f07984790c2 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,9 +4,8 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from vllm import LLM, SamplingParams +from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.sampling_params import RequestOutputKind from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM @@ -15,10 +14,9 @@ ] -@pytest.mark.asyncio @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. """ @@ -32,12 +30,6 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Instantiate AsyncLLM; make request to complete any deferred # initialization; then delete instance async_llm = AsyncLLM.from_engine_args(engine_args) - async for _ in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams( - max_tokens=1, output_kind=RequestOutputKind.DELTA)): - pass del async_llm # Confirm all the processes are cleaned up. 
From 9f672d8868bc8ab1e07e58195dcc96d15a94a9ef Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 31 Mar 2025 18:22:07 +0000 Subject: [PATCH 092/130] async fix Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 1d82becde41a..9852a0d59125 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -588,6 +588,7 @@ async def process_outputs_socket(): try: while True: (frame, ) = await output_socket.recv_multipart(copy=False) + self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, From 79c4e1933300f240fc362cdd3fe7ac0913acefa6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 1 Apr 2025 18:23:24 +0000 Subject: [PATCH 093/130] remove strong refs Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 9852a0d59125..987518bf442f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -292,7 +292,7 @@ class BackgroundResources: def __call__(self): """Clean up background resources.""" - + print("\n\n\nHYPERBANANA\n\n\n") for core_engine in self.core_engines: core_engine.close() @@ -578,7 +578,7 @@ def _ensure_output_queue_task(self): utility_results = self.utility_results outputs_queue = self.outputs_queue output_handler = self.outputs_handler - _self_ref = weakref.ref(self) if output_handler else None + _self_ref = weakref.ref(self) output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) @@ -586,9 +586,13 @@ def _ensure_output_queue_task(self): async def process_outputs_socket(): try: + _self = _self_ref() + if not _self: + # Client has been garbage collected, abort. 
+ return while True: (frame, ) = await output_socket.recv_multipart(copy=False) - self._validate_alive(frame.buffer) + _self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, @@ -606,7 +610,8 @@ async def process_outputs_socket(): if outputs.outputs or outputs.scheduler_stats: outputs_queue.put_nowait(outputs) except Exception as e: - self.outputs_queue.put_nowait(e) + if _self: + _self.outputs_queue.put_nowait(e) self.queue_task = asyncio.create_task(process_outputs_socket(), name="EngineCoreOutputQueueTask") From 07824d5bea9df15357f2bb5ba22fa29425fbabb0 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 1 Apr 2025 18:41:57 +0000 Subject: [PATCH 094/130] add back strong refs Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 987518bf442f..9852a0d59125 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -292,7 +292,7 @@ class BackgroundResources: def __call__(self): """Clean up background resources.""" - print("\n\n\nHYPERBANANA\n\n\n") + for core_engine in self.core_engines: core_engine.close() @@ -578,7 +578,7 @@ def _ensure_output_queue_task(self): utility_results = self.utility_results outputs_queue = self.outputs_queue output_handler = self.outputs_handler - _self_ref = weakref.ref(self) + _self_ref = weakref.ref(self) if output_handler else None output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) @@ -586,13 +586,9 @@ def _ensure_output_queue_task(self): async def process_outputs_socket(): try: - _self = _self_ref() - if not _self: - # Client has been garbage collected, abort. 
- return while True: (frame, ) = await output_socket.recv_multipart(copy=False) - _self._validate_alive(frame.buffer) + self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, @@ -610,8 +606,7 @@ async def process_outputs_socket(): if outputs.outputs or outputs.scheduler_stats: outputs_queue.put_nowait(outputs) except Exception as e: - if _self: - _self.outputs_queue.put_nowait(e) + self.outputs_queue.put_nowait(e) self.queue_task = asyncio.create_task(process_outputs_socket(), name="EngineCoreOutputQueueTask") From 74d8e8fa95828b43553d0fa0380b15655a6f13bb Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 2 Apr 2025 13:54:04 +0000 Subject: [PATCH 095/130] removed async forward error test Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 67 +------------------------ 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index b8e7f76c32b3..5ec5bd1cee53 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 """Test that we handle an Error in model forward and shutdown.""" -import asyncio - import pytest from tests.utils import wait_for_gpu_memory_to_clear -from vllm import LLM, SamplingParams +from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank -from vllm.engine.arg_utils import AsyncEngineArgs from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.utils import cuda_device_count_stateless -from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError MODELS = ["meta-llama/Llama-3.2-1B"] @@ -32,67 +28,6 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) -@pytest.mark.asyncio -@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("model", MODELS) -async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, - model: str) -> None: - """Test that AsyncLLM propagates a forward pass error and frees memory. - - AsyncLLM always uses an MP client. - """ - - if cuda_device_count_stateless() < tensor_parallel_size: - pytest.skip(reason="Not enough CUDA devices") - - # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - - engine_args = AsyncEngineArgs(model=model, - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) - async_llm = AsyncLLM.from_engine_args(engine_args) - - async def generate(request_id: str): - generator = async_llm.generate("Hello my name is", - request_id=request_id, - sampling_params=SamplingParams()) - try: - async for _ in generator: - pass - except Exception as e: - return e - - NUM_REQS = 3 - tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] - outputs = await asyncio.gather(*tasks) - - # Every request should get an EngineDeadError. - for output in outputs: - assert isinstance(output, EngineDeadError) - - # AsyncLLM should be errored. - assert async_llm.errored - - # We should not be able to make another request. - with pytest.raises(EngineDeadError): - async for _ in async_llm.generate("Hello my name is", - request_id="abc", - sampling_params=SamplingParams()): - raise Exception("We should not get here.") - - # Confirm all the processes are cleaned up. 
- wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - # NOTE: shutdown is handled by the API Server if an exception - # occurs, so it is expected that we would need to call this. - async_llm.shutdown() - - @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) From d66844fa391acf3b043b0402f55a8a8f3226e26d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 2 Apr 2025 14:07:27 +0000 Subject: [PATCH 096/130] removed sync delete dummy request Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 5f07984790c2..31bbabc1c232 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -60,10 +60,6 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - # llm.generate( - # "Hello my name is", - # sampling_params=SamplingParams( - # max_tokens=1)) del llm # Confirm all the processes are cleaned up. From 2ee74b6f7201951f94ea3a56c9b2959e63652da4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 4 Apr 2025 15:03:42 +0000 Subject: [PATCH 097/130] temporarily removed test case Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_startup_error.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 62f6c3186339..cc7087e10c79 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -23,7 +23,6 @@ def evil_forward(self, *args, **kwargs): MODELS = [ "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. - "mistralai/Mixtral-8x22B-Instruct-v0.1" # Causes OOM. ] From 86263dc5c5bd966a0251dcf436e04e5b6c6b0a9f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 4 Apr 2025 16:11:00 +0000 Subject: [PATCH 098/130] test load weights failure Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_startup_error.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index cc7087e10c79..a40823dbd45b 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -12,8 +12,8 @@ from vllm.v1.engine.async_llm import AsyncLLM -def evil_forward(self, *args, **kwargs): - """Evil forward method that raise an exception.""" +def evil_method(self, *args, **kwargs): + """Evil method that raises an exception.""" if get_tensor_model_parallel_rank() == 0: raise Exception("Simulated Error in startup!") @@ -28,17 +28,19 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +@pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) def test_async_llm_startup_error(monkeypatch, model: str, - tensor_parallel_size: int) -> None: + tensor_parallel_size: int, + failing_method: str) -> None: """Test that AsyncLLM propagates an __init__ error & frees memory. - + Test profiling (forward()) and load weights failures. AsyncLLM always uses an MP client. """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") # Monkeypatch an error in the model. 
- monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + monkeypatch.setattr(LlamaForCausalLM, failing_method, evil_method) engine_args = AsyncEngineArgs(model=model, enforce_eager=True, @@ -59,9 +61,12 @@ def test_async_llm_startup_error(monkeypatch, model: str, @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) +@pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, - enable_multiprocessing: bool) -> None: + enable_multiprocessing: bool, + failing_method: str) -> None: """Test that LLM propagates an __init__ error and frees memory. + Test profiling (forward()) and load weights failures. TODO(andy) - LLM without multiprocessing. """ if model != "meta-llama/Llama-3.2-1B": @@ -75,7 +80,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + monkeypatch.setattr(LlamaForCausalLM, failing_method, evil_method) with pytest.raises( Exception, From f824c154df143f7b88c2aed4e314cd7cb0345cd9 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Fri, 4 Apr 2025 12:14:22 -0400 Subject: [PATCH 099/130] Update vllm/v1/engine/exceptions.py Co-authored-by: Cyrus Leung --- vllm/v1/engine/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index aa8a1227420e..97dd31d5e521 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,7 +8,7 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause." # noqa: E501 + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." # noqa: E501 super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by From 7dc02fa03da17d29e0353bcbb8979fd1b294aa94 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 8 Apr 2025 17:00:20 -0700 Subject: [PATCH 100/130] Post main-merge cleanup/fixes Signed-off-by: Nick Hill --- vllm/entrypoints/launcher.py | 2 +- vllm/v1/engine/async_llm.py | 24 +++++----- vllm/v1/engine/core.py | 70 +++++++++++++----------------- vllm/v1/engine/core_client.py | 68 +++++++++++++++-------------- vllm/v1/engine/output_processor.py | 29 ++++++++----- 5 files changed, 95 insertions(+), 98 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 1056ffec93c9..a4f70a51ebaf 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -103,7 +103,7 @@ def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): for this request. 
""" engine_errored = engine.errored and not engine.is_running - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored): + if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored: server.should_exit = True diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0db3d057d5b5..6e3a3c658ea8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,7 +26,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.core_client import AsyncMPClient +from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.output_processor import (OutputProcessor, RequestOutputCollector) @@ -96,7 +96,11 @@ def __init__( log_stats=self.log_stats) # EngineCore (starts the engine in background process). - self.engine_core = AsyncMPClient( + core_client_class = AsyncMPClient if ( + vllm_config.parallel_config.data_parallel_size + == 1) else DPAsyncMPClient + + self.engine_core = core_client_class( vllm_config=vllm_config, executor_class=executor_class, log_stats=self.log_stats, @@ -280,8 +284,6 @@ async def generate( # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() or await q.get() - if isinstance(out, Exception): - raise out # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. @@ -355,9 +357,8 @@ async def _run_output_handler(self): iteration_stats=iteration_stats, ) - except Exception as e: - logger.error("AsyncLLM output_handler got an Exception:", - exc_info=e) + except Exception: + logger.exception("AsyncLLM output_handler failed.") self.output_processor.propagate_error(EngineDeadError()) async def abort(self, request_id: str) -> None: @@ -463,11 +464,8 @@ async def pin_lora(self, lora_id: int) -> bool: @property def is_running(self) -> bool: - # Have not started the loop yet. - if self.output_handler is None: - return True - - return not self.output_handler.done() + # Is None before the loop is started. + return self.output_handler is None or not self.output_handler.done() @property def is_stopped(self) -> bool: @@ -475,7 +473,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return (self.engine_core.is_engine_dead or not self.is_running) + return self.engine_core.is_engine_dead or not self.is_running @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index dc75a17b4248..014b5431d39e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -317,32 +317,28 @@ def __init__( log_stats: bool, engine_index: int = 0, ): - try: - super().__init__(vllm_config, executor_class, log_stats) - - self.errored_sent_event = threading.Event() - self.step_fn = (self.step if self.batch_queue is None else - self.step_with_batch_queue) - self.global_unfinished_reqs = False - - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
- self.input_queue: queue.Queue[tuple[EngineCoreRequestType, - Any]] = queue.Queue() - self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() - threading.Thread(target=self.process_input_socket, - args=(input_path, engine_index), - daemon=True).start() - threading.Thread(target=self.process_output_socket, - args=(output_path, engine_index), - daemon=True).start() - - except Exception as e: - logger.exception("Error during EngineCore initialization.") - raise e + super().__init__(vllm_config, executor_class, log_stats) + + self.errored_sent_event = threading.Event() + self.step_fn = (self.step if self.batch_queue is None else + self.step_with_batch_queue) + self.global_unfinished_reqs = False + + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + self.input_queue: queue.Queue[tuple[EngineCoreRequestType, + Any]] = queue.Queue() + self.output_queue: queue.Queue[Union[EngineCoreOutputs, + bytes]] = queue.Queue() + threading.Thread(target=self.process_input_socket, + args=(input_path, engine_index), + daemon=True).start() + threading.Thread(target=self.process_output_socket, + args=(output_path, engine_index), + daemon=True).start() @staticmethod def run_engine_core(*args, @@ -370,7 +366,6 @@ def signal_handler(signum, frame): signal.signal(signal.SIGINT, signal_handler) engine_core: Optional[EngineCoreProc] = None - startup_failed = True try: parallel_config: ParallelConfig = kwargs[ "vllm_config"].parallel_config @@ -382,21 +377,18 @@ def signal_handler(signum, frame): else: engine_core = EngineCoreProc(*args, **kwargs) - startup_failed = False engine_core.run_busy_loop() except SystemExit: - logger.debug("EngineCore interrupted.") + logger.debug("EngineCore exiting.") + except Exception as e: - if startup_failed: - logger.exception("EngineCore got error at startup:", - exc_info=e) - # TODO need to send failure here - raise e + if engine_core is None: + logger.exception("EngineCore failed to start.") else: - assert engine_core is not None - logger.exception("EngineCore got an Exception:", exc_info=e) + logger.exception("EngineCore encountered a fatal error.") engine_core._send_engine_dead() + raise e finally: if engine_core is not None: engine_core.shutdown() @@ -486,11 +478,11 @@ def _convert_msgspec_args(method, args): def _send_engine_dead(self): """Send EngineDead status to the EngineCoreClient.""" - # Put ENGINE_CORE_DEAD to the front of the queue. + # Put ENGINE_CORE_DEAD in the queue. self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. - if not self.errored_sent_event.wait(timeout=10.): + if not self.errored_sent_event.wait(timeout=5.): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. 
Please report this issue.") @@ -536,7 +528,7 @@ def process_output_socket(self, output_path: str, engine_index: int): while True: outputs = self.output_queue.get() if outputs == EngineCoreProc.ENGINE_CORE_DEAD: - socket.send_multipart((outputs, ), copy=False) + socket.send(outputs, copy=False) break assert not isinstance(outputs, bytes) outputs.engine_index = engine_index diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 441be3b4e377..62bf7f9fe809 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -354,34 +354,38 @@ def __init__( self.ctx = zmq.asyncio.Context(sync_ctx) if asyncio_mode else sync_ctx # This will ensure resources created so far are closed - # when the client is garbage collected, even if an + # when the client is garbage collected, even if an # exception is raised mid-construction. - resources = BackgroundResources(ctx=sync_ctx) - self.resources = resources - self._finalizer = weakref.finalize(self, resources) - - # Paths and sockets for IPC. - self.output_path = get_open_zmq_ipc_path() - input_path = get_open_zmq_ipc_path() - self.input_socket = make_zmq_socket(self.ctx, - input_path, - zmq.ROUTER, - bind=True) - self.resources.input_socket = self.input_socket - - self.is_engine_dead = False - new_core_engine = lambda index, local_dp_rank=None: CoreEngine( - vllm_config, executor_class, log_stats, input_path, self. - output_path, index, local_dp_rank) - - # Start engine core process(es). - self._init_core_engines(vllm_config, new_core_engine, - self.resources.core_engines) - - # Wait for engine core process(es) to start. - self._wait_for_engine_startup() - - self.utility_results: dict[int, AnyFuture] = {} + self.resources = BackgroundResources(ctx=sync_ctx) + self._finalizer = weakref.finalize(self, self.resources) + success = False + try: + # Paths and sockets for IPC. + self.output_path = get_open_zmq_ipc_path() + input_path = get_open_zmq_ipc_path() + self.input_socket = make_zmq_socket(self.ctx, + input_path, + zmq.ROUTER, + bind=True) + self.resources.input_socket = self.input_socket + + self.is_engine_dead = False + new_core_engine = lambda index, local_dp_rank=None: CoreEngine( + vllm_config, executor_class, log_stats, input_path, self. + output_path, index, local_dp_rank) + + # Start engine core process(es). + self._init_core_engines(vllm_config, new_core_engine, + self.resources.core_engines) + + # Wait for engine core process(es) to start. + self._wait_for_engine_startup() + + self.utility_results: dict[int, AnyFuture] = {} + success = True + finally: + if not success: + self._finalizer() def _wait_for_engine_startup(self): # Get a sync handle to the socket which can be sync or async. @@ -429,7 +433,7 @@ def _init_core_engines( self.core_engine = core_engine def shutdown(self): - # Terminate background resources + # Terminate background resources. 
self._finalizer() def _validate_alive(self, buffer: Any): @@ -440,8 +444,8 @@ def _validate_alive(self, buffer: Any): def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" - return (EngineDeadError( - suppress_context=True) if self.is_engine_dead else e) + return EngineDeadError( + suppress_context=True) if self.is_engine_dead else e def _process_utility_output(output: UtilityOutput, @@ -485,7 +489,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], def process_outputs_socket(): shutdown_socket = ctx.socket(zmq.PAIR) out_socket = make_zmq_socket(ctx, output_path, zmq.constants.PULL) - local_self = None try: shutdown_socket.bind(shutdown_path) poller = zmq.Poller() @@ -631,7 +634,6 @@ def _ensure_output_queue_task(self): # Perform IO in separate task to parallelize as much as possible. # Avoid task having direct reference back to the client. - self.outputs_queue = asyncio.Queue() decoder = self.decoder utility_results = self.utility_results outputs_queue = self.outputs_queue @@ -645,7 +647,7 @@ def _ensure_output_queue_task(self): async def process_outputs_socket(): try: while True: - (frame, ) = await output_socket.recv_multipart(copy=False) + frame = await output_socket.recv(copy=False) self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 917a4c52afd3..21e2a1aee4e2 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -28,35 +28,40 @@ class RequestOutputCollector: def __init__(self, output_kind: RequestOutputKind): self.aggregate = output_kind == RequestOutputKind.DELTA - self.output: Optional[RequestOutput] = None + self.output: Optional[Union[RequestOutput, Exception]] = None self.ready = asyncio.Event() - def put(self, output: RequestOutput) -> None: - '''Non-blocking put operation''' - if self.output is None: + def put(self, output: Union[RequestOutput, Exception]) -> None: + """Non-blocking put operation.""" + if self.output is None or isinstance(output, Exception): self.output = output self.ready.set() - elif self.aggregate: - # Coalesce the outputs in delta case. - self.output.add(output) - else: - # Just replace latest in non-delta case. - self.output = output + elif isinstance(self.output, RequestOutput): + if self.aggregate: + # Coalesce the outputs in delta case. + self.output.add(output) + else: + # Just replace latest in non-delta case. 
+ self.output = output async def get(self) -> RequestOutput: - '''Get operation blocks on put event''' + """Get operation blocks on put event.""" while (output := self.output) is None: await self.ready.wait() self.output = None self.ready.clear() + if isinstance(output, Exception): + raise output return output def get_nowait(self) -> Optional[RequestOutput]: - '''Non-blocking get operation''' + """Non-blocking get operation.""" output = self.output if output is not None: self.output = None self.ready.clear() + if isinstance(output, Exception): + raise output return output From f1bce10c936e41fafbd0997dea6057f00a93812a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 8 Apr 2025 20:43:25 -0700 Subject: [PATCH 101/130] Some updates to MultiprocExecutor Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 135 ++++++++++++++----------- 1 file changed, 74 insertions(+), 61 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 3cab18d94042..a146cad8332b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 - +import multiprocessing import os import pickle import signal @@ -65,31 +65,33 @@ def _init_executor(self) -> None: # Create workers unready_workers: list[UnreadyWorkerProcHandle] = [] - for rank in range(self.world_size): - unready_worker = WorkerProc.make_worker_process( - vllm_config=self.vllm_config, - local_rank=rank, - rank=rank, - distributed_init_method=distributed_init_method, - input_shm_handle=scheduler_output_handle, - ) - unready_workers.append(unready_worker) - - # Workers must be created before wait_for_ready to avoid - # deadlock, since worker.init_device() does a device sync. - self.workers: list[WorkerProcHandle] = [] - for unready_worker in unready_workers: - # NOTE: the WorkerProc wraps startup in a try ... catch - # so if there are any issues in loading in a WorkerProcess - # (e.g. OOM), an Exception will be caught here. - worker = WorkerProc.wait_for_ready(unready_worker) - self.workers.append(worker) - - # Ensure message queues are ready. Will deadlock if re-ordered - # Must be kept consistent with the WorkerProc - self.rpc_broadcast_mq.wait_until_ready() - for w in self.workers: - w.worker_response_mq.wait_until_ready() + success = False + try: + for rank in range(self.world_size): + unready_worker = WorkerProc.make_worker_process( + vllm_config=self.vllm_config, + local_rank=rank, + rank=rank, + distributed_init_method=distributed_init_method, + input_shm_handle=scheduler_output_handle, + ) + unready_workers.append(unready_worker) + + # Workers must be created before wait_for_ready to avoid + # deadlock, since worker.init_device() does a device sync. + self.workers = WorkerProc.wait_for_ready(unready_workers) + + # Ensure message queues are ready. Will deadlock if re-ordered + # Must be kept consistent with the WorkerProc. + self.rpc_broadcast_mq.wait_until_ready() + for w in self.workers: + w.worker_response_mq.wait_until_ready() + success = True + finally: + if not success: + # Clean up the worker procs if there was a failure. 
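                # NOTE: `unready_workers` only contains processes that were
                # actually spawned before the failure, so this kills exactly
                # the partially started workers instead of leaking them.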
+ for handle in unready_workers: + handle.proc.kill() def collective_rpc(self, method: Union[str, Callable], @@ -178,7 +180,7 @@ class UnreadyWorkerProcHandle: """WorkerProcess handle before READY.""" proc: BaseProcess rank: int - ready_pipe: tuple[Connection, Connection] + ready_pipe: Connection @dataclass @@ -211,8 +213,10 @@ def __init__( rank: int, distributed_init_method: str, input_shm_handle: Handle, - ready_pipe: Connection, + ready_pipe: tuple[Connection, Connection], ): + reader, writer = ready_pipe + reader.close() try: self.rank = rank wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) @@ -244,21 +248,20 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send READY once we know everything is loaded - ready_pipe.send({ + writer.send({ "status": WorkerProc.READY_STR, - "handle": pickle.dumps(worker_response_mq_handle) + "handle": worker_response_mq_handle, }) # Initialize device and loads weights self.worker.init_device() self.worker.load_model() - except Exception as e: - logger.exception("WorkerProc got error at startup:", exc_info=e) - ready_pipe.send({"status": WorkerProc.FAILED_STR}) + except Exception: + logger.exception("WorkerProc startup failed.") finally: - ready_pipe.close() + writer.close() @staticmethod def make_worker_process( @@ -270,7 +273,7 @@ def make_worker_process( ) -> UnreadyWorkerProcHandle: context = get_mp_context() # (reader, writer) - pipe_tuple = context.Pipe(duplex=False) + reader, writer = context.Pipe(duplex=False) process_kwargs = { "vllm_config": vllm_config, @@ -278,7 +281,7 @@ def make_worker_process( "rank": rank, "distributed_init_method": distributed_init_method, "input_shm_handle": input_shm_handle, - "ready_pipe": pipe_tuple[1], + "ready_pipe": (reader, writer), } # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, @@ -286,37 +289,47 @@ def make_worker_process( daemon=True) proc.start() - return UnreadyWorkerProcHandle(proc, rank, pipe_tuple) + writer.close() + return UnreadyWorkerProcHandle(proc, rank, reader) @staticmethod def wait_for_ready( - unready_proc_handle: UnreadyWorkerProcHandle) -> WorkerProcHandle: + unready_proc_handles: list[UnreadyWorkerProcHandle] + ) -> list[WorkerProcHandle]: e = Exception("WorkerProc initialization failed due to " "an exception in a background process. " "See stack trace for root cause.") - ready_pipe = unready_proc_handle.ready_pipe[0] - try: - # Wait until the WorkerProc is ready. - response = ready_pipe.recv() - if response["status"] != "READY": - raise e - - # Extract the message queue handle. - mq_handle = pickle.loads(response["handle"]) - worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) - return WorkerProcHandle.from_unready_handle( - unready_proc_handle, worker_response_mq) - - except EOFError: - e.__suppress_context__ = True - raise e from None - - finally: - # Close connection. - unready_proc_handle.ready_pipe[0].close() - unready_proc_handle.ready_pipe[1].close() + pipes = {handle.ready_pipe: handle for handle in unready_proc_handles} + ready_proc_handles = [] + while pipes: + ready = multiprocessing.connection.wait(pipes.keys()) + for pipe in ready: + assert isinstance(pipe, Connection) + try: + # Wait until the WorkerProc is ready. + unready_proc_handle = pipes.pop(pipe) + response: dict[str, Any] = pipe.recv() + if response["status"] != "READY": + raise e + + # Extract the message queue handle. 
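                    # NOTE: multiprocessing.Connection.send() already pickles
                    # its argument, so the handle arrives ready to use and the
                    # previous explicit pickle.dumps()/pickle.loads() round
                    # trip is no longer needed.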
+ worker_response_mq = MessageQueue.create_from_handle( + response["handle"], 0) + ready_proc_handles.append( + WorkerProcHandle.from_unready_handle( + unready_proc_handle, worker_response_mq)) + + except EOFError: + e.__suppress_context__ = True + raise e from None + + finally: + # Close connection. + pipe.close() + + return ready_proc_handles def shutdown(self): self.rpc_broadcast_mq = None @@ -355,13 +368,13 @@ def signal_handler(signum, frame): worker.worker_busy_loop() - except Exception as e: + except Exception: # NOTE: if an Exception arises in busy_loop, we send # a FAILURE message over the MQ RPC to notify the Executor, # which triggers system shutdown. # TODO(rob): handle case where the MQ itself breaks. - logger.exception("WorkerProc got an Exception:", exc_info=e) + logger.exception("WorkerProc got an Exception.") # The parent sends a SIGTERM to all worker processes if # any worker dies. Set this value so we don't re-throw From d014a6bc0920b030550199b2ba2a2315ac76fb69 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 9 Apr 2025 13:39:18 -0700 Subject: [PATCH 102/130] More multiproc_executor.py streamlining Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 120 ++++++++++++------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index a146cad8332b..772d4aeb54c4 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -68,14 +68,14 @@ def _init_executor(self) -> None: success = False try: for rank in range(self.world_size): - unready_worker = WorkerProc.make_worker_process( - vllm_config=self.vllm_config, - local_rank=rank, - rank=rank, - distributed_init_method=distributed_init_method, - input_shm_handle=scheduler_output_handle, - ) - unready_workers.append(unready_worker) + unready_workers.append( + WorkerProc.make_worker_process( + vllm_config=self.vllm_config, + local_rank=rank, + rank=rank, + distributed_init_method=distributed_init_method, + input_shm_handle=scheduler_output_handle, + )) # Workers must be created before wait_for_ready to avoid # deadlock, since worker.init_device() does a device sync. @@ -90,8 +90,8 @@ def _init_executor(self) -> None: finally: if not success: # Clean up the worker procs if there was a failure. - for handle in unready_workers: - handle.proc.kill() + self._ensure_worker_termination( + [w.proc for w in unready_workers]) def collective_rpc(self, method: Union[str, Callable], @@ -133,7 +133,8 @@ def collective_rpc(self, # Re-raise any other exceptions raise e - def _ensure_worker_termination(self): + @staticmethod + def _ensure_worker_termination(worker_procs: list[BaseProcess]): """Ensure that all worker processes are terminated. Assumes workers have received termination requests. 
Waits for processing, then sends termination and kill signals if needed.""" @@ -151,7 +152,7 @@ def wait_for_termination(procs, timeout): return False # Send SIGTERM if still running - active_procs = [w.proc for w in self.workers if w.proc.is_alive()] + active_procs = [proc for proc in worker_procs if proc.is_alive()] for p in active_procs: p.terminate() if not wait_for_termination(active_procs, 4): @@ -166,7 +167,7 @@ def shutdown(self): self.shutting_down = True for w in self.workers: w.worker_response_mq = None - self._ensure_worker_termination() + self._ensure_worker_termination([w.proc for w in self.workers]) self.rpc_broadcast_mq = None @@ -204,7 +205,6 @@ class WorkerProc: """Wrapper that runs one Worker in a separate process.""" READY_STR = "READY" - FAILED_STR = "FAILED" def __init__( self, @@ -213,55 +213,38 @@ def __init__( rank: int, distributed_init_method: str, input_shm_handle: Handle, - ready_pipe: tuple[Connection, Connection], ): - reader, writer = ready_pipe - reader.close() - try: - self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) - # TODO: move `init_worker` to executor level as a collective rpc - # call - all_kwargs: list[dict] = [ - {} for _ in range(vllm_config.parallel_config.world_size) - ] - all_kwargs[rank] = { - "vllm_config": vllm_config, - "local_rank": local_rank, - "rank": rank, - "distributed_init_method": distributed_init_method, - "is_driver_worker": rank == 0, - } - wrapper.init_worker(all_kwargs) - self.worker = wrapper - - pid = os.getpid() - _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) - _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) - - # Initialize MessageQueue for receiving SchedulerOutput - self.rpc_broadcast_mq = MessageQueue.create_from_handle( - input_shm_handle, self.worker.rank) - - # Initializes a message queue for sending the model output - self.worker_response_mq = MessageQueue(1, 1) - worker_response_mq_handle = self.worker_response_mq.export_handle() + self.rank = rank + wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) + # TODO: move `init_worker` to executor level as a collective rpc + # call + all_kwargs: list[dict] = [ + {} for _ in range(vllm_config.parallel_config.world_size) + ] + all_kwargs[rank] = { + "vllm_config": vllm_config, + "local_rank": local_rank, + "rank": rank, + "distributed_init_method": distributed_init_method, + "is_driver_worker": rank == 0, + } + wrapper.init_worker(all_kwargs) + self.worker = wrapper - # Send READY once we know everything is loaded - writer.send({ - "status": WorkerProc.READY_STR, - "handle": worker_response_mq_handle, - }) + pid = os.getpid() + _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) + _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) - # Initialize device and loads weights - self.worker.init_device() - self.worker.load_model() + # Initialize MessageQueue for receiving SchedulerOutput + self.rpc_broadcast_mq = MessageQueue.create_from_handle( + input_shm_handle, self.worker.rank) - except Exception: - logger.exception("WorkerProc startup failed.") + # Initializes a message queue for sending the model output + self.worker_response_mq = MessageQueue(1, 1) - finally: - writer.close() + # Initialize device and loads weights + self.worker.init_device() + self.worker.load_model() @staticmethod def make_worker_process( @@ -358,13 +341,26 @@ def signal_handler(signum, frame): signal.signal(signal.SIGINT, signal_handler) worker = None + # tuple[Connection, Connection] + reader, ready_writer = 
kwargs.pop("ready_pipe") try: + reader.close() worker = WorkerProc(*args, **kwargs) + # Send READY once we know everything is loaded + ready_writer.send({ + "status": + WorkerProc.READY_STR, + "handle": + worker.worker_response_mq.export_handle(), + }) + # Ensure message queues are ready. Will deadlock if re-ordered. # Must be kept consistent with the Executor worker.rpc_broadcast_mq.wait_until_ready() worker.worker_response_mq.wait_until_ready() + ready_writer.close() + ready_writer = None worker.worker_busy_loop() @@ -374,7 +370,10 @@ def signal_handler(signum, frame): # which triggers system shutdown. # TODO(rob): handle case where the MQ itself breaks. - logger.exception("WorkerProc got an Exception.") + if ready_writer is not None: + logger.exception("WorkerProc failed to start.") + else: + logger.exception("WorkerProc failed.") # The parent sends a SIGTERM to all worker processes if # any worker dies. Set this value so we don't re-throw @@ -382,10 +381,11 @@ def signal_handler(signum, frame): shutdown_requested = True finally: + if ready_writer is not None: + ready_writer.close() # Clean up once worker exits busy loop if worker is not None: worker.shutdown() - worker = None class ResponseStatus(Enum): SUCCESS = auto() From c9941da5455db3bda8190d0ac46b6458ac57bd6c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 9 Apr 2025 15:56:19 -0700 Subject: [PATCH 103/130] core_client.py streamlining Signed-off-by: Nick Hill --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core_client.py | 81 ++++++++++++-------------- vllm/v1/executor/multiproc_executor.py | 1 + 3 files changed, 40 insertions(+), 44 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6e3a3c658ea8..82a7fc9abf4b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -473,7 +473,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core.is_engine_dead or not self.is_running + return self.engine_core.resources.engine_dead or not self.is_running @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 62bf7f9fe809..631467705f2d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -304,6 +304,10 @@ class BackgroundResources: input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None shutdown_path: Optional[str] = None + # Set if any of the engines are dead. Here so that the output + # processing threads can access it without holding a ref to the client. + engine_dead: bool = False + def __call__(self): """Clean up background resources.""" @@ -324,6 +328,11 @@ def __call__(self): # Send shutdown signal. shutdown_sender.send(b'') + def validate_alive(self, buffer: Any): + if buffer == EngineCoreProc.ENGINE_CORE_DEAD: + self.engine_dead = True + raise EngineDeadError() + class MPClient(EngineCoreClient): """ @@ -369,7 +378,6 @@ def __init__( bind=True) self.resources.input_socket = self.input_socket - self.is_engine_dead = False new_core_engine = lambda index, local_dp_rank=None: CoreEngine( vllm_config, executor_class, log_stats, input_path, self. output_path, index, local_dp_rank) @@ -436,16 +444,14 @@ def shutdown(self): # Terminate background resources. 
self._finalizer() - def _validate_alive(self, buffer: Any): - if buffer == EngineCoreProc.ENGINE_CORE_DEAD: - self.is_engine_dead = True - raise EngineDeadError() - def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" - return EngineDeadError( - suppress_context=True) if self.is_engine_dead else e + suppress_context=True) if self.resources.engine_dead else e + + def ensure_alive(self): + if self.resources.engine_dead: + raise EngineDeadError() def _process_utility_output(output: UtilityOutput, @@ -482,9 +488,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], outputs_queue = self.outputs_queue shutdown_path = get_open_zmq_inproc_path() - self.resources.shutdown_path = shutdown_path - - self_weakref = weakref.ref(self) + resources = self.resources + resources.shutdown_path = shutdown_path def process_outputs_socket(): shutdown_socket = ctx.socket(zmq.PAIR) @@ -501,21 +506,16 @@ def process_outputs_socket(): if len(socks) == 2 or socks[0][0] == shutdown_socket: # shutdown signal, exit thread. break - local_self = self_weakref() - if local_self is None: - # Instance is being gc'd, exit loop - break - try: - frame = out_socket.recv(copy=False) - local_self._validate_alive(frame.buffer) - outputs = decoder.decode(frame.buffer) - except Exception as e: - local_self.outputs_queue.put_nowait(e) + frame = out_socket.recv(copy=False) + resources.validate_alive(frame.buffer) + outputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) else: outputs_queue.put_nowait(outputs) + except Exception as e: + outputs_queue.put_nowait(e) finally: # Close sockets. shutdown_socket.close(linger=0) @@ -537,13 +537,11 @@ def get_output(self) -> EngineCoreOutputs: return outputs def _send_input(self, request_type: EngineCoreRequestType, request: Any): - try: - # (Identity, RequestType, SerializedRequest) - msg = (self.core_engine.identity, request_type.value, - self.encoder.encode(request)) - self.input_socket.send_multipart(msg, copy=False) - except Exception as e: - raise self._format_exception(e) from None + self.ensure_alive() + # (Identity, RequestType, SerializedRequest) + msg = (self.core_engine.identity, request_type.value, + self.encoder.encode(request)) + self.input_socket.send_multipart(msg, copy=False) def call_utility(self, method: str, *args) -> Any: call_id = uuid.uuid1().int >> 64 @@ -642,13 +640,14 @@ def _ensure_output_queue_task(self): output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) - self.resources.output_socket = output_socket + resources = self.resources + resources.output_socket = output_socket async def process_outputs_socket(): try: while True: frame = await output_socket.recv(copy=False) - self._validate_alive(frame.buffer) + resources.validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, @@ -666,7 +665,7 @@ async def process_outputs_socket(): if outputs.outputs or outputs.scheduler_stats: outputs_queue.put_nowait(outputs) except Exception as e: - self.outputs_queue.put_nowait(e) + outputs_queue.put_nowait(e) self.queue_task = asyncio.create_task(process_outputs_socket(), name="EngineCoreOutputQueueTask") @@ -691,22 +690,18 @@ def _send_input(self, request_type: EngineCoreRequestType, request: Any, engine: Optional[CoreEngine] = None) -> Awaitable[None]: - try: - 
if engine is None: - engine = self.core_engine + self.ensure_alive() + if engine is None: + engine = self.core_engine - message = (request_type.value, self.encoder.encode(request)) - return self._send_input_message(message, engine) - except Exception as e: - raise self._format_exception(e) from None + message = (request_type.value, self.encoder.encode(request)) + return self._send_input_message(message, engine) def _send_input_message(self, message: tuple[bytes, bytes], engine: CoreEngine) -> Awaitable[None]: - try: - message = (engine.identity, ) + message # type: ignore[assignment] - return self.input_socket.send_multipart(message, copy=False) - except Exception as e: - raise self._format_exception(e) from None + self.ensure_alive() + message = (engine.identity, ) + message # type: ignore[assignment] + return self.input_socket.send_multipart(message, copy=False) async def call_utility_async(self, method: str, *args) -> Any: return await self._call_utility_async(method, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 772d4aeb54c4..64ae03587001 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -269,6 +269,7 @@ def make_worker_process( # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, kwargs=process_kwargs, + name=f"VllmWorker-{rank}", daemon=True) proc.start() From 72740ca2c04e9f53e71857c271f8fe0a1abd4c5d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 00:04:53 +0000 Subject: [PATCH 104/130] timeout Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 3 +++ tests/v1/shutdown/test_forward_error.py | 2 ++ tests/v1/shutdown/test_processor_error.py | 2 ++ tests/v1/shutdown/test_startup_error.py | 3 +++ tests/v1/shutdown/util.py | 4 ++++ 5 files changed, 14 insertions(+) create mode 100644 tests/v1/shutdown/util.py diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 31bbabc1c232..639ecf0c5727 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,6 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import cuda_device_count_stateless @@ -14,6 +15,7 @@ ] +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: @@ -40,6 +42,7 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: ) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 5ec5bd1cee53..9851524724fa 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -4,6 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM @@ -28,6 +29,7 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) 
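The shutdown tests in this series share one recipe: monkeypatch a single method
to raise, drive a request through the engine, and assert that the failure
surfaces and memory is reclaimed, now under a hard per-test timeout. A
stripped-down, self-contained sketch of that recipe, with illustrative names
only (not part of the patch; the timeout marker requires the pytest-timeout
plugin):

import pytest


class FakeEngine:
    def forward(self):
        return "ok"


def evil_forward(self):
    raise RuntimeError("simulated forward error")


@pytest.mark.timeout(5)  # enforced by the pytest-timeout plugin
def test_forward_error_propagates(monkeypatch):
    monkeypatch.setattr(FakeEngine, "forward", evil_forward)
    with pytest.raises(RuntimeError, match="simulated forward error"):
        FakeEngine().forward()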
+@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index eb1341cbddf3..84328500202b 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -5,6 +5,7 @@ import pytest +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt @@ -16,6 +17,7 @@ @pytest.mark.asyncio +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) async def test_async_llm_processor_error(model: str) -> None: """Test that AsyncLLM propagates a processor error. diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index a40823dbd45b..62b14cd5f66f 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -4,6 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs @@ -26,6 +27,7 @@ def evil_method(self, *args, **kwargs): ] +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) @@ -58,6 +60,7 @@ def test_async_llm_startup_error(monkeypatch, model: str, ) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) diff --git a/tests/v1/shutdown/util.py b/tests/v1/shutdown/util.py new file mode 100644 index 000000000000..ab6111cde348 --- /dev/null +++ b/tests/v1/shutdown/util.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Shutdown test utils""" + +SHUTDOWN_TEST_TIMEOUT = 60 From 1a76f3655c7931d2bf5e89fcac59160ec6a876be Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 00:33:59 +0000 Subject: [PATCH 105/130] refactor Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 2 +- tests/v1/shutdown/test_forward_error.py | 7 ++++--- tests/v1/shutdown/test_processor_error.py | 2 +- tests/v1/shutdown/test_startup_error.py | 2 +- tests/v1/shutdown/{util.py => utils.py} | 0 5 files changed, 7 insertions(+), 6 deletions(-) rename tests/v1/shutdown/{util.py => utils.py} (100%) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 639ecf0c5727..c01447b2e137 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,7 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import cuda_device_count_stateless diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 9851524724fa..e9b6022774f6 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -4,7 +4,7 @@ import pytest from tests.utils 
import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM @@ -31,12 +31,13 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("enable_multiprocessing", [True]) -@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("model", MODELS) def test_llm_model_error(monkeypatch, tensor_parallel_size: int, enable_multiprocessing: bool, model: str) -> None: """Test that LLM propagates a forward pass error and frees memory. - TODO(andy) - LLM without multiprocessing. + TODO(andy) - LLM without multiprocessing; LLM with multiprocessing + and >1 rank """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 84328500202b..78221c0f173a 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -5,7 +5,7 @@ import pytest -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 62b14cd5f66f..c7abf03bc9a6 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -4,7 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs diff --git a/tests/v1/shutdown/util.py b/tests/v1/shutdown/utils.py similarity index 100% rename from tests/v1/shutdown/util.py rename to tests/v1/shutdown/utils.py From 5bde29d1c4fd6969d014e6f01c705e1887395618 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 00:42:32 +0000 Subject: [PATCH 106/130] refactor Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 17 +++++++---------- tests/v1/shutdown/test_forward_error.py | 8 ++++---- tests/v1/shutdown/test_processor_error.py | 4 ++-- tests/v1/shutdown/test_startup_error.py | 20 ++++++++------------ tests/v1/shutdown/utils.py | 3 ++- 5 files changed, 23 insertions(+), 29 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index c01447b2e137..e11f267f15d9 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,18 +4,17 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, + SHUTDOWN_TEST_TIMEOUT_SEC) from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM -MODELS = [ - "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. 
-] +MODELS = ["meta-llama/Llama-3.2-1B"] -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: @@ -37,12 +36,11 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) @@ -68,6 +66,5 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index e9b6022774f6..04e585046461 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -4,7 +4,8 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, + SHUTDOWN_TEST_TIMEOUT_SEC) from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM @@ -29,7 +30,7 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("model", MODELS) @@ -61,6 +62,5 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 78221c0f173a..0fe48da475c6 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -5,7 +5,7 @@ import pytest -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt @@ -17,7 +17,7 @@ @pytest.mark.asyncio -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) async def test_async_llm_processor_error(model: str) -> None: """Test that AsyncLLM propagates a processor error. 
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index c7abf03bc9a6..1bba19102ec6 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -4,7 +4,8 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, + SHUTDOWN_TEST_TIMEOUT_SEC) from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs @@ -12,6 +13,8 @@ from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM +MODELS = ["meta-llama/Llama-3.2-1B"] + def evil_method(self, *args, **kwargs): """Evil method that raises an exception.""" @@ -22,12 +25,7 @@ def evil_method(self, *args, **kwargs): return self.model(*args, **kwargs, intermediate_tensors=None) -MODELS = [ - "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. -] - - -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) @@ -55,12 +53,11 @@ def test_async_llm_startup_error(monkeypatch, model: str, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) @@ -96,6 +93,5 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) diff --git a/tests/v1/shutdown/utils.py b/tests/v1/shutdown/utils.py index ab6111cde348..8f7c0380d407 100644 --- a/tests/v1/shutdown/utils.py +++ b/tests/v1/shutdown/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """Shutdown test utils""" -SHUTDOWN_TEST_TIMEOUT = 60 +SHUTDOWN_TEST_TIMEOUT_SEC = 120 +SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30 From 1a0a217207593133bac1def2efa95eb47d1fbca5 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 9 Apr 2025 18:23:23 -0700 Subject: [PATCH 107/130] Process monitor for TP workers Signed-off-by: Nick Hill --- vllm/v1/engine/__init__.py | 2 + vllm/v1/engine/async_llm.py | 11 +++++- vllm/v1/engine/core.py | 51 +++++++++++++++++--------- vllm/v1/engine/core_client.py | 19 ++++++++-- vllm/v1/executor/abstract.py | 9 ++++- vllm/v1/executor/multiproc_executor.py | 46 +++++++++++++++++++++-- 6 files changed, 110 insertions(+), 28 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 1264e43c79d9..af4122a51077 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -156,3 +156,5 @@ class EngineCoreRequestType(enum.Enum): ABORT = b'\x01' START_DP = b'\x02' UTILITY = b'\x03' + # Sentinel used within EngineCoreProc. 
+ EXECUTOR_FAILED = b'\x04' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 82a7fc9abf4b..44ead7b22cfa 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -107,6 +107,13 @@ def __init__( ) self.output_handler: Optional[asyncio.Task] = None + try: + # Start output handler eagerly if we are in the asyncio eventloop. + asyncio.get_running_loop() + self.output_handler = asyncio.create_task( + self._run_output_handler()) + except RuntimeError: + pass @classmethod def from_vllm_config( @@ -357,9 +364,9 @@ async def _run_output_handler(self): iteration_stats=iteration_stats, ) - except Exception: + except Exception as e: logger.exception("AsyncLLM output_handler failed.") - self.output_processor.propagate_error(EngineDeadError()) + self.output_processor.propagate_error(e) async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 4c92cd8d86e1..8b2b2311d199 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -48,12 +48,11 @@ class EngineCore: """Inner loop of vLLM's Engine.""" - def __init__( - self, - vllm_config: VllmConfig, - executor_class: type[Executor], - log_stats: bool, - ): + def __init__(self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + executor_fail_callback: Optional[Callable] = None): assert vllm_config.model_config.runner_type != "pooling" logger.info("Initializing a V1 LLM engine (v%s) with config: %s", @@ -63,6 +62,9 @@ def __init__( # Setup Model. self.model_executor = executor_class(vllm_config) + if executor_fail_callback is not None: + self.model_executor.register_failure_callback( + executor_fail_callback) # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks, kv_cache_config = \ @@ -317,9 +319,15 @@ def __init__( log_stats: bool, engine_index: int = 0, ): - super().__init__(vllm_config, executor_class, log_stats) + input_queue: queue.Queue[tuple[EngineCoreRequestType, + Any]] = queue.Queue() + + executor_fail_callback = lambda: input_queue.put_nowait( + (EngineCoreRequestType.EXECUTOR_FAILED, b'')) + + super().__init__(vllm_config, executor_class, log_stats, + executor_fail_callback) - self.errored_sent_event = threading.Event() self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) self.global_unfinished_reqs = False @@ -329,16 +337,17 @@ def __init__( # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
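        # NOTE: the executor failure callback defined above simply enqueues an
        # EXECUTOR_FAILED sentinel onto this same input queue, so worker-death
        # notifications are handled on the busy loop thread like any other
        # client request (see _handle_client_request below).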
- self.input_queue: queue.Queue[tuple[EngineCoreRequestType, - Any]] = queue.Queue() + self.input_queue = input_queue self.output_queue: queue.Queue[Union[EngineCoreOutputs, bytes]] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, engine_index), daemon=True).start() - threading.Thread(target=self.process_output_socket, - args=(output_path, engine_index), - daemon=True).start() + self.output_thread = threading.Thread( + target=self.process_output_socket, + args=(output_path, engine_index), + daemon=True) + self.output_thread.start() @staticmethod def run_engine_core(*args, @@ -460,6 +469,11 @@ def _handle_client_request(self, request_type: EngineCoreRequestType, f" failed: {str(e)}") self.output_queue.put_nowait( EngineCoreOutputs(utility_output=output)) + elif request_type == EngineCoreRequestType.EXECUTOR_FAILED: + raise RuntimeError("Executor failed.") + else: + logger.error("Unrecognized input request type encountered: %s", + request_type) @staticmethod def _convert_msgspec_args(method, args): @@ -482,7 +496,8 @@ def _send_engine_dead(self): self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. - if not self.errored_sent_event.wait(timeout=5.): + self.output_thread.join(timeout=5.0) + if self.output_thread.is_alive(): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. Please report this issue.") @@ -524,7 +539,10 @@ def process_output_socket(self, output_path: str, engine_index: int): # Reuse send buffer. buffer = bytearray() - with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: + # We must set linger to ensure the ENGINE_CORE_DEAD + # message is sent prior to closing the socket. + with zmq_socket_ctx(output_path, zmq.constants.PUSH, + linger=4000) as socket: while True: outputs = self.output_queue.get() if outputs == EngineCoreProc.ENGINE_CORE_DEAD: @@ -535,9 +553,6 @@ def process_output_socket(self, output_path: str, engine_index: int): encoder.encode_into(outputs, buffer) socket.send(buffer, copy=False) - # Signal to main thread that ENGINE_CORE_DEAD was sent. - self.errored_sent_event.set() - ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 631467705f2d..342cbf5d47e4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -626,6 +626,16 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_handler: Optional[Callable[ [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None + try: + # If we are running in an asyncio event loop, start the queue task. + # Otherwise, it will be started lazily. If it is not started here, + # we could miss EXECUTOR_FAILED messages from engine core if they + # occur prior to any requests being sent. 
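            # NOTE: asyncio.get_running_loop() raises RuntimeError when no
            # event loop is running in the current thread, which is what
            # selects the lazy path in the except branch below.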
+ asyncio.get_running_loop() + self._ensure_output_queue_task() + except RuntimeError: + pass + def _ensure_output_queue_task(self): if self.queue_task is not None: return @@ -648,7 +658,7 @@ async def process_outputs_socket(): while True: frame = await output_socket.recv(copy=False) resources.validate_alive(frame.buffer) - outputs: EngineCoreOutputs = decoder.decode(frame.buffer) + outputs: EngineCoreOutputs = decoder.decode(frame) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) @@ -783,9 +793,6 @@ class DPAsyncMPClient(AsyncMPClient): def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): - super().__init__(vllm_config, executor_class, log_stats) - - assert len(self.core_engines) > 1 # Control message used for triggering dp idle mode loop. self.start_dp_msg = (EngineCoreRequestType.START_DP.value, @@ -796,6 +803,10 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_handler = DPAsyncMPClient.process_engine_outputs # type: ignore[assignment] + super().__init__(vllm_config, executor_class, log_stats) + + assert len(self.core_engines) > 1 + def _init_core_engines( self, vllm_config: VllmConfig, diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index e3a4cd98c1f8..dae6ca613080 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from concurrent.futures import Future -from typing import Union +from typing import Callable, Union import torch import torch.distributed as dist @@ -62,6 +62,13 @@ def initialize_from_config(self, args=(kv_cache_configs, )) self.collective_rpc("compile_or_warm_up_model") + def register_failure_callback(self, callback: Callable): + """ + Register a function to be called if the executor enters a permanent + failed state. + """ + pass + def determine_available_memory(self) -> list[int]: # in bytes output = self.collective_rpc("determine_available_memory") return output diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 64ae03587001..3734d431292c 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -12,6 +12,7 @@ from functools import partial from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess +from threading import Thread from typing import Any, Callable, Optional, Union import cloudpickle @@ -41,6 +42,8 @@ def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) + self.is_failed = False + self.failure_callback: Optional[Callable] = None self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -86,6 +89,8 @@ def _init_executor(self) -> None: self.rpc_broadcast_mq.wait_until_ready() for w in self.workers: w.worker_response_mq.wait_until_ready() + + self.start_worker_monitor() success = True finally: if not success: @@ -93,6 +98,41 @@ def _init_executor(self) -> None: self._ensure_worker_termination( [w.proc for w in unready_workers]) + def start_worker_monitor(self): + workers = self.workers + self_ref = weakref.ref(self) + + # Monitors worker process liveness. If any die unexpectedly, + # logs an error, shuts down the executor and invokes the failure + # callback to inform the engine. 
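        # NOTE: each multiprocessing.Process exposes a `sentinel` handle that
        # becomes ready when the process exits, and
        # multiprocessing.connection.wait() blocks until at least one of the
        # given sentinels is ready, so this thread sleeps until a worker dies.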
+ def monitor_workers(): + sentinels = [h.proc.sentinel for h in workers] + died = multiprocessing.connection.wait(sentinels) + _self = self_ref() + if not _self or getattr(_self, 'shutting_down', False): + return + _self.is_failed = True + proc_name = next(h.proc.name for h in workers + if h.proc.sentinel == died[0]) + logger.error( + "Worker proc %s died unexpectedly, " + "shutting down executor.", proc_name) + _self.shutdown() + callback = _self.failure_callback + if callback is not None: + _self.failure_callback = None + callback() + + Thread(target=monitor_workers, + daemon=True, + name="MultiprocWorkerMonitor").start() + + def register_failure_callback(self, callback: Callable): + if self.is_failed: + callback() + else: + self.failure_callback = callback + def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, @@ -101,6 +141,9 @@ def collective_rpc(self, start_time = time.monotonic() kwargs = kwargs or {} + if self.is_failed: + raise RuntimeError("Executor failed.") + # NOTE: If the args are heterogeneous, then we pack them into a list, # and unpack them in the method of every worker, because every worker # knows their own rank. @@ -129,9 +172,6 @@ def collective_rpc(self, return responses except TimeoutError as e: raise TimeoutError(f"RPC call to {method} timed out.") from e - except Exception as e: - # Re-raise any other exceptions - raise e @staticmethod def _ensure_worker_termination(worker_procs: list[BaseProcess]): From 1abcac38c9ab10da8f0b0e537d4007176df17022 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 15:00:10 +0000 Subject: [PATCH 108/130] ValueError exception Signed-off-by: Andrew Feldman --- vllm/v1/engine/async_llm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 44ead7b22cfa..84be8e83b8d2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -311,6 +311,12 @@ async def generate( logger.info("Request %s failed.", request_id) raise + # Bad request discovered in generate() + except ValueError: + if self.log_requests: + logger.info("Request %s failed.", request_id) + raise + # Error in the generate() task (possibly recoverable). 
except Exception as e: await self.abort(request_id) From 1a4b6a097e2bca986a5a48ade70f80a2723b55cd Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 15:18:01 +0000 Subject: [PATCH 109/130] added llm 2-rank forward error test back Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 04e585046461..7340065b4a4f 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -32,7 +32,7 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("enable_multiprocessing", [True]) -@pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) def test_llm_model_error(monkeypatch, tensor_parallel_size: int, enable_multiprocessing: bool, model: str) -> None: From 863aa08babdaec0df37d89be156a79d09734b88c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 15:41:42 +0000 Subject: [PATCH 110/130] added back async test Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 66 ++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 7340065b4a4f..4d0c72771908 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,15 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 """Test that we handle an Error in model forward and shutdown.""" +import asyncio + import pytest from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, SHUTDOWN_TEST_TIMEOUT_SEC) -from vllm import LLM +from vllm import LLM, AsyncEngineArgs, SamplingParams from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.utils import cuda_device_count_stateless +from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError MODELS = ["meta-llama/Llama-3.2-1B"] @@ -30,6 +33,67 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) +@pytest.mark.asyncio +@pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("model", MODELS) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, + model: str) -> None: + """Test that AsyncLLM propagates a forward pass error and frees memory. + + AsyncLLM always uses an MP client. + """ + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + generator = async_llm.generate("Hello my name is", + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should get an EngineDeadError. 
+ for output in outputs: + assert isinstance(output, EngineDeadError) + + # AsyncLLM should be errored. + assert async_llm.errored + + # We should not be able to make another request. + with pytest.raises(EngineDeadError): + async for _ in async_llm.generate("Hello my name is", + request_id="abc", + sampling_params=SamplingParams()): + raise Exception("We should not get here.") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + # NOTE: shutdown is handled by the API Server if an exception + # occurs, so it is expected that we would need to call this. + async_llm.shutdown() + + @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) From be9d3560cc797dcd9fab00e46ab1c6c9c99c7406 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 12:10:32 -0700 Subject: [PATCH 111/130] Adjust per request failure log messages Signed-off-by: Nick Hill --- vllm/v1/engine/async_llm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 84be8e83b8d2..51617fdbb28e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -308,16 +308,16 @@ async def generate( # Engine is dead. Do not abort since we shut down. except EngineDeadError: if self.log_requests: - logger.info("Request %s failed.", request_id) + logger.info("Request %s failed (engine dead).", request_id) raise - # Bad request discovered in generate() + # Request validation error. except ValueError: if self.log_requests: - logger.info("Request %s failed.", request_id) + logger.info("Request %s failed (bad request).", request_id) raise - # Error in the generate() task (possibly recoverable). + # Unexpected error in the generate() task (possibly recoverable). except Exception as e: await self.abort(request_id) if self.log_requests: From 95a45baace0224fca3350380df2208f1d9f0bd34 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 13:18:03 -0700 Subject: [PATCH 112/130] Move output queue task ref / cleanup to BackgroundResource Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 397526af32ce..f359339994eb 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +import contextlib import queue import uuid import weakref @@ -302,6 +303,7 @@ class BackgroundResources: core_engines: list[CoreEngine] = field(default_factory=list) output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None + output_queue_task: Optional[asyncio.Task] = None shutdown_path: Optional[str] = None # Set if any of the engines are dead. Here so that the output @@ -314,6 +316,10 @@ def __call__(self): for core_engine in self.core_engines: core_engine.close() + if self.output_queue_task is not None: + with contextlib.suppress(Exception): + self.output_queue_task.cancel() + # ZMQ context termination can hang if the sockets # aren't explicitly closed first. 
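        # NOTE: the asyncio output-queue task is cancelled above (errors
        # suppressed) before the sockets are closed, so the reader task is not
        # left awaiting a recv on a socket that is being torn down beneath it.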
if self.output_socket is not None: @@ -622,8 +628,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, Exception]] = asyncio.Queue() - self.queue_task: Optional[asyncio.Task] = None - self.outputs_handler: Optional[Callable[ [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None @@ -638,7 +642,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], pass def _ensure_output_queue_task(self): - if self.queue_task is not None: + resources = self.resources + if resources.output_queue_task is not None: return # Perform IO in separate task to parallelize as much as possible. @@ -651,7 +656,6 @@ def _ensure_output_queue_task(self): output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) - resources = self.resources resources.output_socket = output_socket async def process_outputs_socket(): @@ -678,13 +682,8 @@ async def process_outputs_socket(): except Exception as e: outputs_queue.put_nowait(e) - self.queue_task = asyncio.create_task(process_outputs_socket(), - name="EngineCoreOutputQueueTask") - - def shutdown(self): - super().shutdown() - if queue_task := getattr(self, "queue_task", None): - queue_task.cancel() + resources.output_queue_task = asyncio.create_task( + process_outputs_socket(), name="EngineCoreOutputQueueTask") async def get_output_async(self) -> EngineCoreOutputs: self._ensure_output_queue_task() From cb70c37d62ffcdecfde738292c7e86e8d539da4d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 20:21:43 +0000 Subject: [PATCH 113/130] added tests back Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 14 ++++++++++++-- tests/v1/shutdown/test_forward_error.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index e11f267f15d9..9a3145502a3a 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -6,18 +6,20 @@ from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, SHUTDOWN_TEST_TIMEOUT_SEC) -from vllm import LLM +from vllm import LLM, SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.sampling_params import RequestOutputKind from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM MODELS = ["meta-llama/Llama-3.2-1B"] +@pytest.mark.asyncio @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. """ @@ -31,6 +33,12 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Instantiate AsyncLLM; make request to complete any deferred # initialization; then delete instance async_llm = AsyncLLM.from_engine_args(engine_args) + async for _ in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=1, output_kind=RequestOutputKind.DELTA)): + pass del async_llm # Confirm all the processes are cleaned up. 
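The delete tests above lean on garbage-collection driven cleanup: after one
request forces any deferred initialization, simply dropping the last reference
to the engine is expected to release its background resources (MPClient
registers exactly this kind of hook with weakref.finalize earlier in the
series). A minimal, self-contained illustration of that mechanism, with
made-up names rather than vLLM classes; under CPython the finalizer fires as
soon as the reference count reaches zero:

import weakref


class Resources:
    """Stands in for BackgroundResources: a callable cleanup hook."""

    def __init__(self) -> None:
        self.closed = False

    def __call__(self) -> None:
        self.closed = True


class Client:
    def __init__(self) -> None:
        self.resources = Resources()
        # Runs the cleanup hook when this Client is garbage collected,
        # even if shutdown() is never called explicitly.
        self._finalizer = weakref.finalize(self, self.resources)

    def shutdown(self) -> None:
        # weakref.finalize callbacks run at most once, so explicit shutdown
        # and GC-triggered cleanup can share the same path.
        self._finalizer()


client = Client()
resources = client.resources
del client                # drop the last reference
assert resources.closed   # finalizer already ran (CPython reference counting)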
@@ -61,6 +69,8 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) + llm.generate("Hello my name is", + sampling_params=SamplingParams(max_tokens=1)) del llm # Confirm all the processes are cleaned up. diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 4d0c72771908..558e2cccb8af 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -34,7 +34,7 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.asyncio -@pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, model: str) -> None: From 775e0c3aaf30a1d96308fd9b84eca3023358cd04 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 20:42:39 +0000 Subject: [PATCH 114/130] knobs for tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 37 ++++++++++++++++++------- tests/v1/shutdown/test_forward_error.py | 3 +- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 9a3145502a3a..2acb175571e5 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -19,9 +19,16 @@ @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +@pytest.mark.parametrize("send_one_request", [False]) +async def test_async_llm_delete(model: str, tensor_parallel_size: int, + send_one_request: bool) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. + + Args: + model: model under test + tensor_parallel_size: degree of tensor parallelism + send_one_request: send one request to engine before deleting """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -33,12 +40,13 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Instantiate AsyncLLM; make request to complete any deferred # initialization; then delete instance async_llm = AsyncLLM.from_engine_args(engine_args) - async for _ in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams( - max_tokens=1, output_kind=RequestOutputKind.DELTA)): - pass + if send_one_request: + async for _ in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=1, output_kind=RequestOutputKind.DELTA)): + pass del async_llm # Confirm all the processes are cleaned up. @@ -52,10 +60,18 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) +@pytest.mark.parametrize("send_one_request", [False]) def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, - enable_multiprocessing: bool) -> None: + enable_multiprocessing: bool, + send_one_request: bool) -> None: """Test that LLM frees GPU memory upon deletion. TODO(andy) - LLM without multiprocessing. 
+ + Args: + model: model under test + tensor_parallel_size: degree of tensor parallelism + enable_multiprocessing: enable workers in separate process(es) + send_one_request: send one request to engine before deleting """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -69,8 +85,9 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - llm.generate("Hello my name is", - sampling_params=SamplingParams(max_tokens=1)) + if send_one_request: + llm.generate("Hello my name is", + sampling_params=SamplingParams(max_tokens=1)) del llm # Confirm all the processes are cleaned up. diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 558e2cccb8af..b9d0cd8b0b5f 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -42,7 +42,8 @@ async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, AsyncLLM always uses an MP client. """ - + if tensor_parallel_size > 1: + pytest.skip(reason="Parallelism > 1 not yet supported for this test.") if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") From 35241157325c78eb2084b39711a61e94bfb63d63 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 14:27:44 -0700 Subject: [PATCH 115/130] Fix rebase bug Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f359339994eb..813320dd8ca5 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -335,7 +335,8 @@ def __call__(self): shutdown_sender.send(b'') def validate_alive(self, frames: Sequence[zmq.Frame]): - if len(frames) == 1 and frames[0] == EngineCoreProc.ENGINE_CORE_DEAD: + if len(frames) == 1 and (frames[0].buffer + == EngineCoreProc.ENGINE_CORE_DEAD): self.engine_dead = True raise EngineDeadError() From de51ec11b9b10cc2593380c9b871e6bc8848c271 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 15:22:50 -0700 Subject: [PATCH 116/130] Fix AsyncLLM garbage collection cleanup issue Signed-off-by: Nick Hill --- tests/v1/engine/test_async_llm.py | 10 +-- vllm/v1/engine/async_llm.py | 130 ++++++++++++++++-------------- vllm/v1/engine/core_client.py | 4 +- 3 files changed, 75 insertions(+), 69 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index da0639678af8..6ff5e082032b 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -from contextlib import ExitStack from typing import Optional import pytest @@ -86,11 +85,10 @@ async def test_load(monkeypatch: pytest.MonkeyPatch, # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. 
- with monkeypatch.context() as m, ExitStack() as after: + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) - after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 10 @@ -129,11 +127,10 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m, ExitStack() as after: + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) - after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 @@ -195,11 +192,10 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m, ExitStack() as after: + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) - after.callback(engine.shutdown) sampling_params = SamplingParams(max_tokens=100, output_kind=RequestOutputKind.DELTA, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 51617fdbb28e..46e004ef939c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -110,8 +110,7 @@ def __init__( try: # Start output handler eagerly if we are in the asyncio eventloop. asyncio.get_running_loop() - self.output_handler = asyncio.create_task( - self._run_output_handler()) + self._run_output_handler() except RuntimeError: pass @@ -171,6 +170,9 @@ def from_engine_args( usage_context=usage_context, ) + def __del__(self): + self.shutdown() + def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" @@ -270,9 +272,7 @@ async def generate( # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us # to handle startup failure gracefully in the OpenAI server. - if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) + self._run_output_handler() q = await self.add_request( request_id, @@ -324,55 +324,69 @@ async def generate( logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e - async def _run_output_handler(self): + def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - try: - while True: - # 1) Pull EngineCoreOutputs from the EngineCore. - outputs = await self.engine_core.get_output_async() - num_outputs = len(outputs.outputs) - - iteration_stats = IterationStats() if ( - self.log_stats and num_outputs) else None - - # Split outputs into chunks of at most - # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the - # event loop for too long. - if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: - slices = (outputs.outputs, ) - else: - slices = np.array_split( - outputs.outputs, - cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) - - for i, outputs_slice in enumerate(slices): - # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - outputs_slice, outputs.timestamp, iteration_stats) - # NOTE: RequestOutputs are pushed to their queues. - assert not processed_outputs.request_outputs - - # Allow other asyncio tasks to run between chunks - if i + 1 < len(slices): - await asyncio.sleep(0) - - # 3) Abort any reqs that finished due to stop strings. 
- await self.engine_core.abort_requests_async(
- processed_outputs.reqs_to_abort)
-
- # 4) Logging.
- # TODO(rob): make into a coroutine and launch it in
- # background thread once Prometheus overhead is non-trivial.
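The rewrite in this patch hinges on the circular-reference note earlier in the hunk: if the output-handler coroutine closes over self, the task keeps the AsyncLLM instance alive and garbage-collection-driven shutdown never runs. A stripped-down sketch of the pattern with illustrative names (not the real vLLM classes):

import asyncio
from typing import Optional


class Client:

    def __init__(self) -> None:
        self.queue: asyncio.Queue[str] = asyncio.Queue()
        self.handler: Optional[asyncio.Task] = None

    def start_handler(self) -> None:
        # Must be called from within a running event loop.
        if self.handler is not None:
            return
        # Bind only what the loop needs to a local so the closure holds
        # no reference back to self; the task then cannot keep this
        # object alive after the caller drops it.
        queue = self.queue

        async def handle() -> None:
            while True:
                item = await queue.get()
                print("handled", item)

        self.handler = asyncio.create_task(handle())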
+ if stat_loggers: + assert outputs.scheduler_stats is not None + AsyncLLM._record_stats( + stat_loggers[outputs.engine_index], + scheduler_stats=outputs.scheduler_stats, + iteration_stats=iteration_stats, + ) + except Exception as e: + logger.exception("AsyncLLM output_handler failed.") + output_processor.propagate_error(e) + + self.output_handler = asyncio.create_task(output_handler()) async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" @@ -383,17 +397,15 @@ async def abort(self, request_id: str) -> None: if self.log_requests: logger.info("Aborted request %s.", request_id) + @staticmethod def _record_stats( - self, - scheduler_stats: Optional[SchedulerStats], + stat_loggers: list[StatLoggerBase], + scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats], - engine_index: int = 0, ): - if not self.log_stats: - return - - assert scheduler_stats is not None - for stat_logger in self.stat_loggers[engine_index]: + """static so that it can be used from the output_handler task + without a circular ref to AsyncLLM.""" + for stat_logger in stat_loggers: stat_logger.record(scheduler_stats=scheduler_stats, iteration_stats=iteration_stats) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 813320dd8ca5..3ed3d92290a8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -import contextlib import queue import uuid import weakref @@ -317,8 +316,7 @@ def __call__(self): core_engine.close() if self.output_queue_task is not None: - with contextlib.suppress(Exception): - self.output_queue_task.cancel() + self.output_queue_task.cancel() # ZMQ context termination can hang if the sockets # aren't explicitly closed first. From a0536c453e9f58e090c67063b0c44e50062c3aac Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 17:44:59 -0700 Subject: [PATCH 117/130] Re-enable failing test (seems to work now) Signed-off-by: Nick Hill --- tests/v1/shutdown/test_forward_error.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index b9d0cd8b0b5f..9fedbe4f9a01 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -42,8 +42,6 @@ async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, AsyncLLM always uses an MP client. 
""" - if tensor_parallel_size > 1: - pytest.skip(reason="Parallelism > 1 not yet supported for this test.") if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") From 76494dce61371c01cacaba8721d880fda8760204 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 17:55:53 -0700 Subject: [PATCH 118/130] Re-enable other failing test (also seems to work now) Signed-off-by: Nick Hill --- tests/v1/shutdown/test_delete.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 2acb175571e5..ed368fe828d0 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -19,7 +19,7 @@ @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("send_one_request", [False]) +@pytest.mark.parametrize("send_one_request", [False, True]) async def test_async_llm_delete(model: str, tensor_parallel_size: int, send_one_request: bool) -> None: """Test that AsyncLLM frees GPU memory upon deletion. @@ -60,7 +60,7 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int, @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) -@pytest.mark.parametrize("send_one_request", [False]) +@pytest.mark.parametrize("send_one_request", [False, True]) def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, enable_multiprocessing: bool, send_one_request: bool) -> None: From b5d870294d71c79a2b3eef3fbc7086fc2f1132aa Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 11 Apr 2025 02:01:17 +0000 Subject: [PATCH 119/130] CUDA_VISIBLE_DEVICES for shutdown tests in buildkite Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8e75470ca5ef..a435c9a4ec0b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -201,7 +201,7 @@ steps: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/engine - - pytest -v -s v1/shutdown + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/entrypoints - pytest -v -s v1/sample - pytest -v -s v1/worker From b067f8dd4b98d71a4a8006c76609ad2d89d5a42f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 11 Apr 2025 02:07:58 +0000 Subject: [PATCH 120/130] temporarily enabled v1 fastcheck test Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a435c9a4ec0b..99674dce905a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -197,6 +197,7 @@ steps: source_file_dependencies: - vllm/ - tests/v1 + fast_check: true commands: # split the test to avoid interference - pytest -v -s v1/core From e94c89e9f567b6766632dfad86d3d09753b69509 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 11 Apr 2025 03:37:43 +0000 Subject: [PATCH 121/130] moved shutdown tests to 2 GPU section Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 99674dce905a..3c0d18103731 100644 --- 
a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -197,12 +197,10 @@ steps: source_file_dependencies: - vllm/ - tests/v1 - fast_check: true commands: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/engine - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/entrypoints - pytest -v -s v1/sample - pytest -v -s v1/worker @@ -540,6 +538,7 @@ steps: # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" From 6de94aa9a4a386e34b866f63de017948b9c8de17 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 07:12:51 -0700 Subject: [PATCH 122/130] Fix breakage to DP case Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 3ed3d92290a8..edd5788eec57 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -793,10 +793,6 @@ class DPAsyncMPClient(AsyncMPClient): def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): - # Control message used for triggering dp idle mode loop. - self.start_dp_msg = (EngineCoreRequestType.START_DP.value, - *self.encoder.encode(None)) - self.num_engines_running = 0 self.reqs_in_flight: dict[str, CoreEngine] = {} @@ -804,6 +800,10 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], super().__init__(vllm_config, executor_class, log_stats) + # Control message used for triggering dp idle mode loop. + self.start_dp_msg = (EngineCoreRequestType.START_DP.value, + *self.encoder.encode(None)) + assert len(self.core_engines) > 1 def _init_core_engines( From 060ecd931c42a0eff10fa8b6c44876d4f2650441 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 12:56:43 -0700 Subject: [PATCH 123/130] Properly fix DP breakage Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index edd5788eec57..b63821bb56e2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -627,9 +627,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, Exception]] = asyncio.Queue() - self.outputs_handler: Optional[Callable[ - [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None - try: # If we are running in an asyncio event loop, start the queue task. # Otherwise, it will be started lazily. 
If it is not started here, @@ -650,7 +647,10 @@ def _ensure_output_queue_task(self): decoder = self.decoder utility_results = self.utility_results outputs_queue = self.outputs_queue - output_handler = self.outputs_handler + output_handler: Optional[Callable[[AsyncMPClient, EngineCoreOutputs], + Awaitable[None]]] = getattr( + self.__class__, + "process_engine_outputs", None) _self_ref = weakref.ref(self) if output_handler else None output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, From 4228bb4ad2b7b07853d0aa6a533210a5445c5795 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 17:23:30 -0700 Subject: [PATCH 124/130] Add timeout to TP execute_model, reply only from rank0 Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 45 ++++++++++++++++++-------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 3734d431292c..b23bdc5ffd4c 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -7,13 +7,14 @@ import time import traceback import weakref +from concurrent.futures import Future from dataclasses import dataclass from enum import Enum, auto from functools import partial from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess from threading import Thread -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, Union, cast import cloudpickle @@ -28,6 +29,7 @@ from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port) from vllm.v1.executor.abstract import Executor +from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -35,6 +37,8 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 +EXECUTE_MODEL_TIMEOUT_S = 30 + class MultiprocExecutor(Executor): @@ -133,11 +137,22 @@ def register_failure_callback(self, callback: Callable): else: self.failure_callback = callback + def execute_model( + self, + scheduler_output, + ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: + (output, ) = self.collective_rpc("execute_model", + args=(scheduler_output, ), + rank0_reply_only=True, + timeout=EXECUTE_MODEL_TIMEOUT_S) + return output + def collective_rpc(self, method: Union[str, Callable], - timeout: Optional[float] = None, + timeout: Optional[float] = 180.0, args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: + kwargs: Optional[dict] = None, + rank0_reply_only: bool = False) -> list[Any]: start_time = time.monotonic() kwargs = kwargs or {} @@ -153,10 +168,11 @@ def collective_rpc(self, else: send_method = cloudpickle.dumps( method, protocol=pickle.HIGHEST_PROTOCOL) - self.rpc_broadcast_mq.enqueue((send_method, args, kwargs)) + self.rpc_broadcast_mq.enqueue( + (send_method, args, kwargs, rank0_reply_only)) responses = [None] * self.world_size - for w in self.workers: + for w in (self.workers[0], ) if rank0_reply_only else self.workers: dequeue_timeout = timeout - (time.monotonic() - start_time ) if timeout is not None else None status, result = w.worker_response_mq.dequeue( @@ -326,7 +342,8 @@ def wait_for_ready( "See stack trace for root cause.") pipes = {handle.ready_pipe: handle for handle in unready_proc_handles} - ready_proc_handles = [] + ready_proc_handles: list[Optional[WorkerProcHandle]] = ( + [None] * len(unready_proc_handles)) while pipes: ready = 
multiprocessing.connection.wait(pipes.keys()) for pipe in ready: @@ -341,7 +358,7 @@ def wait_for_ready( # Extract the message queue handle. worker_response_mq = MessageQueue.create_from_handle( response["handle"], 0) - ready_proc_handles.append( + ready_proc_handles[unready_proc_handle.rank] = ( WorkerProcHandle.from_unready_handle( unready_proc_handle, worker_response_mq)) @@ -353,7 +370,7 @@ def wait_for_ready( # Close connection. pipe.close() - return ready_proc_handles + return cast(list[WorkerProcHandle], ready_proc_handles) def shutdown(self): self.rpc_broadcast_mq = None @@ -435,7 +452,7 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" while True: - method, args, kwargs = self.rpc_broadcast_mq.dequeue() + method, args, kwargs, rank0_only = self.rpc_broadcast_mq.dequeue() try: if isinstance(method, str): @@ -450,9 +467,11 @@ def worker_busy_loop(self): logger.exception("WorkerProc hit an exception.") # exception might not be serializable, so we convert it to # string, only for logging purpose. - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.FAILURE, str(e))) + if not rank0_only or self.rank == 0: + self.worker_response_mq.enqueue( + (WorkerProc.ResponseStatus.FAILURE, str(e))) continue - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.SUCCESS, output)) + if not rank0_only or self.rank == 0: + self.worker_response_mq.enqueue( + (WorkerProc.ResponseStatus.SUCCESS, output)) From 6c540c3cc8f166facfac12d7b15517494dd5209d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 18:07:21 -0700 Subject: [PATCH 125/130] Cancel shm dequeue on shutdown Signed-off-by: Nick Hill --- .../device_communicators/shm_broadcast.py | 31 ++++++++++++++----- vllm/v1/executor/multiproc_executor.py | 5 ++- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 11ed7c084377..49a65bd0d118 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -7,11 +7,13 @@ from contextlib import contextmanager from dataclasses import dataclass, field from multiprocessing import shared_memory -from typing import List, Optional, Tuple, Union +from threading import Event +from typing import Any, List, Optional, Tuple, Union from unittest.mock import patch import torch import torch.distributed as dist +import zmq from torch.distributed import ProcessGroup from zmq import IPV6 # type: ignore from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore @@ -400,7 +402,9 @@ def acquire_write(self, timeout: Optional[float] = None): break @contextmanager - def acquire_read(self, timeout: Optional[float] = None): + def acquire_read(self, + timeout: Optional[float] = None, + cancel: Optional[Event] = None): assert self._is_local_reader, "Only readers can acquire read" start_time = time.monotonic() n_warning = 1 @@ -430,6 +434,9 @@ def acquire_read(self, timeout: Optional[float] = None): ) n_warning += 1 + if cancel is not None and cancel.is_set(): + raise RuntimeError("cancelled") + # if we time out, raise an exception if (timeout is not None and time.monotonic() - start_time > timeout): @@ -464,10 +471,12 @@ def enqueue(self, obj, timeout: Optional[float] = None): if self.n_remote_reader > 0: self.remote_socket.send(serialized_obj) - def dequeue(self, timeout: Optional[float] = None): + def dequeue(self, + timeout: 
Optional[float] = None, + cancel: Optional[Event] = None): """ Read from message queue with optional timeout (in seconds) """ if self._is_local_reader: - with self.acquire_read(timeout) as buf: + with self.acquire_read(timeout, cancel) as buf: overflow = buf[0] == 1 if not overflow: # no need to know the size of serialized object @@ -475,15 +484,21 @@ def dequeue(self, timeout: Optional[float] = None): # see https://docs.python.org/3/library/pickle.html obj = pickle.loads(buf[1:]) if overflow: - recv = self.local_socket.recv() - obj = pickle.loads(recv) + obj = MessageQueue.recv(self.local_socket, timeout) elif self._is_remote_reader: - recv = self.remote_socket.recv() - obj = pickle.loads(recv) + obj = MessageQueue.recv(self.remote_socket, timeout) else: raise RuntimeError("Only readers can dequeue") return obj + @staticmethod + def recv(socket: zmq.Socket, timeout: Optional[float]) -> Any: + timeout_ms = None if timeout is None else int(timeout * 1000) + if not socket.poll(timeout=timeout_ms): + raise TimeoutError + recv = socket.recv(copy=False) + return pickle.loads(recv.buffer) + def broadcast_object(self, obj=None): if self._is_writer: self.enqueue(obj) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index b23bdc5ffd4c..aeafe33adcd8 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -4,6 +4,7 @@ import pickle import signal import sys +import threading import time import traceback import weakref @@ -47,6 +48,7 @@ def _init_executor(self) -> None: # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) self.is_failed = False + self.shutdown_event = threading.Event() self.failure_callback: Optional[Callable] = None self.world_size = self.parallel_config.world_size @@ -176,7 +178,7 @@ def collective_rpc(self, dequeue_timeout = timeout - (time.monotonic() - start_time ) if timeout is not None else None status, result = w.worker_response_mq.dequeue( - timeout=dequeue_timeout) + timeout=dequeue_timeout, cancel=self.shutdown_event) if status != WorkerProc.ResponseStatus.SUCCESS: raise RuntimeError( @@ -221,6 +223,7 @@ def shutdown(self): """Properly shut down the executor and its workers""" if not getattr(self, 'shutting_down', False): self.shutting_down = True + self.shutdown_event.set() for w in self.workers: w.worker_response_mq = None self._ensure_worker_termination([w.proc for w in self.workers]) From da8c253c842230d5f19b58d402e811fd49bff664 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 18:42:29 -0700 Subject: [PATCH 126/130] fix Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index aeafe33adcd8..fd4ea4b4421a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -173,8 +173,9 @@ def collective_rpc(self, self.rpc_broadcast_mq.enqueue( (send_method, args, kwargs, rank0_reply_only)) - responses = [None] * self.world_size - for w in (self.workers[0], ) if rank0_reply_only else self.workers: + workers = (self.workers[0], ) if rank0_reply_only else self.workers + responses = [None] * len(workers) + for w in workers: dequeue_timeout = timeout - (time.monotonic() - start_time ) if timeout is not None else None status, result = w.worker_response_mq.dequeue( From 27d7d82f0e7bc5d0bb56b9c7553b60fea34fb8e5 Mon Sep 17 00:00:00 2001 From: 
Nick Hill Date: Sat, 12 Apr 2025 00:25:33 -0700 Subject: [PATCH 127/130] Fix exception message Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index fd4ea4b4421a..b3dfd0755da8 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -183,8 +183,8 @@ def collective_rpc(self, if status != WorkerProc.ResponseStatus.SUCCESS: raise RuntimeError( - "Worker failed with error %s, please check the" - " stack trace above for the root cause", result) + f"Worker failed with error '{result}', please check the" + " stack trace above for the root cause") responses[w.rank] = result From 444a446ba45bf57bcab89933e1c9b94247437bee Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 12 Apr 2025 00:35:45 -0700 Subject: [PATCH 128/130] Cleanup Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b63821bb56e2..7b7505423ff6 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -796,8 +796,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.num_engines_running = 0 self.reqs_in_flight: dict[str, CoreEngine] = {} - self.outputs_handler = DPAsyncMPClient.process_engine_outputs # type: ignore[assignment] - super().__init__(vllm_config, executor_class, log_stats) # Control message used for triggering dp idle mode loop. From e33000e4e2fc173570ad85e646fbc129e03c8032 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 14 Apr 2025 15:30:57 +0000 Subject: [PATCH 129/130] revert Signed-off-by: Andrew Feldman --- tests/v1/engine/test_async_llm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 6ff5e082032b..da0639678af8 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +from contextlib import ExitStack from typing import Optional import pytest @@ -85,10 +86,11 @@ async def test_load(monkeypatch: pytest.MonkeyPatch, # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. 
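The hunk below reinstates the ExitStack pattern so that engine.shutdown() still runs when an assertion fails mid-test. In isolation the pattern looks like this; FakeEngine is a stand-in, not vLLM code:

from contextlib import ExitStack


class FakeEngine:

    def shutdown(self) -> None:
        print("engine shut down")


def run_test() -> None:
    with ExitStack() as after:
        engine = FakeEngine()
        # Callbacks run in reverse order when the block exits,
        # even if an assertion below raises.
        after.callback(engine.shutdown)
        assert engine is not None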
- with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 10 @@ -127,10 +129,11 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 @@ -192,10 +195,11 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) sampling_params = SamplingParams(max_tokens=100, output_kind=RequestOutputKind.DELTA, From 7cf6b6f170c5a717a908487d72c1e1530ca1d7f8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 16 Apr 2025 11:17:13 -0700 Subject: [PATCH 130/130] Address review comments from @DarkLight1337 Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 7 ++----- vllm/v1/engine/core_client.py | 7 +++---- vllm/v1/executor/abstract.py | 4 +++- vllm/v1/executor/multiproc_executor.py | 9 ++++----- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c6b6febbfd5c..ba5e5050abbb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -12,7 +12,6 @@ import msgspec import zmq -import zmq.asyncio from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group @@ -320,8 +319,7 @@ def __init__( log_stats: bool, engine_index: int = 0, ): - input_queue: queue.Queue[tuple[EngineCoreRequestType, - Any]] = queue.Queue() + input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() executor_fail_callback = lambda: input_queue.put_nowait( (EngineCoreRequestType.EXECUTOR_FAILED, b'')) @@ -339,8 +337,7 @@ def __init__( # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. self.input_queue = input_queue - self.output_queue: queue.Queue[Union[EngineCoreOutputs, - bytes]] = queue.Queue() + self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]() threading.Thread(target=self.process_input_socket, args=(input_path, engine_index), daemon=True).start() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7b7505423ff6..f54b3546f06d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -481,8 +481,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats=log_stats, ) - self.outputs_queue: queue.Queue[Union[EngineCoreOutputs, - Exception]] = queue.Queue() + self.outputs_queue = queue.Queue[Union[EngineCoreOutputs, Exception]]() # Ensure that the outputs socket processing thread does not have # a ref to the client which prevents gc. 
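The comment at the end of the previous hunk names the same idiom used for the outputs thread: hand the worker a weakref.ref to the client instead of the client itself, so the thread never pins the object and __del__-based cleanup can still fire. A minimal sketch of that idiom, with illustrative names:

import queue
import threading
import time
import weakref


class Client:

    def __init__(self) -> None:
        self.outputs: queue.Queue[bytes] = queue.Queue()
        # Capture only a weak reference; a strong one would keep the
        # client alive for as long as the daemon thread runs.
        self_ref = weakref.ref(self)

        def process_outputs() -> None:
            while True:
                client = self_ref()
                if client is None:
                    # Client was garbage collected; let the thread exit.
                    return
                client.outputs.put(b"output")
                del client  # drop the strong ref before sleeping
                time.sleep(0.1)

        threading.Thread(target=process_outputs, daemon=True).start()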
@@ -625,8 +624,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats=log_stats, ) - self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, - Exception]] = asyncio.Queue() + self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, + Exception]]() try: # If we are running in an asyncio event loop, start the queue task. # Otherwise, it will be started lazily. If it is not started here, diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index dae6ca613080..3b9feb0d3298 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -15,6 +15,8 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput +FailureCallback = Callable[[], None] + class Executor(ExecutorBase): """ @@ -62,7 +64,7 @@ def initialize_from_config(self, args=(kv_cache_configs, )) self.collective_rpc("compile_or_warm_up_model") - def register_failure_callback(self, callback: Callable): + def register_failure_callback(self, callback: FailureCallback): """ Register a function to be called if the executor enters a permanent failed state. diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index b3dfd0755da8..cff6181fa3ad 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -29,7 +29,7 @@ from vllm.logger import init_logger from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port) -from vllm.v1.executor.abstract import Executor +from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase @@ -49,7 +49,7 @@ def _init_executor(self) -> None: self._finalizer = weakref.finalize(self, self.shutdown) self.is_failed = False self.shutdown_event = threading.Event() - self.failure_callback: Optional[Callable] = None + self.failure_callback: Optional[FailureCallback] = None self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -133,7 +133,7 @@ def monitor_workers(): daemon=True, name="MultiprocWorkerMonitor").start() - def register_failure_callback(self, callback: Callable): + def register_failure_callback(self, callback: FailureCallback): if self.is_failed: callback() else: @@ -276,8 +276,7 @@ def __init__( ): self.rank = rank wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) - # TODO: move `init_worker` to executor level as a collective rpc - # call + # TODO: move `init_worker` to executor level as a collective rpc call all_kwargs: list[dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) ]
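As a compressed view of the failure-callback contract the executor changes in this series settle on: registering after a failure fires the callback immediately, otherwise the monitor invokes it once a worker dies. Sketch only; the real MultiprocExecutor tracks more state:

from typing import Callable, Optional

FailureCallback = Callable[[], None]


class ExecutorSketch:

    def __init__(self) -> None:
        self.is_failed = False
        self._failure_callback: Optional[FailureCallback] = None

    def register_failure_callback(self, callback: FailureCallback) -> None:
        # Late registration still observes an earlier failure.
        if self.is_failed:
            callback()
        else:
            self._failure_callback = callback

    def _on_worker_death(self) -> None:
        # Called from the monitor thread when a worker proc exits badly.
        self.is_failed = True
        if self._failure_callback is not None:
            callback, self._failure_callback = self._failure_callback, None
            callback()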