From eb162391eec0f9ab3bae98ff900a836406aee598 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 23:11:03 +0000 Subject: [PATCH 001/130] checkpoint prototype Signed-off-by: rshaw@neuralmagic.com --- vllm/engine/multiprocessing/client.py | 4 -- vllm/engine/protocol.py | 5 -- vllm/entrypoints/launcher.py | 89 ++++++++++++++------------- vllm/v1/engine/async_llm.py | 87 ++++++++++++++++++-------- 4 files changed, 107 insertions(+), 78 deletions(-) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 0a046c71e86e..329bcc3d1ff6 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -402,10 +402,6 @@ async def check_health(self): def is_running(self) -> bool: return not self.errored - @property - def is_stopped(self) -> bool: - return self.errored - @property def errored(self) -> bool: return self._errored_with is not None diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index a066836b9270..b2a5cc17ead6 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,11 +29,6 @@ class EngineClient(ABC): def is_running(self) -> bool: ... - @property - @abstractmethod - def is_stopped(self) -> bool: - ... - @property @abstractmethod def errored(self) -> bool: diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 5dcf50bd1b0a..c928a9f4b8ce 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -7,10 +7,11 @@ from fastapi import FastAPI, Request, Response from vllm import envs -from vllm.engine.async_llm_engine import AsyncEngineDeadError -from vllm.engine.multiprocessing import MQEngineDeadError +# from vllm.engine.async_llm_engine import AsyncEngineDeadError +# from vllm.engine.multiprocessing import MQEngineDeadError from vllm.logger import init_logger from vllm.utils import find_process_using_port +from vllm.v1.engine.async_llm import EngineDeadError, EngineGenerateError logger = init_logger(__name__) @@ -58,46 +59,46 @@ async def dummy_shutdown() -> None: return server.shutdown() +def start_termination(server: uvicorn.Server): + # See discussions here on shutting down a uvicorn server + # https://github.com/encode/uvicorn/discussions/1103 + # In this case we cannot await the server shutdown here because + # this handler must first return to close the connection for + # this request. + logger.fatal("VLLM Engine failed, terminating server.") + server.should_exit = True + + +# NOTE(rob): VLLM V1 AsyncLLM catches exceptions and returns +# only two types: EngineGenerateError and EngineDeadError. +# +# EngineGenerateError is raised by the per request generate() +# method. This error could be request specific (and therefore +# recoverable - e.g. if there is an error in input processing). +# +# EngineDeadError is raised by the background output_handler +# method. This error is global and therefore not recoverable. +# +# We register these @app.exception_handlers to return nice +# responses to the end user if they occur and shut down if needed. +# See https://fastapi.tiangolo.com/tutorial/handling-errors/ +# for more details on how exception handlers work. def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: - """Adds handlers for fatal errors that should crash the server""" - - @app.exception_handler(RuntimeError) - async def runtime_error_handler(request: Request, __): - """On generic runtime error, check to see if the engine has died. 
- It probably has, in which case the server will no longer be able to - handle requests. Trigger a graceful shutdown with a SIGTERM.""" - engine = request.app.state.engine_client - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored - and not engine.is_running): - logger.fatal("AsyncLLMEngine has failed, terminating server " - "process") - # See discussions here on shutting down a uvicorn server - # https://github.com/encode/uvicorn/discussions/1103 - # In this case we cannot await the server shutdown here because - # this handler must first return to close the connection for - # this request. - server.should_exit = True - - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) - - @app.exception_handler(AsyncEngineDeadError) - async def async_engine_dead_handler(_, __): - """Kill the server if the async engine is already dead. It will - not handle any further requests.""" - if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: - logger.fatal("AsyncLLMEngine is already dead, terminating server " - "process") - server.should_exit = True - - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) - - @app.exception_handler(MQEngineDeadError) - async def mq_engine_dead_handler(_, __): - """Kill the server if the mq engine is already dead. It will - not handle any further requests.""" - if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: - logger.fatal("MQLLMEngine is already dead, terminating server " - "process") - server.should_exit = True - - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) + + if envs.VLLM_USE_V1: + + @app.exception_handler(EngineGenerateError) + async def generate_error_handler(request: Request, __): + engine = request.app.state.engine_client + if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored): + # Terminate if recoverable. + start_termination(server) + + return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) + + @app.exception_handler(EngineDeadError) + async def engine_dead_handler(_, __): + if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: + start_termination(server) + + return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ff7a0c28dd91..27db49a11ed4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,7 @@ import asyncio -import os +# import os import signal +from functools import partial from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -18,7 +19,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import kill_process_tree +# from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -27,6 +28,17 @@ logger = init_logger(__name__) +# NOTE(rob): raised when a generate() fails. +class EngineGenerateError(Exception): + pass + + +# NOTE(rob): raised when the engine dies, typically +# by the background output handler loop. Unrecoverable. +class EngineDeadError(Exception): + pass + + class AsyncLLM(EngineClient): def __init__( @@ -42,23 +54,17 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # The child processes will send SIGQUIT when unrecoverable - # errors happen. 
We kill the process tree here so that the - # stack trace is very evident. - # TODO: rather than killing the main process, we should - # figure out how to raise an AsyncEngineDeadError and - # handle at the API server level so we can return a better - # error code to the clients calling VLLM. + # NOTE(rob): EngineCore sends SIGQUIT on unrecoverable errors. def sigquit_handler(signum, frame): logger.fatal( "AsyncLLM got SIGQUIT from worker processes, shutting " "down. See stack trace above for root cause issue.") - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGQUIT, sigquit_handler) + self._propagate_error() - assert start_engine_loop + loop = asyncio.get_running_loop() + loop.add_signal_handler(signal.SIGQUIT, partial(sigquit_handler)) + self._errored = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -243,12 +249,17 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. while True: - # Note: drain queue without await if possible (avoids + # Note(rob): drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - # Note: both Detokenizer and EngineCore handle their - # own request cleanup based on finished. + # _run_output_handler() puts EngineDeadError into the queue + # if it encounters an unrecoverable issue in the EngineCore. + if isinstance(out, EngineDeadError): + raise out + + # NOTE(rob): both Detokenizer and EngineCore handle + # their own request cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out @@ -261,8 +272,18 @@ async def generate( # request if we end up here. except asyncio.CancelledError: await self.abort(request_id) + if self.log_requests: + logger.info("Request %s aborted.", request_id) raise + except Exception as e: + if self.log_requests: + logger.info("Request %s failed.", request_id) + + # NOTE(rob): EngineGenerateError is handed by FastAPI + # exception handlers in vllm/entrypoints/launcher.py. + raise EngineGenerateError() from e + def _process_request_outputs(self, request_outputs: List[RequestOutput]): """Process outputs by putting them into per-request queues.""" @@ -292,9 +313,29 @@ async def _run_output_handler(self): # 4) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) + except asyncio.CancelledError: + raise + except Exception as e: - logger.exception("EngineCore output handler hit an error: %s", e) - kill_process_tree(os.getpid()) + self._propagate_error(e) + + def _propagate_error(self, exception: Optional[Exception] = None): + """Propagate to generate() tasks and raise EngineDeadError.""" + + # Set errored state and log if we have + self._errored = True + if exception: + logger.error("AsyncLLM run_output_handler failed", + exc_info=exception) + + # Put EngineDeadError() into + for _, q in self.rid_to_queue.items(): + q.put_nowait(EngineDeadError()) + + raise EngineDeadError( + "AsyncLLM finished unexpectedly. This should never happen! " + "Please open an issue on Github. 
See stack trace above for the " + "actual cause.") from exception async def abort(self, request_id: str) -> None: """Abort RequestId in self, detokenizer, and engine core.""" @@ -356,16 +397,12 @@ async def stop_profile(self) -> None: @property def is_running(self) -> bool: - return True - - @property - def is_stopped(self) -> bool: - return False + return not self.errored @property def errored(self) -> bool: - return False + return self._errored @property def dead_error(self) -> BaseException: - return Exception() # TODO: implement + return EngineDeadError() From 8549fdd6c0f27189e4b52170f3ccaff2fefb8f1a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 3 Jan 2025 23:49:22 +0000 Subject: [PATCH 002/130] Issue currently is with streaming. The HTTP exception handlers do not handle properly Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 48 +++++++++++++++++++------------------ vllm/v1/engine/core.py | 4 ++++ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 27db49a11ed4..0f7fe0452252 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,7 +1,6 @@ import asyncio # import os import signal -from functools import partial from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -54,15 +53,16 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # NOTE(rob): EngineCore sends SIGQUIT on unrecoverable errors. - def sigquit_handler(signum, frame): + # EngineCore sends SIGQUIT on unrecoverable errors. + def sigquit_handler(): logger.fatal( "AsyncLLM got SIGQUIT from worker processes, shutting " "down. See stack trace above for root cause issue.") self._propagate_error() + self._errored = True loop = asyncio.get_running_loop() - loop.add_signal_handler(signal.SIGQUIT, partial(sigquit_handler)) + loop.add_signal_handler(signal.SIGQUIT, sigquit_handler) self._errored = False self.log_requests = log_requests @@ -141,13 +141,12 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if handler := getattr(self, "output_handler", None): + handler.cancel() if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() - @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] @@ -228,6 +227,10 @@ async def generate( returning the RequestOutput back to the caller. """ + if self.errored: + self._propagate_error() + raise EngineDeadError() + try: # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us @@ -249,12 +252,9 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. while True: - # Note(rob): drain queue without await if possible (avoids + # NOTE(rob): drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - - # _run_output_handler() puts EngineDeadError into the queue - # if it encounters an unrecoverable issue in the EngineCore. if isinstance(out, EngineDeadError): raise out @@ -268,20 +268,25 @@ async def generate( yield out # If the request is disconnected by the client, the - # generate() task will be canceled. 
So, we abort the - # request if we end up here. + # generate() task will be canceled so, we abort. except asyncio.CancelledError: await self.abort(request_id) if self.log_requests: logger.info("Request %s aborted.", request_id) raise - except Exception as e: + # EngineCore or output_handler pushed error. Raise so API Server + # can handle and shutdown in vllm/entrypoints/launcher.py. + except EngineDeadError: if self.log_requests: logger.info("Request %s failed.", request_id) + raise - # NOTE(rob): EngineGenerateError is handed by FastAPI - # exception handlers in vllm/entrypoints/launcher.py. + # Error in the generate() task (possibly recoverable). Raise so API + # Server can handle and maybe shutdown vllm/entrypoints/launcher.py. + except Exception as e: + if self.log_requests: + logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e def _process_request_outputs(self, request_outputs: List[RequestOutput]): @@ -298,7 +303,6 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - try: while True: # 1) Pull EngineCoreOutput from the EngineCore. @@ -314,10 +318,12 @@ async def _run_output_handler(self): await self.engine_core.abort_requests_async(reqs_to_abort) except asyncio.CancelledError: + logger.debug("Output handler interrupted.") raise except Exception as e: self._propagate_error(e) + raise EngineDeadError() from e def _propagate_error(self, exception: Optional[Exception] = None): """Propagate to generate() tasks and raise EngineDeadError.""" @@ -328,15 +334,11 @@ def _propagate_error(self, exception: Optional[Exception] = None): logger.error("AsyncLLM run_output_handler failed", exc_info=exception) - # Put EngineDeadError() into + # Put EngineDeadError() into each generate()'s queue, + # each of which will raise in their own context. for _, q in self.rid_to_queue.items(): q.put_nowait(EngineDeadError()) - raise EngineDeadError( - "AsyncLLM finished unexpectedly. This should never happen! " - "Please open an issue on Github. See stack trace above for the " - "actual cause.") from exception - async def abort(self, request_id: str) -> None: """Abort RequestId in self, detokenizer, and engine core.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 13a50a4f855e..428ab32d4095 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -208,7 +208,11 @@ def run_busy_loop(self): """Core busy loop of the EngineCore.""" # Loop until process is sent a SIGINT or SIGTERM + i = 0 while True: + if i == 10: + raise ValueError("TEST RUN") + i += 1 # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): while True: From 77801cddb5f7cba17d0a09845f9e1879350ddb94 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 14:46:09 +0000 Subject: [PATCH 003/130] switch from ValueError -> Exception. 
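Broaden the error handling in the OpenAI serving endpoints from ValueError to
Exception, so that errors surfaced by the V1 AsyncLLM (EngineGenerateError and
EngineDeadError) can also be turned into error responses instead of escaping
the route handlers.

A minimal, self-contained sketch of the pattern this commit applies; the class,
method body, and error below are stand-ins for illustration only, while the
real handlers live in vllm/entrypoints/openai/serving_*.py and use
create_error_response as shown in the diffs that follow:

    import asyncio


    class SketchServing:
        """Not the real vLLM serving class; a sketch of the handler shape."""

        def create_error_response(self, message: str) -> dict:
            # Stand-in for OpenAIServing.create_error_response.
            return {"object": "error", "message": message}

        async def create_completion(self, prompt: str) -> dict:
            try:
                if not prompt:
                    # Stand-in for input-processing / engine failures, which
                    # are no longer guaranteed to be ValueError in V1.
                    raise RuntimeError("empty prompt")
                return {"object": "text_completion", "text": prompt.upper()}
            except Exception as e:  # was: except ValueError as e
                return self.create_error_response(str(e))


    if __name__ == "__main__":
        # Prints an error response rather than raising.
        print(asyncio.run(SketchServing().create_completion("")))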
--- vllm/entrypoints/openai/serving_chat.py | 6 +++--- vllm/entrypoints/openai/serving_completion.py | 8 ++++---- vllm/entrypoints/openai/serving_embedding.py | 6 +++--- vllm/entrypoints/openai/serving_pooling.py | 6 +++--- vllm/entrypoints/openai/serving_score.py | 6 +++--- vllm/entrypoints/openai/serving_tokenization.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9ba5eeb7709c..7e3f8b56fd54 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -171,7 +171,7 @@ async def create_chat_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -228,7 +228,7 @@ async def create_chat_completion( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -245,7 +245,7 @@ async def create_chat_completion( return await self.chat_completion_full_generator( request, result_generator, request_id, model_name, conversation, tokenizer, request_metadata) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 17197dce8da2..53ae1b134590 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -106,7 +106,7 @@ async def create_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -158,7 +158,7 @@ async def create_completion( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -215,7 +215,7 @@ async def create_completion( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -371,7 +371,7 @@ async def completion_stream_generator( # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e7116a3d95d1..fe8ba5eb95b9 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -136,7 +136,7 @@ async def create_embedding( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -167,7 +167,7 @@ async def create_embedding( ) generators.append(generator) - except ValueError as e: + except Exception as e: # 
TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -196,7 +196,7 @@ async def create_embedding( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 5830322071e5..3441071344f4 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -132,7 +132,7 @@ async def create_pooling( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -163,7 +163,7 @@ async def create_pooling( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -192,7 +192,7 @@ async def create_pooling( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 5d3e7139d7a1..9b5aa13bda84 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,7 +101,7 @@ async def create_score( if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -155,7 +155,7 @@ async def create_score( ) generators.append(generator) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -184,7 +184,7 @@ async def create_score( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index b67ecfb01316..a3dc42ff8f02 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -86,7 +86,7 @@ async def create_tokenize( request.prompt, add_special_tokens=request.add_special_tokens, ) - except ValueError as e: + except Exception as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) From 8eca8646ec74728d8b133a87d4b32cf78bc37ac6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 15:09:06 +0000 Subject: [PATCH 004/130] updated --- vllm/v1/engine/async_llm.py | 25 ++++++++++++++++++++----- vllm/v1/engine/core_client.py | 30 ++++++++++++++++-------------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5e76c6e6043b..83ed99a9fd64 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ import asyncio +import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union 
from vllm.config import ModelConfig, VllmConfig @@ -16,7 +17,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -# from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -58,6 +58,19 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config + # EngineCore and Worker processes send SIGUSR1 when + # unrecoverable errors occur. Start the shutdown + # process if this occurs. + def sigusr1_handler(): + logger.fatal( + "AsyncLLM got fatal signal from worker process, " + "shutting down. See stack trace for root cause.") + self._propagate_error() + self._errored = True + + asyncio.get_running_loop().add_signal_handler( + signal.SIGUSR1, sigusr1_handler) + # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -244,14 +257,14 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. while True: - # NOTE(rob): drain queue without await if possible (avoids + # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() if isinstance(out, EngineDeadError): raise out - # NOTE(rob): both Detokenizer and EngineCore handle - # their own request cleanup based on finished. + # Note: both Detokenizer and EngineCore handle their + # own request cleanup based on finished. if out.finished: del self.rid_to_queue[request_id] yield out @@ -260,7 +273,8 @@ async def generate( yield out # If the request is disconnected by the client, the - # generate() task will be canceled so, we abort. + # generate() task will be canceled. So, we abort the + # request if we end up here. except asyncio.CancelledError: await self.abort(request_id) if self.log_requests: @@ -295,6 +309,7 @@ def _process_request_outputs(self, request_outputs: List[RequestOutput]): async def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + try: while True: # 1) Pull EngineCoreOutput from the EngineCore. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a4a45ae05ff9..5e907a3c5ec7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,4 +1,4 @@ -import os +import asyncio import signal import weakref from abc import ABC, abstractmethod @@ -135,19 +135,19 @@ def __init__( executor_class: Type[Executor], log_stats: bool = False, ): - # The child processes will send SIGUSR1 when unrecoverable - # errors happen. We kill the process tree here so that the - # stack trace is very evident. - # TODO(rob): rather than killing the main process, we should - # figure out how to raise an AsyncEngineDeadError and - # handle at the API server level so we can return a better - # error code to the clients calling VLLM. - def sigusr1_handler(signum, frame): - logger.fatal("Got fatal signal from worker processes, shutting " - "down. See stack trace above for root cause issue.") - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGUSR1, sigusr1_handler) + # # The child processes will send SIGUSR1 when unrecoverable + # # errors happen. 
We kill the process tree here so that the + # # stack trace is very evident. + # # TODO(rob): rather than killing the main process, we should + # # figure out how to raise an AsyncEngineDeadError and + # # handle at the API server level so we can return a better + # # error code to the clients calling VLLM. + # def sigusr1_handler(signum, frame): + # logger.fatal("Got fatal signal from worker processes, shutting " + # "down. See stack trace above for root cause issue.") + # kill_process_tree(os.getpid()) + + # signal.signal(signal.SIGUSR1, sigusr1_handler) # Serialization setup. self.encoder = PickleEncoder() @@ -198,6 +198,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -236,6 +237,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + super().__init__( asyncio_mode=True, vllm_config=vllm_config, From b8c77b37f4e40355819ad2cb6cfb310e2d1a7c61 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 15:09:56 +0000 Subject: [PATCH 005/130] stash --- vllm/v1/engine/async_llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 83ed99a9fd64..70ab5c2f3f77 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -30,13 +30,11 @@ class EngineGenerateError(Exception): pass - # NOTE(rob): raised when the engine dies, typically # by the background output handler loop. Unrecoverable. class EngineDeadError(Exception): pass - class AsyncLLM(EngineClient): def __init__( From ce9b8ef26d62db2092e2ac3fb18143d52c4f8e62 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 15:11:43 +0000 Subject: [PATCH 006/130] stash --- vllm/entrypoints/openai/serving_chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e3f8b56fd54..a20bf1efa08a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -591,7 +591,7 @@ async def chat_completion_stream_generator( completion_tokens=num_completion_tokens, total_tokens=num_prompt_tokens + num_completion_tokens) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") data = self.create_streaming_error_response(str(e)) @@ -618,7 +618,7 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) From 3a760a7598df8df13acbcbafb3213f3e69071f28 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:29:38 +0000 Subject: [PATCH 007/130] add watchdog --- vllm/entrypoints/launcher.py | 110 ++++++++++++++++---------- vllm/entrypoints/openai/api_server.py | 2 +- vllm/v1/engine/async_llm.py | 13 +-- vllm/v1/engine/core_client.py | 21 +---- 4 files changed, 79 insertions(+), 67 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index c928a9f4b8ce..bbb2271d7a46 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -7,8 +7,9 @@ from fastapi import FastAPI, Request, Response from vllm import envs -# from 
vllm.engine.async_llm_engine import AsyncEngineDeadError -# from vllm.engine.multiprocessing import MQEngineDeadError +from vllm.engine.async_llm_engine import AsyncEngineDeadError +from vllm.engine.multiprocessing import MQEngineDeadError +from vllm.engine.protocol import EngineClient from vllm.logger import init_logger from vllm.utils import find_process_using_port from vllm.v1.engine.async_llm import EngineDeadError, EngineGenerateError @@ -33,11 +34,14 @@ async def serve_http(app: FastAPI, **uvicorn_kwargs: Any): loop = asyncio.get_running_loop() + watchdog_task = loop.create_task( + watchdog_loop(server, app.state.engine_client)) server_task = loop.create_task(server.serve()) def signal_handler() -> None: # prevents the uvicorn signal handler to exit early server_task.cancel() + watchdog_task.cancel() async def dummy_shutdown() -> None: pass @@ -57,48 +61,72 @@ async def dummy_shutdown() -> None: port, process, " ".join(process.cmdline())) logger.info("Shutting down FastAPI HTTP server.") return server.shutdown() + finally: + watchdog_task.cancel() -def start_termination(server: uvicorn.Server): - # See discussions here on shutting down a uvicorn server - # https://github.com/encode/uvicorn/discussions/1103 - # In this case we cannot await the server shutdown here because - # this handler must first return to close the connection for - # this request. - logger.fatal("VLLM Engine failed, terminating server.") - server.should_exit = True - - -# NOTE(rob): VLLM V1 AsyncLLM catches exceptions and returns -# only two types: EngineGenerateError and EngineDeadError. -# -# EngineGenerateError is raised by the per request generate() -# method. This error could be request specific (and therefore -# recoverable - e.g. if there is an error in input processing). -# -# EngineDeadError is raised by the background output_handler -# method. This error is global and therefore not recoverable. -# -# We register these @app.exception_handlers to return nice -# responses to the end user if they occur and shut down if needed. -# See https://fastapi.tiangolo.com/tutorial/handling-errors/ -# for more details on how exception handlers work. -def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: - - if envs.VLLM_USE_V1: +async def watchdog_loop(server: uvicorn.Server, engine: EngineClient): + # Background task that runs in the background, checking + # for error state in the engine. This is needed for a + # clean shutdown since we cannot raise an Exception in + # a StreamingResponse generator() meaning we cannot use + # the exception handlers below. + VLLM_WATCHDOG_TIME_S = 3.0 + while True: + await asyncio.sleep(VLLM_WATCHDOG_TIME_S) + terminate_if_errored(server, engine) - @app.exception_handler(EngineGenerateError) - async def generate_error_handler(request: Request, __): - engine = request.app.state.engine_client - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored): - # Terminate if recoverable. - start_termination(server) - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) +def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): + # See discussions here on shutting down a uvicorn server + # https://github.com/encode/uvicorn/discussions/1103 + # In this case we cannot await the server shutdown here + # because handler must first return to close the connection + # for this request. 
+ engine_errored = engine.errored and not engine.is_running + is_already_exiting = server.should_exit + if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored + and not is_already_exiting): + # Avoid spamming the logs by only sending once. + logger.fatal("Engine failed, terminating server.") + server.should_exit = True - @app.exception_handler(EngineDeadError) - async def engine_dead_handler(_, __): - if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: - start_termination(server) - return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) +def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: + """ + VLLM V1 AsyncLLM catches exceptions and returns + only two types: EngineGenerateError and EngineDeadError. + + EngineGenerateError is raised by the per request generate() + method. This error could be request specific (and therefore + recoverable - e.g. if there is an error in input processing). + + EngineDeadError is raised by the background output_handler + method. This error is global and therefore not recoverable. + + We register these @app.exception_handlers to return nice + responses to the end user if they occur and shut down if needed. + See https://fastapi.tiangolo.com/tutorial/handling-errors/ + for more details on how exception handlers work. + + NOTE(rob): if an exception is encountered in a StreamingResponse + generator, the exception is not raised, since we already sent + a 200 status. Rather, we send an error message as the next chunk. + Since the exception is not raised, this means that the server + will not automatically shut down. + """ + + # NOTE(rob): RuntimeError, AsyncEngineDeadError, + # MQEngineDeadError are all V0 errors. + @app.exception_handler(RuntimeError) + @app.exception_handler(AsyncEngineDeadError) + @app.exception_handler(MQEngineDeadError) + @app.exception_handler(EngineDeadError) + @app.exception_handler(EngineGenerateError) + async def runtime_exception_handler(request: Request, __): + terminate_if_errored( + server=server, + engine=request.app.state.engine_client, + ) + + return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e942b475535a..ea3a9cd08837 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -90,8 +90,8 @@ @asynccontextmanager async def lifespan(app: FastAPI): try: + engine_client: EngineClient = app.state.engine_client if app.state.log_stats: - engine_client: EngineClient = app.state.engine_client async def _force_log(): while True: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 70ab5c2f3f77..7112e88410a4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -30,11 +30,13 @@ class EngineGenerateError(Exception): pass + # NOTE(rob): raised when the engine dies, typically # by the background output handler loop. Unrecoverable. class EngineDeadError(Exception): pass + class AsyncLLM(EngineClient): def __init__( @@ -56,18 +58,17 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # EngineCore and Worker processes send SIGUSR1 when + # EngineCore and Worker processes send SIGUSR1 when # unrecoverable errors occur. Start the shutdown # process if this occurs. def sigusr1_handler(): - logger.fatal( - "AsyncLLM got fatal signal from worker process, " - "shutting down. 
See stack trace for root cause.") + logger.fatal("AsyncLLM got fatal signal from worker process, " + "shutting down. See stack trace for root cause.") self._propagate_error() self._errored = True - asyncio.get_running_loop().add_signal_handler( - signal.SIGUSR1, sigusr1_handler) + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler) # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5e907a3c5ec7..b665b1b7407b 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,3 @@ -import asyncio -import signal import weakref from abc import ABC, abstractmethod from typing import List, Type @@ -10,8 +8,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, - make_zmq_socket) +from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) @@ -135,20 +132,6 @@ def __init__( executor_class: Type[Executor], log_stats: bool = False, ): - # # The child processes will send SIGUSR1 when unrecoverable - # # errors happen. We kill the process tree here so that the - # # stack trace is very evident. - # # TODO(rob): rather than killing the main process, we should - # # figure out how to raise an AsyncEngineDeadError and - # # handle at the API server level so we can return a better - # # error code to the clients calling VLLM. - # def sigusr1_handler(signum, frame): - # logger.fatal("Got fatal signal from worker processes, shutting " - # "down. See stack trace above for root cause issue.") - # kill_process_tree(os.getpid()) - - # signal.signal(signal.SIGUSR1, sigusr1_handler) - # Serialization setup. self.encoder = PickleEncoder() self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) @@ -198,7 +181,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): - + super().__init__( asyncio_mode=False, vllm_config=vllm_config, From 3024da070a4aebdedf1c606843bb388541e03fcc Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:41:22 +0000 Subject: [PATCH 008/130] updated --- vllm/entrypoints/launcher.py | 7 +++---- vllm/entrypoints/openai/api_server.py | 2 +- vllm/v1/engine/async_llm.py | 7 +++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index bbb2271d7a46..8512fe135c66 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -109,15 +109,14 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: See https://fastapi.tiangolo.com/tutorial/handling-errors/ for more details on how exception handlers work. - NOTE(rob): if an exception is encountered in a StreamingResponse + If an exception is encountered in a StreamingResponse generator, the exception is not raised, since we already sent a 200 status. Rather, we send an error message as the next chunk. Since the exception is not raised, this means that the server - will not automatically shut down. + will not automatically shut down. Instead, we use the watchdog + background task for check for errored state. """ - # NOTE(rob): RuntimeError, AsyncEngineDeadError, - # MQEngineDeadError are all V0 errors. 
@app.exception_handler(RuntimeError) @app.exception_handler(AsyncEngineDeadError) @app.exception_handler(MQEngineDeadError) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ea3a9cd08837..e942b475535a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -90,8 +90,8 @@ @asynccontextmanager async def lifespan(app: FastAPI): try: - engine_client: EngineClient = app.state.engine_client if app.state.log_stats: + engine_client: EngineClient = app.state.engine_client async def _force_log(): while True: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7112e88410a4..2a831ae751c5 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,13 +26,12 @@ logger = init_logger(__name__) -# NOTE(rob): raised when a generate() fails. +# Raised when a generate() fails. Possibly Recoverable. class EngineGenerateError(Exception): pass - -# NOTE(rob): raised when the engine dies, typically -# by the background output handler loop. Unrecoverable. +# Raised when the engine dies, typically by the +# background output handler loop. Unrecoverable. class EngineDeadError(Exception): pass From 5af8189792f54caf6a7171d7c3e421692494f9cb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:42:12 +0000 Subject: [PATCH 009/130] revert spurious changes --- vllm/v1/engine/async_llm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2a831ae751c5..5d016161ab93 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -140,12 +140,13 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" - if handler := getattr(self, "output_handler", None): - handler.cancel() if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() + if handler := getattr(self, "output_handler", None): + handler.cancel() + @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] From 3cb21bbb2b87a7d1c8a01f950d9c61f82deb2579 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:50:32 +0000 Subject: [PATCH 010/130] updated --- vllm/v1/engine/async_llm.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5d016161ab93..dc00d6f7ee35 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -30,6 +30,7 @@ class EngineGenerateError(Exception): pass + # Raised when the engine dies, typically by the # background output handler loop. Unrecoverable. class EngineDeadError(Exception): @@ -63,8 +64,8 @@ def __init__( def sigusr1_handler(): logger.fatal("AsyncLLM got fatal signal from worker process, " "shutting down. See stack trace for root cause.") - self._propagate_error() self._errored = True + self._propagate_error() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, sigusr1_handler) @@ -230,9 +231,7 @@ async def generate( The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. 
""" - if self.errored: - self._propagate_error() raise EngineDeadError() try: @@ -328,17 +327,13 @@ async def _run_output_handler(self): raise except Exception as e: - self._propagate_error(e) + logger.error("run_output_handler failed", e) + self._errored = True + self._propagate_error() raise EngineDeadError() from e - def _propagate_error(self, exception: Optional[Exception] = None): - """Propagate to generate() tasks and raise EngineDeadError.""" - - # Set errored state and log if we have - self._errored = True - if exception: - logger.error("AsyncLLM run_output_handler failed", - exc_info=exception) + def _propagate_error(self): + """Propagate to all generate() tasks.""" # Put EngineDeadError() into each generate()'s queue, # each of which will raise in their own context. From 7c973088503eaf528bb2509398d5a3f275b86f58 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:56:30 +0000 Subject: [PATCH 011/130] updated --- vllm/v1/engine/async_llm.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dc00d6f7ee35..9a0e5ec3f91a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -64,8 +64,7 @@ def __init__( def sigusr1_handler(): logger.fatal("AsyncLLM got fatal signal from worker process, " "shutting down. See stack trace for root cause.") - self._errored = True - self._propagate_error() + self._set_errored_and_propagate() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, sigusr1_handler) @@ -327,16 +326,16 @@ async def _run_output_handler(self): raise except Exception as e: - logger.error("run_output_handler failed", e) - self._errored = True - self._propagate_error() + logger.error("AsyncLLM._run_output_handler failed", e) + self._set_errored_and_propagate() raise EngineDeadError() from e - def _propagate_error(self): + def _set_errored_and_propagate(self): """Propagate to all generate() tasks.""" + self._errored = True - # Put EngineDeadError() into each generate()'s queue, - # each of which will raise in their own context. + # Put EngineDeadError() into each generate() task's queue, + # each of which will raise it in their own context. 
for _, q in self.rid_to_queue.items(): q.put_nowait(EngineDeadError()) From ea6824ae35fcee912f4abe42f246117c1bbe3f24 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 16:57:08 +0000 Subject: [PATCH 012/130] updated --- vllm/v1/engine/core_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b665b1b7407b..8412de226f4c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -181,7 +181,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): - super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -215,7 +214,6 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], From b278065ccf9d86bf35d3b0f2c7fde43a2d681af0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 17:00:01 +0000 Subject: [PATCH 013/130] remove cruft --- vllm/v1/engine/core_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 8412de226f4c..62c284577007 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -218,7 +218,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): - super().__init__( asyncio_mode=True, vllm_config=vllm_config, From c004bd47823a44d79b592f4da2ecbb0f6334f220 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 17:06:40 +0000 Subject: [PATCH 014/130] cruft --- vllm/v1/engine/core_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 62c284577007..bf3a6f60c0c5 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -214,6 +214,7 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], From 2556bc44823a95b2181de4ff152a41ea7d0bc9b2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 19:46:36 +0000 Subject: [PATCH 015/130] stash --- vllm/distributed/parallel_state.py | 5 +++++ vllm/entrypoints/launcher.py | 6 +----- vllm/model_executor/models/llama.py | 4 ++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 8 +++----- vllm/v1/executor/multiproc_executor.py | 16 +++++++++++++--- 6 files changed, 27 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a0d4235460f3..5d9549b1f74e 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -827,6 +827,7 @@ def recv(self, def destroy(self): if self.device_group is not None: + print(f"{self.device_group}") torch.distributed.destroy_process_group(self.device_group) self.device_group = None if self.cpu_group is not None: @@ -1160,13 +1161,16 @@ def get_tensor_model_parallel_rank(): def destroy_model_parallel(): """Set the groups to none and destroy them.""" + global _TP if _TP: + print("calling TP.destroy()") _TP.destroy() _TP = None global _PP if _PP: + print("calling PP.destroy()") _PP.destroy() _PP = None @@ -1174,6 +1178,7 @@ def destroy_model_parallel(): def destroy_distributed_environment(): global _WORLD if _WORLD: + 
print("calling WORLD.destroy()") _WORLD.destroy() _WORLD = None if torch.distributed.is_initialized(): diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 8512fe135c66..621012a800a4 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -84,11 +84,7 @@ def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): # because handler must first return to close the connection # for this request. engine_errored = engine.errored and not engine.is_running - is_already_exiting = server.should_exit - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored - and not is_already_exiting): - # Avoid spamming the logs by only sending once. - logger.fatal("Engine failed, terminating server.") + if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored): server.should_exit = True diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8623da99574b..704dd6aae60a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -509,6 +509,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + self.i = 0 config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -566,6 +567,9 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: + # if self.i == 100 and get_tensor_model_parallel_rank() == 0: + # raise RuntimeError("ERROR IN LLAMA!") + # self.i += 1 model_output = self.model(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, inputs_embeds) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 9a0e5ec3f91a..a76918a8256f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -326,7 +326,7 @@ async def _run_output_handler(self): raise except Exception as e: - logger.error("AsyncLLM._run_output_handler failed", e) + logger.fatal("AsyncLLM._run_output_handler failed") self._set_errored_and_propagate() raise EngineDeadError() from e diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c3244c96e33a..4ac7bc1041fb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,6 +198,8 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() logger.error("EngineCore hit an exception: %s", traceback) + engine_core.shutdown() + engine_core = None parent_process.send_signal(signal.SIGUSR1) finally: @@ -207,12 +209,8 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until process is sent a SIGINT or SIGTERM - i = 0 + # Loop until process is sent a SIGINT or SIGTERM. while True: - if i == 10: - raise ValueError("TEST RUN") - i += 1 # 1) Poll the input queue until there is work to do. 
if not self.scheduler.has_unfinished_requests(): while True: diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 41e6abbd6795..952705401934 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -193,7 +193,8 @@ def wait_for_termination(procs, timeout): active_procs = [w.proc for w in self.workers if w.proc.is_alive()] for p in active_procs: p.terminate() - if not wait_for_termination(active_procs, 4): + if not wait_for_termination(active_procs, 100): + # Send SIGKILL if still running active_procs = [p for p in active_procs if p.is_alive()] for p in active_procs: @@ -210,7 +211,7 @@ def _cleanup_sockets(self): def shutdown(self): """Properly shut down the executor and its workers""" - if getattr(self, 'shutting_down', False): + if not getattr(self, 'shutting_down', False): self.shutting_down = True for w in self.workers: w.worker_response_mq = None @@ -313,8 +314,11 @@ def make_worker_process( def shutdown(self): self.rpc_broadcast_mq = None self.worker_response_mq = None + print(f"destroy_model_parallel PID: {os.getpid()}") destroy_model_parallel() + print(f"destroy_distributed_environment PID: {os.getpid()}") destroy_distributed_environment() + print(f"done with shutdown PID: {os.getpid()}") @staticmethod def worker_main(*args, **kwargs): @@ -348,7 +352,7 @@ def signal_handler(signum, frame): worker.worker_busy_loop() except SystemExit: - logger.debug("Worker interrupted.") + logger.info("Worker interrupted.") except Exception: # worker_busy_loop sends exceptions exceptons to Executor @@ -358,10 +362,12 @@ def signal_handler(signum, frame): raise finally: + print(f"IN WORKER FINALLY. RANK: {kwargs["rank"]} PID: {os.getpid()}") # Clean up once worker exits busy loop if worker is not None: worker.shutdown() worker = None + print(f"DONE W WORKER FINALLY. 
RANK: {kwargs["rank"]} PID: {os.getpid()}") @staticmethod def wait_for_startup( @@ -390,10 +396,14 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" + i = 0 while True: method, args, kwargs = self.rpc_broadcast_mq.dequeue() try: + if i == 10 and self.rank == 0: + raise ValueError + i+=1 output = getattr(self.worker, method)(*args, **kwargs) except Exception as e: self.worker_response_mq.enqueue( From db0b9e673fbf3311885038d92a815cd45fd8f6cb Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:13:27 +0000 Subject: [PATCH 016/130] fix llama --- vllm/distributed/parallel_state.py | 5 ----- vllm/model_executor/models/llama.py | 4 ---- vllm/v1/executor/multiproc_executor.py | 31 +++++++++++++------------- 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5d9549b1f74e..a0d4235460f3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -827,7 +827,6 @@ def recv(self, def destroy(self): if self.device_group is not None: - print(f"{self.device_group}") torch.distributed.destroy_process_group(self.device_group) self.device_group = None if self.cpu_group is not None: @@ -1161,16 +1160,13 @@ def get_tensor_model_parallel_rank(): def destroy_model_parallel(): """Set the groups to none and destroy them.""" - global _TP if _TP: - print("calling TP.destroy()") _TP.destroy() _TP = None global _PP if _PP: - print("calling PP.destroy()") _PP.destroy() _PP = None @@ -1178,7 +1174,6 @@ def destroy_model_parallel(): def destroy_distributed_environment(): global _WORLD if _WORLD: - print("calling WORLD.destroy()") _WORLD.destroy() _WORLD = None if torch.distributed.is_initialized(): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 704dd6aae60a..8623da99574b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -509,7 +509,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.i = 0 config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -567,9 +566,6 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - # if self.i == 100 and get_tensor_model_parallel_rank() == 0: - # raise RuntimeError("ERROR IN LLAMA!") - # self.i += 1 model_output = self.model(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, inputs_embeds) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 952705401934..a7f702ab718d 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -42,9 +42,6 @@ def __init__(self, vllm_config: VllmConfig) -> None: # The child processes will send SIGUSR1 when unrecoverable # errors happen. def sigusr1_handler(signum, frame): - logger.fatal( - "MulitprocExecutor got fatal signal from worker processes, " - "shutting down. See stack trace above for root cause issue.") # Propagate error up to parent process. 
parent_process = psutil.Process().parent() parent_process.send_signal(signal.SIGUSR1) @@ -193,7 +190,7 @@ def wait_for_termination(procs, timeout): active_procs = [w.proc for w in self.workers if w.proc.is_alive()] for p in active_procs: p.terminate() - if not wait_for_termination(active_procs, 100): + if not wait_for_termination(active_procs, 4): # Send SIGKILL if still running active_procs = [p for p in active_procs if p.is_alive()] @@ -314,11 +311,8 @@ def make_worker_process( def shutdown(self): self.rpc_broadcast_mq = None self.worker_response_mq = None - print(f"destroy_model_parallel PID: {os.getpid()}") destroy_model_parallel() - print(f"destroy_distributed_environment PID: {os.getpid()}") destroy_distributed_environment() - print(f"done with shutdown PID: {os.getpid()}") @staticmethod def worker_main(*args, **kwargs): @@ -352,22 +346,27 @@ def signal_handler(signum, frame): worker.worker_busy_loop() except SystemExit: - logger.info("Worker interrupted.") + logger.debug("Worker interrupted.") + + except Exception as e: + # Log rather than raise so the stack trace is in order. + logger.exception("WorkerProc got an Exception:", exc_info=e) + + # The parent will send a SIGTERM to all worker processes + # after we send SIGUSR. Set this value so we don't re-throw + # SystemExit(), to avoid zmq Exceptions during shyt + shutdown_requested = True - except Exception: # worker_busy_loop sends exceptions exceptons to Executor # for shutdown, but if there is an error in startup or an # error with IPC itself, we need to alert the parent. psutil.Process().parent().send_signal(signal.SIGUSR1) - raise finally: - print(f"IN WORKER FINALLY. RANK: {kwargs["rank"]} PID: {os.getpid()}") # Clean up once worker exits busy loop if worker is not None: worker.shutdown() worker = None - print(f"DONE W WORKER FINALLY. RANK: {kwargs["rank"]} PID: {os.getpid()}") @staticmethod def wait_for_startup( @@ -401,15 +400,15 @@ def worker_busy_loop(self): method, args, kwargs = self.rpc_broadcast_mq.dequeue() try: - if i == 10 and self.rank == 0: - raise ValueError - i+=1 output = getattr(self.worker, method)(*args, **kwargs) except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue - + + if i == 10 and self.rank == 0: + raise ValueError + i+=1 self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.SUCCESS, output)) From f72258961fac00cf32a71a4eae008d26c65d92b2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:15:41 +0000 Subject: [PATCH 017/130] updated --- vllm/v1/engine/async_llm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a76918a8256f..5f7c14ae7b59 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -58,9 +58,8 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # EngineCore and Worker processes send SIGUSR1 when - # unrecoverable errors occur. Start the shutdown - # process if this occurs. + # Background processes send SIGUSR1 when unrecoverable + # errors occur. Start the shutdown process if this happens. def sigusr1_handler(): logger.fatal("AsyncLLM got fatal signal from worker process, " "shutting down. 
See stack trace for root cause.") From de75cc44a2a4162a2ef398ad0f729d139e78c96c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:16:35 +0000 Subject: [PATCH 018/130] cruft --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 4ac7bc1041fb..200c8184b0ba 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -209,7 +209,7 @@ def signal_handler(signum, frame): def run_busy_loop(self): """Core busy loop of the EngineCore.""" - # Loop until process is sent a SIGINT or SIGTERM. + # Loop until process is sent a SIGINT or SIGTERM while True: # 1) Poll the input queue until there is work to do. if not self.scheduler.has_unfinished_requests(): From ba5ca87ccd1e4a2138f8de63d46c68648123be76 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:17:41 +0000 Subject: [PATCH 019/130] cruft --- vllm/v1/executor/multiproc_executor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index a7f702ab718d..0c7b0d792f58 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -191,7 +191,6 @@ def wait_for_termination(procs, timeout): for p in active_procs: p.terminate() if not wait_for_termination(active_procs, 4): - # Send SIGKILL if still running active_procs = [p for p in active_procs if p.is_alive()] for p in active_procs: From 4f6b68a3c03a28177770355cd261d7ca3aac5030 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:29:58 +0000 Subject: [PATCH 020/130] updated --- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/llm_engine.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 200c8184b0ba..5f5835f8cb6f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -221,7 +221,7 @@ def run_busy_loop(self): except queue.Empty: self._log_stats() logger.debug("EngineCore busy loop waiting.") - except BaseException: + except Exception: raise # 2) Handle any new client requests (Abort or Add). diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 0bd9b52c9be8..04a899e614ec 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,5 +1,5 @@ +import signal from typing import Dict, List, Mapping, Optional, Type, Union - from typing_extensions import TypeVar from vllm.config import VllmConfig @@ -44,6 +44,16 @@ def __init__( ) -> None: self.model_config = vllm_config.model_config + # Background processes send SIGUSR1 when unrecoverable + # errors occur. Start the shutdown process if this happens. + def sigusr1_handler(): + logger.fatal("LLMEngine got fatal signal from worker process, " + "shutting down. See stack trace for root cause.") + self._set_errored_and_propagate() + + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler) + # Tokenizer (+ ensure liveness if running in another process). 
self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -201,3 +211,9 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group + + def shutdown(self): + """Shutdown, cleaning up the background proc and IPC.""" + + if engine_core := getattr(self, "engine_core", None): + engine_core.shutdown() From 949d4253a7997596411336204ef1aefa3fc77fe4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:38:10 +0000 Subject: [PATCH 021/130] updated --- vllm/v1/engine/llm_engine.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 04a899e614ec..2693be7741d6 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -46,13 +46,15 @@ def __init__( # Background processes send SIGUSR1 when unrecoverable # errors occur. Start the shutdown process if this happens. + # NOTE: signal_handlers must be created and run in the main + # python thread, a workaround for this would be using polling + # rather than signal handling to detect a shutdown. Investigate. def sigusr1_handler(): logger.fatal("LLMEngine got fatal signal from worker process, " "shutting down. See stack trace for root cause.") - self._set_errored_and_propagate() + self.shutdown() - asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, - sigusr1_handler) + signal().add_signal_handler(signal.SIGUSR1, sigusr1_handler) # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( From f67398bd3f33ff2280cccf2d8afa47d22ee031e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 20:39:09 +0000 Subject: [PATCH 022/130] updated --- vllm/v1/engine/llm_engine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 2693be7741d6..98ddc6c77b80 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -49,12 +49,12 @@ def __init__( # NOTE: signal_handlers must be created and run in the main # python thread, a workaround for this would be using polling # rather than signal handling to detect a shutdown. Investigate. - def sigusr1_handler(): - logger.fatal("LLMEngine got fatal signal from worker process, " - "shutting down. See stack trace for root cause.") + def sigusr1_handler(signum, frame): + logger.fatal("LLMEngine go fatal signal from worker, shutting " + "down. See stack trace above for root cause issue.") self.shutdown() - signal().add_signal_handler(signal.SIGUSR1, sigusr1_handler) + signal.signal(signal.SIGUSR1, sigusr1_handler) # Tokenizer (+ ensure liveness if running in another process). 
self.tokenizer = init_tokenizer_from_configs( From b3d29946038315c297f470c32ffef88eed8eb75c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:16:28 +0000 Subject: [PATCH 023/130] updated --- vllm/v1/engine/async_llm.py | 12 +----------- vllm/v1/engine/core.py | 3 +-- vllm/v1/engine/core_client.py | 19 ++++++++++++++++++- vllm/v1/engine/llm_engine.py | 21 +-------------------- vllm/v1/executor/multiproc_executor.py | 19 ++++++++++++------- 5 files changed, 33 insertions(+), 41 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5f7c14ae7b59..eff6aa73736a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -19,6 +19,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor from vllm.v1.executor.ray_utils import initialize_ray_cluster @@ -26,17 +27,6 @@ logger = init_logger(__name__) -# Raised when a generate() fails. Possibly Recoverable. -class EngineGenerateError(Exception): - pass - - -# Raised when the engine dies, typically by the -# background output handler loop. Unrecoverable. -class EngineDeadError(Exception): - pass - - class AsyncLLM(EngineClient): def __init__( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5f5835f8cb6f..5761210b6a1a 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,13 +198,12 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() logger.error("EngineCore hit an exception: %s", traceback) - engine_core.shutdown() - engine_core = None parent_process.send_signal(signal.SIGUSR1) finally: if engine_core is not None: engine_core.shutdown() + engine_core = None def run_busy_loop(self): """Core busy loop of the EngineCore.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index bf3a6f60c0c5..2a027300c401 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,4 @@ +import signal import weakref from abc import ABC, abstractmethod from typing import List, Type @@ -13,6 +14,7 @@ EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.engine.exceptions import engine_dead_error_guard from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import BackgroundProcHandle @@ -181,6 +183,20 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + + # Background procs sent SIGUSR1 if they hit error. + # We handle this by setting the _errored state to True + # and shutting down. Once _errored, we convert any + # Exceptions into an EngineDeadError for UX. 
+ def sigusr1_handler(signum, frame): + logger.fatal("LLMEngine got fatal signal from background " + "process, starting shutting down.") + self._errored = True + self.shutdown() + + signal.signal(signal.SIGUSR1, sigusr1_handler) + self._errored = False + super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -188,12 +204,14 @@ def __init__(self, log_stats=log_stats, ) + @engine_dead_error_guard def get_output(self) -> List[EngineCoreOutput]: (frame, ) = self.output_socket.recv_multipart(copy=False) engine_core_outputs = self.decoder.decode(frame.buffer).outputs return engine_core_outputs + @engine_dead_error_guard def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -227,7 +245,6 @@ def __init__(self, ) async def get_output_async(self) -> List[EngineCoreOutput]: - frames = await self.output_socket.recv_multipart(copy=False) engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 98ddc6c77b80..119d1eeabfc9 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,5 +1,5 @@ -import signal from typing import Dict, List, Mapping, Optional, Type, Union + from typing_extensions import TypeVar from vllm.config import VllmConfig @@ -44,18 +44,6 @@ def __init__( ) -> None: self.model_config = vllm_config.model_config - # Background processes send SIGUSR1 when unrecoverable - # errors occur. Start the shutdown process if this happens. - # NOTE: signal_handlers must be created and run in the main - # python thread, a workaround for this would be using polling - # rather than signal handling to detect a shutdown. Investigate. - def sigusr1_handler(signum, frame): - logger.fatal("LLMEngine go fatal signal from worker, shutting " - "down. See stack trace above for root cause issue.") - self.shutdown() - - signal.signal(signal.SIGUSR1, sigusr1_handler) - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -160,7 +148,6 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: - # 1) Process raw inputs into the request. request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, @@ -213,9 +200,3 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group - - def shutdown(self): - """Shutdown, cleaning up the background proc and IPC.""" - - if engine_core := getattr(self, "engine_core", None): - engine_core.shutdown() diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 0c7b0d792f58..cbc025f2c41a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -17,6 +17,7 @@ destroy_model_parallel) from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) +from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING from vllm.executor.multiproc_worker_utils import ( _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger @@ -39,13 +40,17 @@ def __init__(self, vllm_config: VllmConfig) -> None: # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) - # The child processes will send SIGUSR1 when unrecoverable - # errors happen. + # WorkerProcs send SIGUSR1 if they get an Error. 
def sigusr1_handler(signum, frame): - # Propagate error up to parent process. - parent_process = psutil.Process().parent() - parent_process.send_signal(signal.SIGUSR1) + logger.fatal("MultiprocExecutor got fatal signal from " + "background process, starting shutdown.") + # Shutdown first (avoid SysExit exceptions in __del__). self.shutdown() + # TODO(rob): move this to the VLLMConfig. + if VLLM_ENABLE_V1_MULTIPROCESSING: + # Propagate up if using the mp engine. Note that + # sending in non-mp mode crashes caller process. + psutil.Process().parent().send_signal(signal.SIGUSR1) signal.signal(signal.SIGUSR1, sigusr1_handler) @@ -405,9 +410,9 @@ def worker_busy_loop(self): (WorkerProc.ResponseStatus.FAILURE, e)) logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue - + if i == 10 and self.rank == 0: raise ValueError - i+=1 + i += 1 self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.SUCCESS, output)) From 34a997a2cbf871a716783c2477a3e9779b53b58e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:21:47 +0000 Subject: [PATCH 024/130] update comment --- vllm/v1/engine/core_client.py | 6 +++++- vllm/v1/executor/multiproc_executor.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 2a027300c401..641e3c0d0284 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -184,6 +184,11 @@ def __init__(self, executor_class: Type[Executor], log_stats: bool = False): + # NOTE(rob): signal handler only needed for SyncMPClient + # because AsyncLLM needs to handle the signal rather + # than the AsyncMPClient. TODO(follow-up): move the defn of + # these functions to async_llm.py and llm_engine.py to make + # distinction clearer. # Background procs sent SIGUSR1 if they hit error. # We handle this by setting the _errored state to True # and shutting down. Once _errored, we convert any @@ -252,7 +257,6 @@ async def get_output_async(self) -> List[EngineCoreOutput]: async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: - msg = (request_type.value, self.encoder.encode(request)) await self.input_socket.send_multipart(msg, copy=False) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index cbc025f2c41a..f4cc73cb6376 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -353,12 +353,13 @@ def signal_handler(signum, frame): logger.debug("Worker interrupted.") except Exception as e: - # Log rather than raise so the stack trace is in order. + # Log rather than raise so the stack trace is in order of + # WorkerProc -> EngineCore -> AsyncLLM. logger.exception("WorkerProc got an Exception:", exc_info=e) # The parent will send a SIGTERM to all worker processes # after we send SIGUSR. Set this value so we don't re-throw - # SystemExit(), to avoid zmq Exceptions during shyt + # SystemExit(), to avoid zmq exceptions during __del__. 
shutdown_requested = True # worker_busy_loop sends exceptions exceptons to Executor From 32cf91b52169941b30a5c464f8f6d3d8af35984b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:23:52 +0000 Subject: [PATCH 025/130] update comment --- vllm/entrypoints/launcher.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 621012a800a4..d5d000e28016 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -66,23 +66,25 @@ async def dummy_shutdown() -> None: async def watchdog_loop(server: uvicorn.Server, engine: EngineClient): - # Background task that runs in the background, checking - # for error state in the engine. This is needed for a - # clean shutdown since we cannot raise an Exception in - # a StreamingResponse generator() meaning we cannot use - # the exception handlers below. - VLLM_WATCHDOG_TIME_S = 3.0 + """ + # Watchdog task that runs in the background, checking + # for error state in the engine. Needed to trigger shutdown + # if an exception arises is StreamingResponse() generator. + """ + VLLM_WATCHDOG_TIME_S = 5.0 while True: await asyncio.sleep(VLLM_WATCHDOG_TIME_S) terminate_if_errored(server, engine) def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): - # See discussions here on shutting down a uvicorn server - # https://github.com/encode/uvicorn/discussions/1103 - # In this case we cannot await the server shutdown here - # because handler must first return to close the connection - # for this request. + """ + See discussions here on shutting down a uvicorn server + https://github.com/encode/uvicorn/discussions/1103 + In this case we cannot await the server shutdown here + because handler must first return to close the connection + for this request. + """ engine_errored = engine.errored and not engine.is_running if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored): server.should_exit = True From c73801c703a51e881e7dceca60ab62451bbedb82 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:25:49 +0000 Subject: [PATCH 026/130] fix more --- vllm/v1/engine/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5761210b6a1a..391aa59beb73 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -203,7 +203,6 @@ def signal_handler(signum, frame): finally: if engine_core is not None: engine_core.shutdown() - engine_core = None def run_busy_loop(self): """Core busy loop of the EngineCore.""" From 11888451fb593a3daad23390fb03254189a080e7 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:35:03 +0000 Subject: [PATCH 027/130] updated --- vllm/v1/engine/llm_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 119d1eeabfc9..0bd9b52c9be8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -148,6 +148,7 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: + # 1) Process raw inputs into the request. 
request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, From 706782c899f6bc75f99796efd0791382f8fe5e23 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:35:42 +0000 Subject: [PATCH 028/130] udpatd --- vllm/v1/engine/core_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 641e3c0d0284..a461f82e418a 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -250,6 +250,7 @@ def __init__(self, ) async def get_output_async(self) -> List[EngineCoreOutput]: + frames = await self.output_socket.recv_multipart(copy=False) engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs @@ -257,6 +258,7 @@ async def get_output_async(self) -> List[EngineCoreOutput]: async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: + msg = (request_type.value, self.encoder.encode(request)) await self.input_socket.send_multipart(msg, copy=False) From 1cc09156ebb4edf15db6281be82ea2ffce3fd77a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:36:26 +0000 Subject: [PATCH 029/130] added exception file --- vllm/v1/engine/exceptions.py | 44 ++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 vllm/v1/engine/exceptions.py diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py new file mode 100644 index 000000000000..d190613b2cf1 --- /dev/null +++ b/vllm/v1/engine/exceptions.py @@ -0,0 +1,44 @@ +# There exceptions are raised by the LLMEngine and AsyncLLM +# when errors occur. See vllm/entrypoints/launcher.py for the +# handlers of these exceptions in the API Server. + +# Raised when a AsyncLLM.generate() fails. Possibly recoverable. +class EngineGenerateError(Exception): + pass + +# Raised when the EngineCore dies. Unrecoverable. +class EngineDeadError(Exception): + pass + +def engine_dead_error_guard(func): + """ + Decorator to be used by functions that call engine_core. + engine_core runs in a background process and sends a fatal + signal to the LLMEngine if it encounters an error. The + LLMEngine handles this signal, sets self._errored, and then + calls self.shutdown(), which kills engine_core. + + After the signal is handled, we will get an exception if + we try to interact with the engine_core. This decorator + catches the exception and raises an a more accurate + EngineDeadError exception to make the fundamental issue + clearer to the end user. + """ + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + # NOTE: args[0] is self (EngineCoreMPClient) + if not args[0]._errored: + raise e + else: + new_e = EngineDeadError( + "Engine got error in background worker process. " + "See stack trace for root cause issue.") + # Convert the exception to EngineDeadError to give the + # user a clear failure reason, suppressing. 
+ # https://docs.python.org/3/library/exceptions.html#exception-context # noqa: E501 + new_e.__suppress_context__ = True + raise new_e from None + + return wrapper \ No newline at end of file From 8db0eee569ab4593954168c58e7404750ab0e5a5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:43:31 +0000 Subject: [PATCH 030/130] updated --- vllm/entrypoints/launcher.py | 2 +- vllm/v1/engine/async_llm.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index d5d000e28016..ad2b725b49bb 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -12,7 +12,7 @@ from vllm.engine.protocol import EngineClient from vllm.logger import init_logger from vllm.utils import find_process_using_port -from vllm.v1.engine.async_llm import EngineDeadError, EngineGenerateError +from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError logger = init_logger(__name__) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index eff6aa73736a..c29f1f6d6c23 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -51,8 +51,9 @@ def __init__( # Background processes send SIGUSR1 when unrecoverable # errors occur. Start the shutdown process if this happens. def sigusr1_handler(): - logger.fatal("AsyncLLM got fatal signal from worker process, " - "shutting down. See stack trace for root cause.") + logger.fatal( + "AsyncLLM got fatal signal from background process, " + "starting shutdown. See stack trace for root cause.") self._set_errored_and_propagate() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, From 2fc8af62b72fb0eff201e07f5f7fe63f02b3246f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 4 Jan 2025 23:52:41 +0000 Subject: [PATCH 031/130] fixt --- vllm/v1/engine/async_llm.py | 5 ++--- vllm/v1/engine/exceptions.py | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c29f1f6d6c23..51bd7baf1d53 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -51,9 +51,8 @@ def __init__( # Background processes send SIGUSR1 when unrecoverable # errors occur. Start the shutdown process if this happens. def sigusr1_handler(): - logger.fatal( - "AsyncLLM got fatal signal from background process, " - "starting shutdown. See stack trace for root cause.") + logger.fatal("AsyncLLM got fatal signal from background process, " + "starting shutdown. See stack trace for root cause.") self._set_errored_and_propagate() asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index d190613b2cf1..ac554aa4bc23 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,15 +1,13 @@ -# There exceptions are raised by the LLMEngine and AsyncLLM -# when errors occur. See vllm/entrypoints/launcher.py for the -# handlers of these exceptions in the API Server. - # Raised when a AsyncLLM.generate() fails. Possibly recoverable. class EngineGenerateError(Exception): pass + # Raised when the EngineCore dies. Unrecoverable. class EngineDeadError(Exception): pass + def engine_dead_error_guard(func): """ Decorator to be used by functions that call engine_core. @@ -24,6 +22,7 @@ def engine_dead_error_guard(func): EngineDeadError exception to make the fundamental issue clearer to the end user. 
""" + def wrapper(*args, **kwargs): try: return func(*args, **kwargs) @@ -41,4 +40,4 @@ def wrapper(*args, **kwargs): new_e.__suppress_context__ = True raise new_e from None - return wrapper \ No newline at end of file + return wrapper From de39af149f73eadcfdb784079ca4048260a4d7d8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:10:51 +0000 Subject: [PATCH 032/130] reduce cruft --- vllm/engine/multiprocessing/client.py | 4 ++++ vllm/engine/protocol.py | 4 ++++ vllm/v1/engine/async_llm.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 329bcc3d1ff6..0a046c71e86e 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -402,6 +402,10 @@ async def check_health(self): def is_running(self) -> bool: return not self.errored + @property + def is_stopped(self) -> bool: + return self.errored + @property def errored(self) -> bool: return self._errored_with is not None diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index b2a5cc17ead6..9f58d61708fb 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,6 +29,10 @@ class EngineClient(ABC): def is_running(self) -> bool: ... + @abstractmethod + def is_stopped(self) -> bool: + ... + @property @abstractmethod def errored(self) -> bool: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 51bd7baf1d53..ce41c4946d71 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -390,6 +390,10 @@ async def stop_profile(self) -> None: def is_running(self) -> bool: return not self.errored + @property + def is_stopped(self) -> bool: + return False + @property def errored(self) -> bool: return self._errored From 732ba645d76f0c15f8cb20f7c1ff76004f3ac63c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:11:04 +0000 Subject: [PATCH 033/130] reduce cruft --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ce41c4946d71..434e2a7945d3 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -392,7 +392,7 @@ def is_running(self) -> bool: @property def is_stopped(self) -> bool: - return False + return self.errored @property def errored(self) -> bool: From 437209430e5c94c79e97e6d5dd29ec5eadc7caba Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:23:58 +0000 Subject: [PATCH 034/130] cleanup --- vllm/engine/protocol.py | 1 + vllm/v1/engine/core_client.py | 13 ++++++--- vllm/v1/engine/exceptions.py | 50 +++++++++-------------------------- 3 files changed, 24 insertions(+), 40 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 9f58d61708fb..a066836b9270 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -29,6 +29,7 @@ class EngineClient(ABC): def is_running(self) -> bool: ... + @property @abstractmethod def is_stopped(self) -> bool: ... 
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a461f82e418a..5337237b26d2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -208,13 +208,20 @@ def sigusr1_handler(signum, frame): executor_class=executor_class, log_stats=log_stats, ) + + def _handle_exception(self, e: Exception): + + @engine_dead_error_guard def get_output(self) -> List[EngineCoreOutput]: - (frame, ) = self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frame.buffer).outputs - return engine_core_outputs + try: + (frame, ) = self.output_socket.recv_multipart(copy=False) + return self.decoder.decode(frame.buffer).outputs + except Exception as e: + if self._errored + @engine_dead_error_guard def _send_input(self, request_type: EngineCoreRequestType, diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index ac554aa4bc23..965c8441373e 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,43 +1,19 @@ -# Raised when a AsyncLLM.generate() fails. Possibly recoverable. class EngineGenerateError(Exception): + """Raised when a AsyncLLM.generate() fails. Maybe recoverable.""" pass -# Raised when the EngineCore dies. Unrecoverable. class EngineDeadError(Exception): - pass - - -def engine_dead_error_guard(func): - """ - Decorator to be used by functions that call engine_core. - engine_core runs in a background process and sends a fatal - signal to the LLMEngine if it encounters an error. The - LLMEngine handles this signal, sets self._errored, and then - calls self.shutdown(), which kills engine_core. - - After the signal is handled, we will get an exception if - we try to interact with the engine_core. This decorator - catches the exception and raises an a more accurate - EngineDeadError exception to make the fundamental issue - clearer to the end user. - """ - - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - # NOTE: args[0] is self (EngineCoreMPClient) - if not args[0]._errored: - raise e - else: - new_e = EngineDeadError( - "Engine got error in background worker process. " - "See stack trace for root cause issue.") - # Convert the exception to EngineDeadError to give the - # user a clear failure reason, suppressing. - # https://docs.python.org/3/library/exceptions.html#exception-context # noqa: E501 - new_e.__suppress_context__ = True - raise new_e from None + """Raised when the EngineCore dies. Unrecoverable.""" + def __init__(self, + *args, + suppress_context: bool = False, + **kwargs): + super().__init__(args, kwargs) - return wrapper + # If we get an EngineDead signal when using LLMEngine, + # we often shutdown the EngineCore while the main + # process is still using ZMQ. This makes the root + # cause clear in the stack trace. 
+ if suppress_context: + self.__suppress_context__ = True \ No newline at end of file From b9144a34e02b91d62cd6c20d4bf60c7ba9eb7fad Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:42:22 +0000 Subject: [PATCH 035/130] updated --- vllm/v1/engine/core_client.py | 40 +++++++++++++++++------------------ vllm/v1/engine/exceptions.py | 8 +++---- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5337237b26d2..10138c37e310 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -14,7 +14,7 @@ EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) from vllm.v1.engine.core import EngineCore, EngineCoreProc -from vllm.v1.engine.exceptions import engine_dead_error_guard +from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder from vllm.v1.utils import BackgroundProcHandle @@ -184,15 +184,13 @@ def __init__(self, executor_class: Type[Executor], log_stats: bool = False): - # NOTE(rob): signal handler only needed for SyncMPClient + # TODO(rob): signal handler only needed for SyncMPClient # because AsyncLLM needs to handle the signal rather - # than the AsyncMPClient. TODO(follow-up): move the defn of - # these functions to async_llm.py and llm_engine.py to make - # distinction clearer. + # than the AsyncMPClient. TODO(rob): move the Client def + # to async_llm and llm_engine to make this clearer. # Background procs sent SIGUSR1 if they hit error. - # We handle this by setting the _errored state to True - # and shutting down. Once _errored, we convert any - # Exceptions into an EngineDeadError for UX. + # Handle by setting _errored=True and shutting down. + # Next action taken will raise EngineDeadError. def sigusr1_handler(signum, frame): logger.fatal("LLMEngine got fatal signal from background " "process, starting shutting down.") @@ -208,28 +206,30 @@ def sigusr1_handler(signum, frame): executor_class=executor_class, log_stats=log_stats, ) - - def _handle_exception(self, e: Exception): - + def _format_exception(self, e: Exception) -> Exception: + # If we are in the _errored state, raise EngineDeadError + # so the root cause is clear in the stack trace. + return (EngineDeadError( + "EngineCore encountered an issue. 
See stack trace " + "for the root cause.", + suppress_context=True) if self._errored else e) - @engine_dead_error_guard def get_output(self) -> List[EngineCoreOutput]: - try: (frame, ) = self.output_socket.recv_multipart(copy=False) return self.decoder.decode(frame.buffer).outputs except Exception as e: - if self._errored + raise self._format_exception(e) from None - - @engine_dead_error_guard def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: - - # (RequestType, SerializedRequest) - msg = (request_type.value, self.encoder.encode(request)) - self.input_socket.send_multipart(msg, copy=False) + try: + # (RequestType, SerializedRequest) + msg = (request_type.value, self.encoder.encode(request)) + self.input_socket.send_multipart(msg, copy=False) + except Exception as e: + raise self._format_exception(e) from None def add_request(self, request: EngineCoreRequest) -> None: self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 965c8441373e..523416ca384d 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -5,10 +5,8 @@ class EngineGenerateError(Exception): class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" - def __init__(self, - *args, - suppress_context: bool = False, - **kwargs): + + def __init__(self, *args, suppress_context: bool = False, **kwargs): super().__init__(args, kwargs) # If we get an EngineDead signal when using LLMEngine, @@ -16,4 +14,4 @@ def __init__(self, # process is still using ZMQ. This makes the root # cause clear in the stack trace. if suppress_context: - self.__suppress_context__ = True \ No newline at end of file + self.__suppress_context__ = True From d90e122e2047228847f171c3ce7a519649a0a48f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:43:46 +0000 Subject: [PATCH 036/130] cruft --- vllm/v1/executor/multiproc_executor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index f4cc73cb6376..ab96842210fa 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -40,10 +40,12 @@ def __init__(self, vllm_config: VllmConfig) -> None: # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) - # WorkerProcs send SIGUSR1 if they get an Error. + # The child processes will send SIGUSR1 when unrecoverable + # errors happen. def sigusr1_handler(signum, frame): - logger.fatal("MultiprocExecutor got fatal signal from " - "background process, starting shutdown.") + logger.fatal( + "MulitprocExecutor got fatal signal from worker processes, " + "shutting down. See stack trace above for root cause issue.") # Shutdown first (avoid SysExit exceptions in __del__). self.shutdown() # TODO(rob): move this to the VLLMConfig. 
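Editorial aside: the _format_exception / EngineDeadError conversion added in PATCH 035 above is easier to follow in isolation. Below is a minimal, runnable sketch of the same pattern; FlakyTransport and ClientSketch are made-up stand-ins for the ZMQ socket and the MPClient (not vLLM code). Once _errored has been set by the signal handler, any later IPC failure is re-raised as EngineDeadError, with the unrelated transport traceback suppressed, so the user sees the real failure reason.

class EngineDeadError(Exception):
    """Illustrative copy of the idea: raised once the engine has died."""

    def __init__(self, *args, suppress_context: bool = False):
        super().__init__(*args)
        # Hide the irrelevant transport error from the traceback.
        self.__suppress_context__ = suppress_context


class FlakyTransport:
    """Hypothetical stand-in for the ZMQ socket used by the client."""

    def recv(self) -> bytes:
        raise ConnectionError("socket closed")


class ClientSketch:
    def __init__(self) -> None:
        self._errored = False
        self._socket = FlakyTransport()

    def _format_exception(self, e: Exception) -> Exception:
        # If the background process already died, surface that fact
        # instead of whatever IPC error was raised afterwards.
        return (EngineDeadError(
            "EngineCore died; see earlier logs for the root cause.",
            suppress_context=True) if self._errored else e)

    def get_output(self) -> bytes:
        try:
            return self._socket.recv()
        except Exception as e:
            raise self._format_exception(e) from None


client = ClientSketch()
client._errored = True  # pretend the SIGUSR1 handler already ran
try:
    client.get_output()
except EngineDeadError as err:
    print(type(err).__name__, err)

Running the sketch prints an EngineDeadError rather than the ConnectionError, which is the user-experience goal of the patch.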
From 2bbac313c2e71d500fed6dbfd576ef861e6f7bca Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 17:48:32 +0000 Subject: [PATCH 037/130] updated --- vllm/v1/engine/core_client.py | 3 +-- vllm/v1/engine/exceptions.py | 10 +++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 10138c37e310..ddedac0be1c1 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -208,8 +208,7 @@ def sigusr1_handler(signum, frame): ) def _format_exception(self, e: Exception) -> Exception: - # If we are in the _errored state, raise EngineDeadError - # so the root cause is clear in the stack trace. + """If _errored, use EngineDeadError so root cause is clear.""" return (EngineDeadError( "EngineCore encountered an issue. See stack trace " "for the root cause.", diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 523416ca384d..5313c1e0943c 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,10 +8,6 @@ class EngineDeadError(Exception): def __init__(self, *args, suppress_context: bool = False, **kwargs): super().__init__(args, kwargs) - - # If we get an EngineDead signal when using LLMEngine, - # we often shutdown the EngineCore while the main - # process is still using ZMQ. This makes the root - # cause clear in the stack trace. - if suppress_context: - self.__suppress_context__ = True + # Make stack trace clearer when using with LLMEngine by + # silencing irrelevant ZMQError. + self.__suppress_context__ = suppress_context From c40542abb83ea1b5f40093e8710e2801ada0cf81 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 19:11:11 +0000 Subject: [PATCH 038/130] revert changes to server --- vllm/entrypoints/openai/serving_chat.py | 10 +++++----- vllm/entrypoints/openai/serving_completion.py | 8 ++++---- vllm/entrypoints/openai/serving_embedding.py | 6 +++--- vllm/entrypoints/openai/serving_pooling.py | 6 +++--- vllm/entrypoints/openai/serving_score.py | 6 +++--- vllm/entrypoints/openai/serving_tokenization.py | 2 +- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a20bf1efa08a..9ba5eeb7709c 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -171,7 +171,7 @@ async def create_chat_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -228,7 +228,7 @@ async def create_chat_completion( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -245,7 +245,7 @@ async def create_chat_completion( return await self.chat_completion_full_generator( request, result_generator, request_id, model_name, conversation, tokenizer, request_metadata) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -591,7 +591,7 @@ async def chat_completion_stream_generator( completion_tokens=num_completion_tokens, total_tokens=num_prompt_tokens + num_completion_tokens) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error 
logger.exception("Error in chat completion stream generator.") data = self.create_streaming_error_response(str(e)) @@ -618,7 +618,7 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 53ae1b134590..17197dce8da2 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -106,7 +106,7 @@ async def create_completion( truncate_prompt_tokens=request.truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -158,7 +158,7 @@ async def create_completion( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -215,7 +215,7 @@ async def create_completion( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -371,7 +371,7 @@ async def completion_stream_generator( # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index fe8ba5eb95b9..e7116a3d95d1 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -136,7 +136,7 @@ async def create_embedding( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -167,7 +167,7 @@ async def create_embedding( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -196,7 +196,7 @@ async def create_embedding( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 3441071344f4..5830322071e5 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -132,7 +132,7 @@ async def create_pooling( truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -163,7 +163,7 @@ async def create_pooling( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation 
Error return self.create_error_response(str(e)) @@ -192,7 +192,7 @@ async def create_pooling( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 9b5aa13bda84..5d3e7139d7a1 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,7 +101,7 @@ async def create_score( if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -155,7 +155,7 @@ async def create_score( ) generators.append(generator) - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -184,7 +184,7 @@ async def create_score( ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") - except Exception as e: + except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index a3dc42ff8f02..b67ecfb01316 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -86,7 +86,7 @@ async def create_tokenize( request.prompt, add_special_tokens=request.add_special_tokens, ) - except Exception as e: + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) From 46734ebbe3dcbc5bcec4347c950d0d3704d11b49 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 19:13:44 +0000 Subject: [PATCH 039/130] revert debug cruft --- vllm/v1/executor/multiproc_executor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ab96842210fa..9464744e184b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -402,7 +402,6 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" - i = 0 while True: method, args, kwargs = self.rpc_broadcast_mq.dequeue() @@ -414,8 +413,5 @@ def worker_busy_loop(self): logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue - if i == 10 and self.rank == 0: - raise ValueError - i += 1 self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.SUCCESS, output)) From f0baffbe8d07d02a1b3930b84db1b9151679d960 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 20:19:04 +0000 Subject: [PATCH 040/130] fix error --- vllm/v1/engine/core_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ddedac0be1c1..4a069b0edba9 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -193,7 +193,7 @@ def __init__(self, # Next action taken will raise EngineDeadError. 
def sigusr1_handler(signum, frame): logger.fatal("LLMEngine got fatal signal from background " - "process, starting shutting down.") + "process, shutting down.") self._errored = True self.shutdown() From 8a7f18e7234f2075778994a0d5a3daafc8da176b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:34:34 +0000 Subject: [PATCH 041/130] added tests --- tests/v1/shutdown/forward_error.py | 116 +++++++++++++++++++++++++++++ tests/v1/utils.py | 28 +++++++ 2 files changed, 144 insertions(+) create mode 100644 tests/v1/shutdown/forward_error.py create mode 100644 tests/v1/utils.py diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/forward_error.py new file mode 100644 index 000000000000..824447ce4da0 --- /dev/null +++ b/tests/v1/shutdown/forward_error.py @@ -0,0 +1,116 @@ +"""Test that we handle an Error in model forward and shutdown.""" + +import asyncio +import pytest + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.utils import cuda_device_count_stateless +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.exceptions import EngineDeadError + +from tests.utils import wait_for_gpu_memory_to_clear + +def evil_forward(self, *args, **kwargs): + """Evil forward method that raise an exception after 5 calls.""" + NUMBER_OF_GOOD_PASSES = 10 + + if not hasattr(self, "num_calls"): + self.num_calls = 0 + + if (self.num_calls == NUMBER_OF_GOOD_PASSES and + get_tensor_model_parallel_rank() == 0): + raise Exception("Simulated illegal memory access on Rank 0!") + self.num_calls += 1 + + return self.model(*args, **kwargs, intermediate_tensors=None) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs( + model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + generator = async_llm.generate("Hello my name is", + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should have get an EngineDeadError. + for output in outputs: + assert isinstance(output, EngineDeadError) + + # AsyncLLM should be errored. + assert async_llm.errored + + # We should not be able to make another request. + with pytest.raises(EngineDeadError): + async for _ in async_llm.generate( + "Hello my name is", request_id="abc", + sampling_params=SamplingParams()): + raise Exception("We should not get here.") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + # NOTE: shutdown is handled by the API Server. If an exception + # occurs, so it is expected that we would need to call this. 
+ async_llm.shutdown() + + +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +def test_llm_model_error(monkeypatch, tensor_parallel_size, + enable_multiprocessing): + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + llm = LLM(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + with pytest.raises(EngineDeadError): + llm.generate("Hello my name is Robert and I") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/tests/v1/utils.py b/tests/v1/utils.py new file mode 100644 index 000000000000..b06c775220cc --- /dev/null +++ b/tests/v1/utils.py @@ -0,0 +1,28 @@ +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.model_executor.models.llama import LlamaModel + +NUMBER_OF_GOOD_PASSES = 10 + +class ModelForwardError(Exception): + pass + +class EvilLlamaModel(LlamaModel): + """Evil Llama Class For Simulating Model Issue.""" + + def __init__(self, *args, **kwargs): + super.__init__(*args, **kwargs) + self.good_passes = 0 + + def forward(self, *args, **kwargs): + """Raise an after N iterations""" + + if (self.good_passes == NUMBER_OF_GOOD_PASSES and + get_tensor_model_parallel_rank() == 0): + raise ModelForwardError( + "Simulated illegal memory access on rank 0!") + self.good_passes += 1 + return self.forward(*args, **kwargs) + + + + From a66294091094dfbcdc3f7506624f94ce5ee0ba9b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:34:44 +0000 Subject: [PATCH 042/130] revert --- tests/v1/utils.py | 28 ---------------------------- vllm/v1/engine/async_llm.py | 32 +++++++++++++++++--------------- 2 files changed, 17 insertions(+), 43 deletions(-) delete mode 100644 tests/v1/utils.py diff --git a/tests/v1/utils.py b/tests/v1/utils.py deleted file mode 100644 index b06c775220cc..000000000000 --- a/tests/v1/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -from vllm.distributed import get_tensor_model_parallel_rank -from vllm.model_executor.models.llama import LlamaModel - -NUMBER_OF_GOOD_PASSES = 10 - -class ModelForwardError(Exception): - pass - -class EvilLlamaModel(LlamaModel): - """Evil Llama Class For Simulating Model Issue.""" - - def __init__(self, *args, **kwargs): - super.__init__(*args, **kwargs) - self.good_passes = 0 - - def forward(self, *args, **kwargs): - """Raise an after N iterations""" - - if (self.good_passes == NUMBER_OF_GOOD_PASSES and - get_tensor_model_parallel_rank() == 0): - raise ModelForwardError( - "Simulated illegal memory access on rank 0!") - self.good_passes += 1 - return self.forward(*args, **kwargs) - - - - diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 434e2a7945d3..978740a6bd60 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -42,7 +42,7 @@ def __init__( start_engine_loop: bool = True, ) -> None: - self._errored = False + self.engine_core_errored = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -129,7 +129,7 @@ def 
from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" - + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() @@ -167,6 +167,9 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" + if self.engine_core_errored: + raise EngineDeadError() + # 1) Create a new output queue for the request. if request_id in self.rid_to_queue: raise ValueError(f"Request id {request_id} already running.") @@ -219,8 +222,6 @@ async def generate( The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. """ - if self.errored: - raise EngineDeadError() try: # We start the output_handler on the first call to generate() so @@ -258,25 +259,25 @@ async def generate( yield out - # If the request is disconnected by the client, the - # generate() task will be canceled. So, we abort the - # request if we end up here. + # If the request is disconnected by the client, generate() + # is cancelled. So, we abort the request if we end up here. except asyncio.CancelledError: await self.abort(request_id) if self.log_requests: logger.info("Request %s aborted.", request_id) raise - # EngineCore or output_handler pushed error. Raise so API Server - # can handle and shutdown in vllm/entrypoints/launcher.py. + # EngineCore or output_handler pushed error. except EngineDeadError: + # NOTE: we do not abort, since the EngineCore is dead + # and we will shut down anyways (unrecoverable). if self.log_requests: logger.info("Request %s failed.", request_id) raise - # Error in the generate() task (possibly recoverable). Raise so API - # Server can handle and maybe shutdown vllm/entrypoints/launcher.py. + # Error in the generate() task (possibly recoverable). except Exception as e: + await self.abort(request_id) if self.log_requests: logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e @@ -315,13 +316,14 @@ async def _run_output_handler(self): raise except Exception as e: - logger.fatal("AsyncLLM._run_output_handler failed") + logger.error( + "AsyncLLM output_handler got an exception, shutting down", + exec_info=e) self._set_errored_and_propagate() - raise EngineDeadError() from e def _set_errored_and_propagate(self): """Propagate to all generate() tasks.""" - self._errored = True + self.engine_core_errored = True # Put EngineDeadError() into each generate() task's queue, # each of which will raise it in their own context. 
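The fan-out in _set_errored_and_propagate() above is the usual asyncio pattern of pushing a sentinel exception into every consumer's queue so that each generate()-style task re-raises it in its own context instead of hanging. A minimal, self-contained sketch of that pattern follows; the names (per_request_consumer, queues) are illustrative only and are not the actual AsyncLLM attributes:

    import asyncio

    class EngineDeadError(Exception):
        """Stand-in for the unrecoverable engine error."""

    async def per_request_consumer(q: asyncio.Queue):
        # Mirrors a generate() task: pull from its own queue and
        # re-raise any exception that was pushed into it.
        item = await q.get()
        if isinstance(item, Exception):
            raise item
        return item

    async def main():
        queues = {f"request-{i}": asyncio.Queue() for i in range(3)}
        tasks = [asyncio.create_task(per_request_consumer(q))
                 for q in queues.values()]
        # The background output handler hits a fatal error: propagate
        # it to every per-request queue so no task waits forever.
        for q in queues.values():
            q.put_nowait(EngineDeadError("engine core died"))
        results = await asyncio.gather(*tasks, return_exceptions=True)
        assert all(isinstance(r, EngineDeadError) for r in results)

    asyncio.run(main())

Because the error is delivered through the same queues that normal RequestOutputs travel through, the per-request tasks need no extra signalling machinery to observe an engine failure.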
@@ -396,7 +398,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self._errored + return self.engine_core_errored @property def dead_error(self) -> BaseException: From 4ee6390b0563f100ab6fdc9a0222442323b3cf35 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:55:00 +0000 Subject: [PATCH 043/130] fixed --- tests/v1/shutdown/forward_error.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/forward_error.py index 824447ce4da0..83bc16a54fa7 100644 --- a/tests/v1/shutdown/forward_error.py +++ b/tests/v1/shutdown/forward_error.py @@ -32,6 +32,9 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) async def test_async_llm_model_error(monkeypatch, tensor_parallel_size): + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") From 3e23ee2bfebf233645c581cd8bd895d5a1c31774 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 21:55:16 +0000 Subject: [PATCH 044/130] updated --- tests/v1/shutdown/processor_error.py | 54 ++++++++++++++++++++++++++++ tests/v1/shutdown/startup_error.py | 0 2 files changed, 54 insertions(+) create mode 100644 tests/v1/shutdown/processor_error.py create mode 100644 tests/v1/shutdown/startup_error.py diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/processor_error.py new file mode 100644 index 000000000000..49cba6e9a674 --- /dev/null +++ b/tests/v1/shutdown/processor_error.py @@ -0,0 +1,54 @@ +"""Test error handling in Processor.""" + +import asyncio +import pytest + +from vllm import SamplingParams +from vllm.inputs.data import TokensPrompt +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.exceptions import EngineGenerateError + + +@pytest.mark.asyncio +async def test_async_llm_processor_error(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + enforce_eager=True) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + # [] is not allowed and will raise a ValueError in Processor. + generator = async_llm.generate(TokensPrompt([]), + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should have get an EngineGenerateError. + for output in outputs: + with pytest.raises(EngineGenerateError): + raise output + + # AsyncLLM should be errored. + assert not async_llm.errored + + # This should be no problem. 
+ outputs = [] + async for out in async_llm.generate( + "Hello my name is", request_id="abc", + sampling_params=SamplingParams(max_tokens=5)): + outputs.append(out) + assert len(outputs) == 5 + + async_llm.shutdown() diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/startup_error.py new file mode 100644 index 000000000000..e69de29bb2d1 From 45456f921be3c332a0b053e67a54fc2accccca5a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 22:31:59 +0000 Subject: [PATCH 045/130] fixed error --- tests/v1/shutdown/forward_error.py | 22 ++++---- tests/v1/shutdown/processor_error.py | 10 ++-- tests/v1/shutdown/startup_error.py | 80 ++++++++++++++++++++++++++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 40 +++++++------- vllm/v1/utils.py | 4 +- 6 files changed, 123 insertions(+), 35 deletions(-) diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/forward_error.py index 83bc16a54fa7..5017bc21ac71 100644 --- a/tests/v1/shutdown/forward_error.py +++ b/tests/v1/shutdown/forward_error.py @@ -1,17 +1,18 @@ """Test that we handle an Error in model forward and shutdown.""" import asyncio + import pytest +from tests.utils import wait_for_gpu_memory_to_clear from vllm import LLM, SamplingParams +from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.utils import cuda_device_count_stateless from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.distributed import get_tensor_model_parallel_rank +from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError -from tests.utils import wait_for_gpu_memory_to_clear def evil_forward(self, *args, **kwargs): """Evil forward method that raise an exception after 5 calls.""" @@ -19,9 +20,9 @@ def evil_forward(self, *args, **kwargs): if not hasattr(self, "num_calls"): self.num_calls = 0 - - if (self.num_calls == NUMBER_OF_GOOD_PASSES and - get_tensor_model_parallel_rank() == 0): + + if (self.num_calls == NUMBER_OF_GOOD_PASSES + and get_tensor_model_parallel_rank() == 0): raise Exception("Simulated illegal memory access on Rank 0!") self.num_calls += 1 @@ -56,7 +57,7 @@ async def generate(request_id: str): pass except Exception as e: return e - + NUM_REQS = 3 tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] outputs = await asyncio.gather(*tasks) @@ -71,8 +72,9 @@ async def generate(request_id: str): # We should not be able to make another request. with pytest.raises(EngineDeadError): async for _ in async_llm.generate( - "Hello my name is", request_id="abc", - sampling_params=SamplingParams()): + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams()): raise Exception("We should not get here.") # Confirm all the processes are cleaned up. @@ -110,7 +112,7 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, with pytest.raises(EngineDeadError): llm.generate("Hello my name is Robert and I") - + # Confirm all the processes are cleaned up. 
wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/processor_error.py index 49cba6e9a674..0b7a40bdbd97 100644 --- a/tests/v1/shutdown/processor_error.py +++ b/tests/v1/shutdown/processor_error.py @@ -1,11 +1,12 @@ """Test error handling in Processor.""" import asyncio + import pytest from vllm import SamplingParams -from vllm.inputs.data import TokensPrompt from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.inputs.data import TokensPrompt from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineGenerateError @@ -30,7 +31,7 @@ async def generate(request_id: str): pass except Exception as e: return e - + NUM_REQS = 3 tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] outputs = await asyncio.gather(*tasks) @@ -46,8 +47,9 @@ async def generate(request_id: str): # This should be no problem. outputs = [] async for out in async_llm.generate( - "Hello my name is", request_id="abc", - sampling_params=SamplingParams(max_tokens=5)): + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams(max_tokens=5)): outputs.append(out) assert len(outputs) == 5 diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/startup_error.py index e69de29bb2d1..4a3119e29f47 100644 --- a/tests/v1/shutdown/startup_error.py +++ b/tests/v1/shutdown/startup_error.py @@ -0,0 +1,80 @@ +"""Test that we handle a startup Error and shutdown.""" + +import pytest + +from tests.utils import wait_for_gpu_memory_to_clear +from vllm import LLM +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.utils import cuda_device_count_stateless +from vllm.v1.engine.async_llm import AsyncLLM + + +def evil_forward(self, *args, **kwargs): + """Evil forward method that raise an exception.""" + + if get_tensor_model_parallel_rank() == 0: + raise Exception("Simulated Error in startup!") + + return self.model(*args, **kwargs, intermediate_tensors=None) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs( + model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + # Confirm we get an exception. + with pytest.raises(Exception, match="initialization failed"): + _ = AsyncLLM.from_engine_args(engine_args) + + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +def test_llm_startup_error(monkeypatch, tensor_parallel_size, + enable_multiprocessing): + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + with pytest.raises(Exception, match="initialization failed"): + _ = LLM(model="meta-llama/Llama-3.2-1B", + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 978740a6bd60..947a186cd0d6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -129,7 +129,7 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" - + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 391aa59beb73..e65195c79c67 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -145,24 +145,28 @@ def __init__( executor_class: Type[Executor], log_stats: bool = False, ): - super().__init__(vllm_config, executor_class, log_stats) - - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() - self.output_queue: queue.Queue[List[EngineCoreOutput]] = queue.Queue() - threading.Thread(target=self.process_input_socket, - args=(input_path, ), - daemon=True).start() - threading.Thread(target=self.process_output_socket, - args=(output_path, ), - daemon=True).start() - - # Send Readiness signal to EngineClient. - ready_pipe.send({"status": "READY"}) + try: + super().__init__(vllm_config, executor_class, log_stats) + + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + self.input_queue: queue.Queue[ + EngineCoreRequestUnion] = queue.Queue() + self.output_queue: queue.Queue[ + List[EngineCoreOutput]] = queue.Queue() + threading.Thread(target=self.process_input_socket, + args=(input_path, ), + daemon=True).start() + threading.Thread(target=self.process_output_socket, + args=(output_path, ), + daemon=True).start() + # Send Readiness signal to EngineClient. 
+ ready_pipe.send({"status": "READY"}) + except Exception: + ready_pipe.send({"status": "FAILED"}) @staticmethod def run_engine_core(*args, **kwargs): diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b0a7affbebb7..9eb81f104a26 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -110,8 +110,8 @@ def __init__( # Wait for startup. if reader.recv()["status"] != "READY": - raise RuntimeError(f"{process_name} initialization failed. " - "See root cause above.") + raise Exception(f"{process_name} initialization failed. " + "See stack trace for root cause.") def shutdown(self): self._finalizer() From 6128b1acb64ac1b79e91dad73ec06ac07c513843 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 22:58:11 +0000 Subject: [PATCH 046/130] update test coverage --- tests/v1/shutdown/startup_error.py | 17 ++++++++++++----- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 4 +++- vllm/v1/executor/multiproc_executor.py | 2 +- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/startup_error.py index 4a3119e29f47..25f2b77b2f3d 100644 --- a/tests/v1/shutdown/startup_error.py +++ b/tests/v1/shutdown/startup_error.py @@ -20,9 +20,15 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs, intermediate_tensors=None) -@pytest.mark.asyncio +MODELS = [ + "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. + "mistralai/Mixtral-8x22B-Instruct-v0.1" # Causes OOM. +] + + +@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): +def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size): if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -34,7 +40,7 @@ async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) engine_args = AsyncEngineArgs( - model="meta-llama/Llama-3.2-1B", + model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) @@ -50,9 +56,10 @@ async def test_async_llm_startup_error(monkeypatch, tensor_parallel_size): ) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_llm_startup_error(monkeypatch, tensor_parallel_size, +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, enable_multiprocessing): if cuda_device_count_stateless() < tensor_parallel_size: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 947a186cd0d6..1e0cf6d5a810 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -318,7 +318,7 @@ async def _run_output_handler(self): except Exception as e: logger.error( "AsyncLLM output_handler got an exception, shutting down", - exec_info=e) + exc_info=e) self._set_errored_and_propagate() def _set_errored_and_propagate(self): diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e65195c79c67..b0b7cf3eecfe 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -165,7 +165,9 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. 
ready_pipe.send({"status": "READY"}) - except Exception: + + except Exception as e: + logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) @staticmethod diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 9464744e184b..140fc8293134 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -385,7 +385,7 @@ def wait_for_startup( # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.debug("Waiting for WorkerProc to startup.") + logger.info("Waiting for WorkerProc to startup.") if not proc.is_alive(): raise RuntimeError("WorkerProc failed to start.") From de2455930fdc47316767c15e56fd8925171d028a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sun, 5 Jan 2025 23:25:44 +0000 Subject: [PATCH 047/130] stash --- tests/v1/shutdown/processor_error.py | 2 +- vllm/v1/engine/core_client.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/processor_error.py index 0b7a40bdbd97..056851025eca 100644 --- a/tests/v1/shutdown/processor_error.py +++ b/tests/v1/shutdown/processor_error.py @@ -1,4 +1,4 @@ -"""Test error handling in Processor.""" +"""Test error handling in Processor. Should not impact other reqs.""" import asyncio diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4a069b0edba9..4f009cbff166 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -170,6 +170,7 @@ def __init__( def shutdown(self): """Clean up background resources.""" + print("IN MPCLIENT.shutdown.") if hasattr(self, "proc_handle"): self.proc_handle.shutdown() From 7adf26ec2c1f3b276dbb4e4d767f48c08c86dfc5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 6 Jan 2025 03:15:23 +0000 Subject: [PATCH 048/130] added tests --- ...forward_error.py => test_forward_error.py} | 0 ...essor_error.py => test_processor_error.py} | 0 ...startup_error.py => test_startup_error.py} | 0 vllm/v1/engine/async_llm.py | 37 ++------- vllm/v1/engine/core_client.py | 83 ++++++++++++------- vllm/v1/executor/multiproc_executor.py | 2 +- vllm/v1/utils.py | 31 +++++-- 7 files changed, 87 insertions(+), 66 deletions(-) rename tests/v1/shutdown/{forward_error.py => test_forward_error.py} (100%) rename tests/v1/shutdown/{processor_error.py => test_processor_error.py} (100%) rename tests/v1/shutdown/{startup_error.py => test_startup_error.py} (100%) diff --git a/tests/v1/shutdown/forward_error.py b/tests/v1/shutdown/test_forward_error.py similarity index 100% rename from tests/v1/shutdown/forward_error.py rename to tests/v1/shutdown/test_forward_error.py diff --git a/tests/v1/shutdown/processor_error.py b/tests/v1/shutdown/test_processor_error.py similarity index 100% rename from tests/v1/shutdown/processor_error.py rename to tests/v1/shutdown/test_processor_error.py diff --git a/tests/v1/shutdown/startup_error.py b/tests/v1/shutdown/test_startup_error.py similarity index 100% rename from tests/v1/shutdown/startup_error.py rename to tests/v1/shutdown/test_startup_error.py diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1e0cf6d5a810..aaf4ca6ccaed 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,4 @@ import asyncio -import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -17,7 +16,7 @@ 
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.core_client import AsyncMPClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.processor import Processor @@ -48,16 +47,6 @@ def __init__( self.stat_loggers = stat_loggers self.model_config = vllm_config.model_config - # Background processes send SIGUSR1 when unrecoverable - # errors occur. Start the shutdown process if this happens. - def sigusr1_handler(): - logger.fatal("AsyncLLM got fatal signal from background process, " - "starting shutdown. See stack trace for root cause.") - self._set_errored_and_propagate() - - asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, - sigusr1_handler) - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -87,15 +76,14 @@ def sigusr1_handler(): ) # EngineCore (starts the engine in background process). - self.engine_core = EngineCoreClient.make_client( - multiprocess_mode=True, - asyncio_mode=True, + self.engine_core = AsyncMPClient( vllm_config=vllm_config, executor_class=executor_class, log_stats=self.log_stats, ) - self.output_handler: Optional[asyncio.Task] = None + # Output handler background task. + self.output_handler = asyncio.create_task(self._run_output_handler()) @classmethod def from_engine_args( @@ -129,13 +117,12 @@ def from_engine_args( def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + if handler := getattr(self, "output_handler", None): + handler.cancel() if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - if handler := getattr(self, "output_handler", None): - handler.cancel() - @classmethod def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] @@ -224,13 +211,6 @@ async def generate( """ try: - # We start the output_handler on the first call to generate() so - # we can call __init__ before the event loop, which enables us - # to handle startup failure gracefully in the OpenAI server. - if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) - q = await self.add_request( request_id, prompt, @@ -316,9 +296,8 @@ async def _run_output_handler(self): raise except Exception as e: - logger.error( - "AsyncLLM output_handler got an exception, shutting down", - exc_info=e) + logger.error("AsyncLLM output_handler got an Exception:", + exc_info=e) self._set_errored_and_propagate() def _set_errored_and_propagate(self): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 4f009cbff166..b446bac2e701 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,4 @@ +import asyncio import signal import weakref from abc import ABC, abstractmethod @@ -157,6 +158,7 @@ def __init__( zmq.constants.PUSH) # Start EngineCore in background process. 
+ self.engine_core_errored = False self.proc_handle = BackgroundProcHandle( input_path=input_path, output_path=output_path, @@ -167,15 +169,31 @@ def __init__( "executor_class": executor_class, "log_stats": log_stats, }) + self.proc_handle.wait_for_startup() def shutdown(self): """Clean up background resources.""" - print("IN MPCLIENT.shutdown.") - if hasattr(self, "proc_handle"): - self.proc_handle.shutdown() + self.proc_handle.shutdown() self._finalizer() + def _sigusr1_handler(self): + """ + EngineCoreProc sends SIGUSR1 if it encounters an Exception. + Set self in errored state and begin shutdown. + """ + logger.fatal("Got fatal signal from EngineCore, shutting down.") + self.engine_core_errored = True + self.shutdown() + + def _format_exception(self, e: Exception) -> Exception: + """If errored, use EngineDeadError so root cause is clear.""" + + return (EngineDeadError( + "EngineCore encountered an issue. See stack trace " + "for the root cause.", + suppress_context=True) if self.engine_core_errored else e) + class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" @@ -185,21 +203,11 @@ def __init__(self, executor_class: Type[Executor], log_stats: bool = False): - # TODO(rob): signal handler only needed for SyncMPClient - # because AsyncLLM needs to handle the signal rather - # than the AsyncMPClient. TODO(rob): move the Client def - # to async_llm and llm_engine to make this clearer. - # Background procs sent SIGUSR1 if they hit error. - # Handle by setting _errored=True and shutting down. - # Next action taken will raise EngineDeadError. + # Setup EngineCore signal handler. def sigusr1_handler(signum, frame): - logger.fatal("LLMEngine got fatal signal from background " - "process, shutting down.") - self._errored = True - self.shutdown() + self._sigusr1_handler() signal.signal(signal.SIGUSR1, sigusr1_handler) - self._errored = False super().__init__( asyncio_mode=False, @@ -208,13 +216,6 @@ def sigusr1_handler(signum, frame): log_stats=log_stats, ) - def _format_exception(self, e: Exception) -> Exception: - """If _errored, use EngineDeadError so root cause is clear.""" - return (EngineDeadError( - "EngineCore encountered an issue. See stack trace " - "for the root cause.", - suppress_context=True) if self._errored else e) - def get_output(self) -> List[EngineCoreOutput]: try: (frame, ) = self.output_socket.recv_multipart(copy=False) @@ -249,6 +250,23 @@ def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], log_stats: bool = False): + + # EngineCore sends SIGUSR1 when it gets an Exception. + def sigusr1_handler_asyncio(): + self._sigusr1_handler() + + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler_asyncio) + + # super().__init__ blocks the event loop until background + # procs are setup. This handler allows us to catch issues + # during startup. + def sigusr1_handler(signum, frame): + self._sigusr1_handler() + + signal.signal(signal.SIGUSR1, sigusr1_handler) + + # Initialize EngineCore + all background processes. super().__init__( asyncio_mode=True, vllm_config=vllm_config, @@ -256,18 +274,23 @@ def __init__(self, log_stats=log_stats, ) - async def get_output_async(self) -> List[EngineCoreOutput]: - - frames = await self.output_socket.recv_multipart(copy=False) - engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs + # Remove the non-asyncio handler. 
+ signal.signal(signal.SIGUSR1, signal.SIG_DFL) - return engine_core_outputs + async def get_output_async(self) -> List[EngineCoreOutput]: + try: + frames = await self.output_socket.recv_multipart(copy=False) + return self.decoder.decode(frames[0].buffer).outputs + except Exception as e: + raise self._format_exception(e) from None async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: - - msg = (request_type.value, self.encoder.encode(request)) - await self.input_socket.send_multipart(msg, copy=False) + try: + msg = (request_type.value, self.encoder.encode(request)) + await self.input_socket.send_multipart(msg, copy=False) + except Exception as e: + raise self._format_exception(e) from None async def add_request_async(self, request: EngineCoreRequest) -> None: await self._send_input(EngineCoreRequestType.ADD, request) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 140fc8293134..36eca453307b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -48,8 +48,8 @@ def sigusr1_handler(signum, frame): "shutting down. See stack trace above for root cause issue.") # Shutdown first (avoid SysExit exceptions in __del__). self.shutdown() - # TODO(rob): move this to the VLLMConfig. if VLLM_ENABLE_V1_MULTIPROCESSING: + # TODO(rob): move this to the VLLMConfig. # Propagate up if using the mp engine. Note that # sending in non-mp mode crashes caller process. psutil.Process().parent().send_signal(signal.SIGUSR1) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 9eb81f104a26..1ebd71544d9a 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -93,7 +93,8 @@ def __init__( process_kwargs: Dict[Any, Any], ): context = get_mp_context() - reader, writer = context.Pipe(duplex=False) + self.reader, writer = context.Pipe(duplex=False) + self.process_name = process_name assert ("ready_pipe" not in process_kwargs and "input_path" not in process_kwargs @@ -102,20 +103,38 @@ def __init__( process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path + # Flag for shutdown state. BackgroundProcs send signals + # when errors occur which calls shutdown(). If we are in + # startup loop when signaled, this flag breaks us out. + self.shutting_down = False + # Run busy loop in background process. self.proc = context.Process(target=target_fn, kwargs=process_kwargs) self._finalizer = weakref.finalize(self, shutdown, self.proc, input_path, output_path) self.proc.start() - # Wait for startup. - if reader.recv()["status"] != "READY": - raise Exception(f"{process_name} initialization failed. " - "See stack trace for root cause.") - def shutdown(self): + self.shutting_down = True self._finalizer() + def wait_for_startup(self): + """Wait until the background process is ready.""" + + e = Exception(f"{self.process_name} initialization failed due to " + "an exception in a background process. See stack trace " + "for root cause.") + + while not self.reader.poll(timeout=1): + if self.shutting_down: + raise e + try: + if self.reader.recv()["status"] != "READY": + raise e + except EOFError: + e.__suppress_context__ = True + raise e from None + # Note(rob): shutdown function cannot be a bound method, # else the gc cannot collect the object. 
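The READY/FAILED handshake added to BackgroundProcHandle and EngineCoreProc in the patch above is a generic parent/child startup pattern: the child wraps all of its initialization in try/except and reports a status over a one-way pipe, while the parent polls so it can also notice a child that dies before reporting anything. A standalone sketch of that pattern using only the standard library (all names here are illustrative, not vLLM APIs):

    import multiprocessing as mp

    def child_main(ready_pipe):
        try:
            # ... heavyweight initialization (device setup, model load) ...
            ready_pipe.send({"status": "READY"})
        except Exception:
            ready_pipe.send({"status": "FAILED"})
            raise

    def start_background_proc():
        ctx = mp.get_context("spawn")
        reader, writer = ctx.Pipe(duplex=False)
        proc = ctx.Process(target=child_main, args=(writer,), daemon=True)
        proc.start()
        # Close the parent's copy of the write end so recv() raises
        # EOFError if the child exits without ever sending a status.
        writer.close()
        try:
            if not reader.poll(timeout=60):
                raise RuntimeError("background process startup timed out")
            if reader.recv()["status"] != "READY":
                raise RuntimeError("background process initialization failed")
        except EOFError:
            raise RuntimeError(
                "background process died during startup") from None
        return proc

    if __name__ == "__main__":
        start_background_proc()

Keeping the status report on a plain Pipe (rather than a signal) lets the parent distinguish a clean FAILED report, a silent crash (EOFError), and a hang (poll timeout), which is the behaviour the wait_for_startup() helpers above rely on.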
From bf928540d16e13c7dde02a758031f9a14902d78a Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Tue, 7 Jan 2025 01:07:58 +0000 Subject: [PATCH 049/130] stash --- vllm/v1/engine/async_llm.py | 10 ++++------ vllm/v1/engine/core_client.py | 28 ++++++++++++++++------------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index aaf4ca6ccaed..f15317498a20 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -41,7 +41,6 @@ def __init__( start_engine_loop: bool = True, ) -> None: - self.engine_core_errored = False self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers = stat_loggers @@ -154,7 +153,7 @@ async def add_request( ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - if self.engine_core_errored: + if self.errored: raise EngineDeadError() # 1) Create a new output queue for the request. @@ -298,11 +297,10 @@ async def _run_output_handler(self): except Exception as e: logger.error("AsyncLLM output_handler got an Exception:", exc_info=e) - self._set_errored_and_propagate() + self._propagate_error() - def _set_errored_and_propagate(self): + def _propagate_error(self): """Propagate to all generate() tasks.""" - self.engine_core_errored = True # Put EngineDeadError() into each generate() task's queue, # each of which will raise it in their own context. @@ -377,7 +375,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core_errored + return self.engine_core.engine_core_errored @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b446bac2e701..dcbb8eb80fbf 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -252,19 +252,14 @@ def __init__(self, log_stats: bool = False): # EngineCore sends SIGUSR1 when it gets an Exception. - def sigusr1_handler_asyncio(): - self._sigusr1_handler() - - asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, - sigusr1_handler_asyncio) - - # super().__init__ blocks the event loop until background - # procs are setup. This handler allows us to catch issues - # during startup. + # NOTE: super().__init__ blocks the event loop until + # background procs are setup. This handler allows us + # to catch issues during startup (e.g. OOM). We switch + # to a signal handler in the event loop __init__. def sigusr1_handler(signum, frame): self._sigusr1_handler() - signal.signal(signal.SIGUSR1, sigusr1_handler) + # signal.signal(signal.SIGUSR1, sigusr1_handler) # Initialize EngineCore + all background processes. super().__init__( @@ -274,8 +269,17 @@ def sigusr1_handler(signum, frame): log_stats=log_stats, ) - # Remove the non-asyncio handler. - signal.signal(signal.SIGUSR1, signal.SIG_DFL) + # Unregister the hander in the main trhead, + # signal.signal(signal.SIGUSR1, signal.SIG_DFL) + + # NOTE TO SELF: putting this in AsyncMPClient is causing issues + # where the AsyncLLM is not triggering shutdown since the Excpections + # are not being raised. TODO: move it back to AsyncLLM. 
+ def sigusr1_handler_asyncio(): + self._sigusr1_handler() + + asyncio.get_running_loop().add_signal_handler(signal.SIGUSR1, + sigusr1_handler_asyncio) async def get_output_async(self) -> List[EngineCoreOutput]: try: From 6b4fe88fa065ccd997b4d4ceb3689c054fe948a6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:43:42 +0000 Subject: [PATCH 050/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 14 ++----------- vllm/v1/engine/core_client.py | 33 +++++++++++++++++++++++++----- vllm/v1/engine/output_processor.py | 7 +++++++ 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 5307df870f65..f318af5fa833 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -236,10 +236,8 @@ async def generate( logger.info("Request %s aborted.", request_id) raise - # EngineCore or output_handler pushed error. + # Engine is dead. Do not abort since we shut down. except EngineDeadError: - # NOTE: we do not abort, since the EngineCore is dead - # and we will shut down anyways (unrecoverable). if self.log_requests: logger.info("Request %s failed.", request_id) raise @@ -299,15 +297,7 @@ async def _run_output_handler(self): except Exception as e: logger.error("AsyncLLM output_handler got an Exception:", exc_info=e) - self._propagate_error() - - def _propagate_error(self): - """Propagate to all generate() tasks.""" - - # Put EngineDeadError() into each generate() task's queue, - # each of which will raise it in their own context. - for _, q in self.rid_to_queue.items(): - q.put_nowait(EngineDeadError()) + self.output_processor.propagate_error(EngineDeadError()) async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 342dd4bbd33c..e28696e150fa 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -4,7 +4,7 @@ import signal import weakref from abc import ABC, abstractmethod -from typing import List, Optional, Type +from typing import List, Optional, Type, Union import zmq import zmq.asyncio @@ -286,13 +286,36 @@ def sigusr1_handler(signum, frame): ) self.queue_task: Optional[asyncio.Task] = None + self.outputs_queue: Optional[asyncio.Queue[Union[EngineCoreOutputs, + Exception]]] = None - async def get_output_async(self) -> EngineCoreOutputs: + async def _process_outputs_socket_loop(self): + """ + ZMQ IO background loop. This helps performance because + ZMQ IO releases the GIL so we can overlap with output_handler_loop. 
+ """ + + assert self.outputs_queue is not None try: - (frame, ) = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frame.buffer) + while True: + (frame, ) = await self.output_socket.recv_multipart(copy=False) + outputs = self.decoder.decode(frame.buffer) + self.outputs_queue.put_nowait(outputs) except Exception as e: - raise self._format_exception(e) from None + self.outputs_queue.put_nowait(e) + + async def get_output_async(self) -> EngineCoreOutputs: + + if self.outputs_queue is None: + self.outputs_queue = asyncio.Queue() + self.queue_task = asyncio.create_task( + self._process_outputs_socket_loop()) + + outputs = await self.outputs_queue.get() + if isinstance(outputs, Exception): + raise self._format_exception(outputs) from None + + return outputs async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 5dbf530caa17..69ff2fa2a802 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -93,6 +93,13 @@ def get_num_unfinished_requests(self): def has_unfinished_requests(self) -> bool: return len(self.request_states) > 0 + def propagate_error(self, e: Exception): + """Propagate error to all generate() tasks.""" + + for _, state in self.request_states.items(): + assert state.queue is not None + state.queue.put_nowait(e) + def abort_requests( self, request_ids: List[str], From efe85ee8ed687d0d7631a08f2ea4edfc30a02eac Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:45:03 +0000 Subject: [PATCH 051/130] updared Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/exceptions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 5313c1e0943c..34ec1f6b0cd0 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,5 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 class EngineGenerateError(Exception): - """Raised when a AsyncLLM.generate() fails. Maybe recoverable.""" + """Raised when a AsyncLLM.generate() fails. Recoverable.""" pass From 619579554c5f4daa27ca17b95de4d22c59669aec Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:47:58 +0000 Subject: [PATCH 052/130] fix typo Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 11 ++++++++--- vllm/v1/engine/core_client.py | 1 - 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f318af5fa833..70cdefc31196 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - import asyncio from typing import AsyncGenerator, List, Mapping, Optional, Type, Union @@ -83,8 +82,7 @@ def __init__( executor_class=executor_class, ) - # Output handler background task. - self.output_handler = asyncio.create_task(self._run_output_handler()) + self.output_handler: Optional[asyncio.Task] = None @classmethod def from_engine_args( @@ -193,6 +191,13 @@ async def generate( """ try: + # We start the output_handler on the first call to generate() so + # we can call __init__ before the event loop, which enables us + # to handle startup failure gracefully in the OpenAI server. 
+ if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) + q = await self.add_request( request_id, prompt, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e28696e150fa..40c06d024f3b 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 - import asyncio import signal import weakref From 0b2558695973886a45d6e17ebb755169a3e01fe1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Fri, 7 Feb 2025 23:58:03 +0000 Subject: [PATCH 053/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 40c06d024f3b..2fe1873b6440 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -284,17 +284,19 @@ def sigusr1_handler(signum, frame): log_stats=True, ) + # ZMQ IO. Run it in background task so that we can + # overlap with AsyncLLM.output_handler_loop. This + # works because ZMQ IO releases the GIL. self.queue_task: Optional[asyncio.Task] = None - self.outputs_queue: Optional[asyncio.Queue[Union[EngineCoreOutputs, - Exception]]] = None + self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, + Exception]] = asyncio.Queue() - async def _process_outputs_socket_loop(self): - """ - ZMQ IO background loop. This helps performance because - ZMQ IO releases the GIL so we can overlap with output_handler_loop. - """ + def shutdown(self): + super().shutdown() + if queue_task := getattr(self, "queue_task", None): + queue_task.cancel() - assert self.outputs_queue is not None + async def _process_outputs_socket_loop(self): try: while True: (frame, ) = await self.output_socket.recv_multipart(copy=False) @@ -305,11 +307,13 @@ async def _process_outputs_socket_loop(self): async def get_output_async(self) -> EngineCoreOutputs: - if self.outputs_queue is None: - self.outputs_queue = asyncio.Queue() + # Start output loop on the first call. + if self.queue_task is None: self.queue_task = asyncio.create_task( self._process_outputs_socket_loop()) + # NOTE: if an exception arises processing the socket, + # the exception is forwarded to the queue. 
outputs = await self.outputs_queue.get() if isinstance(outputs, Exception): raise self._format_exception(outputs) from None From 0b77b795bf9abc1d9035715f8cbee0473fe37000 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 13:50:23 +0000 Subject: [PATCH 054/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 116 ++++++++++++------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 2bf094e9d726..ed02e52474ad 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -9,12 +9,12 @@ from dataclasses import dataclass from enum import Enum, auto from functools import partial +from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess from typing import Any, Callable, Dict, List, Optional, Tuple, Union import cloudpickle import psutil -import zmq from vllm.config import VllmConfig from vllm.distributed import (destroy_distributed_environment, @@ -26,7 +26,7 @@ _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger from vllm.utils import (get_distributed_init_method, get_mp_context, - get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) + get_open_port) from vllm.v1.executor.abstract import Executor from vllm.worker.worker_base import WorkerWrapperBase @@ -191,7 +191,7 @@ def check_health(self) -> None: class WorkerProcHandle: proc: BaseProcess rank: int - ready_path: str + ready_pipe: Connection worker_response_mq: MessageQueue # The worker process writes to this MQ @@ -207,44 +207,45 @@ def __init__( rank: int, distributed_init_method: str, input_shm_handle: Handle, - ready_path: str, + ready_pipe: Connection, ): - self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) - # TODO: move `init_worker` to executor level as a collective rpc call - all_kwargs: List[Dict] = [ - {} for _ in range(vllm_config.parallel_config.world_size) - ] - all_kwargs[rank] = { - "vllm_config": vllm_config, - "local_rank": local_rank, - "rank": rank, - "distributed_init_method": distributed_init_method, - } - wrapper.init_worker(all_kwargs) - self.worker = wrapper.worker - - pid = os.getpid() - _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) - _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) - - # Initialize MessageQueue for receiving SchedulerOutput - self.rpc_broadcast_mq = MessageQueue.create_from_handle( - input_shm_handle, self.worker.rank) - - # Initializes a message queue for sending the model output - self.worker_response_mq = MessageQueue(1, 1) - worker_response_mq_handle = self.worker_response_mq.export_handle() - - # Send Readiness signal to EngineCore process. 
- with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: - payload = pickle.dumps(worker_response_mq_handle, - protocol=pickle.HIGHEST_PROTOCOL) - ready_socket.send_string(WorkerProc.READY_STR) - ready_socket.send(payload) + try: + self.rank = rank + wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) + # TODO: move `init_worker` to executor as a collective rpc call + all_kwargs: List[Dict] = [ + {} for _ in range(vllm_config.parallel_config.world_size) + ] + all_kwargs[rank] = { + "vllm_config": vllm_config, + "local_rank": local_rank, + "rank": rank, + "distributed_init_method": distributed_init_method, + } + wrapper.init_worker(all_kwargs) + self.worker = wrapper.worker + + pid = os.getpid() + _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) + _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) + + # Initialize MessageQueue for receiving SchedulerOutput + self.rpc_broadcast_mq = MessageQueue.create_from_handle( + input_shm_handle, self.worker.rank) + + # Initializes a message queue for sending the model output + self.worker_response_mq = MessageQueue(1, 1) + worker_response_mq_handle = self.worker_response_mq.export_handle() + + self.worker.init_device() + self.worker.load_model() + + # Send Readiness signal to Executor. + ready_pipe.send({"status": "READY"}) - self.worker.init_device() - self.worker.load_model() + except Exception as e: + logger.exception("WorkerProc got error at startup:", exc_info=e) + ready_pipe.send({"status": "FAILED"}) @staticmethod def make_worker_process( @@ -255,10 +256,7 @@ def make_worker_process( input_shm_handle, # Receive SchedulerOutput ) -> WorkerProcHandle: context = get_mp_context() - - # ZMQ path for worker to send ready message and shm_broadcast handle - # back to core process. - ready_path = get_open_zmq_ipc_path() + reader, writer = context.Pipe(duplex=False) process_kwargs = { "vllm_config": vllm_config, @@ -266,7 +264,7 @@ def make_worker_process( "rank": rank, "distributed_init_method": distributed_init_method, "input_shm_handle": input_shm_handle, - "ready_path": ready_path, + "ready_pipe": writer, } # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, @@ -275,13 +273,12 @@ def make_worker_process( proc.start() # Wait for startup - worker_response_mq_handle = WorkerProc.wait_for_startup( - proc, ready_path) + worker_response_mq_handle = WorkerProc.wait_for_startup(proc, reader) worker_response_mq = MessageQueue.create_from_handle( worker_response_mq_handle, 0) - return WorkerProcHandle(proc, rank, ready_path, worker_response_mq) + return WorkerProcHandle(proc, rank, reader, worker_response_mq) def shutdown(self): self.rpc_broadcast_mq = None @@ -346,24 +343,21 @@ def signal_handler(signum, frame): @staticmethod def wait_for_startup( - proc: BaseProcess, - ready_path: str, + process_name: str, + reader: Connection, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: - - # Wait for Worker to send READY. - while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.info("Waiting for WorkerProc to startup.") - if not proc.is_alive(): - raise RuntimeError("WorkerProc failed to start.") + e = Exception(f"{process_name} initialization failed due to " + "an exception in a background process. 
See stack trace " + "for root cause.") - message = socket.recv_string() - assert message == WorkerProc.READY_STR - handle_frame = socket.recv(copy=False) - handle = pickle.loads(handle_frame.buffer) - return handle + try: + if reader.recv()["status"] != "READY": + raise e + except EOFError: + e.__suppress_context__ = True + raise e from None class ResponseStatus(Enum): SUCCESS = auto() From 61f3dd7c2bd8549448a53e5128f65c87200803e5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 14:15:50 +0000 Subject: [PATCH 055/130] stash Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 30 +++++++++++++++----------- vllm/v1/worker/gpu_worker.py | 5 +++++ vllm/worker/worker.py | 3 +++ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ed02e52474ad..e8078f8de96a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -163,15 +163,6 @@ def wait_for_termination(procs, timeout): for p in active_procs: p.kill() - self._cleanup_sockets() - - def _cleanup_sockets(self): - for w in self.workers: - # Remove the zmq ipc socket file - socket_path = w.ready_path.replace("ipc://", "") - if os and os.path.exists(socket_path): - os.remove(socket_path) - def shutdown(self): """Properly shut down the executor and its workers""" if not getattr(self, 'shutting_down', False): @@ -237,11 +228,20 @@ def __init__( self.worker_response_mq = MessageQueue(1, 1) worker_response_mq_handle = self.worker_response_mq.export_handle() + # Load model before we send readiness signal, such that + # we can catch any errors. + print("ABOUT TO INIT DEVICE") self.worker.init_device() + print("ABOUT TO LOAD MODEL") self.worker.load_model() + print("SENDING TO READINESS PIPE") # Send Readiness signal to Executor. - ready_pipe.send({"status": "READY"}) + ready_pipe.send({ + "status": "READY", + "handle": pickle.dumps(worker_response_mq_handle) + }) + print("SENT TO READINESS PIPE") except Exception as e: logger.exception("WorkerProc got error at startup:", exc_info=e) @@ -345,7 +345,7 @@ def signal_handler(signum, frame): def wait_for_startup( process_name: str, reader: Connection, - ) -> Optional[Handle]: + ) -> WorkerProcHandle: """Wait until the Worker is ready.""" e = Exception(f"{process_name} initialization failed due to " @@ -353,8 +353,14 @@ def wait_for_startup( "for root cause.") try: - if reader.recv()["status"] != "READY": + response = reader.recv() + if getattr(response, "status", None) != "READY": raise e + assert hasattr(response, "handle") + handle = pickle.loads(response["handle"]) + assert isinstance(handle, WorkerProcHandle) + return handle + except EOFError: e.__suppress_context__ = True raise e from None diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0adb69073397..985f86bd3e26 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -98,6 +98,7 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self): + print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -119,15 +120,19 @@ def init_device(self): else: raise RuntimeError( f"Not support device type: {self.device_config.device}") + print("init_worker_distributed_environment") # Initialize the distributed environment. 
init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. + print("set_random_seed") set_random_seed(self.model_config.seed) # Construct the model runner + print("model_runner") self.model_runner = GPUModelRunner(self.vllm_config, self.device) + print("done") def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 582aa460eb4f..c1f06175901a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -140,6 +140,7 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self) -> None: + print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -162,11 +163,13 @@ def init_device(self) -> None: else: raise RuntimeError( f"Not support device type: {self.device_config.device}") + print("init_worker_dist_enviornment") # Initialize the distributed environment. init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. + print("set_random_seed") set_random_seed(self.model_config.seed) def load_model(self): From fbf19ad656900551e41c00d2f0f2a044aa27dad6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 14:16:43 +0000 Subject: [PATCH 056/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/worker/worker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c1f06175901a..582aa460eb4f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -140,7 +140,6 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self) -> None: - print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -163,13 +162,11 @@ def init_device(self) -> None: else: raise RuntimeError( f"Not support device type: {self.device_config.device}") - print("init_worker_dist_enviornment") # Initialize the distributed environment. init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. - print("set_random_seed") set_random_seed(self.model_config.seed) def load_model(self): From d25ce5ce58670183e2cd4347d76030bb5ce85219 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:03:21 +0000 Subject: [PATCH 057/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 134 +++++++++++++------------ 1 file changed, 72 insertions(+), 62 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index e8078f8de96a..d36a4d57153f 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -21,7 +21,6 @@ destroy_model_parallel) from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) -from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING from vllm.executor.multiproc_worker_utils import ( _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger @@ -43,22 +42,6 @@ def _init_executor(self) -> None: # and ensure workers will be terminated. 
self._finalizer = weakref.finalize(self, self.shutdown) - # The child processes will send SIGUSR1 when unrecoverable - # errors happen. - def sigusr1_handler(signum, frame): - logger.fatal( - "MulitprocExecutor got fatal signal from worker processes, " - "shutting down. See stack trace above for root cause issue.") - # Shutdown first (avoid SysExit exceptions in __del__). - self.shutdown() - if VLLM_ENABLE_V1_MULTIPROCESSING: - # TODO(rob): move this to the VLLMConfig. - # Propagate up if using the mp engine. Note that - # sending in non-mp mode crashes caller process. - psutil.Process().parent().send_signal(signal.SIGUSR1) - - signal.signal(signal.SIGUSR1, sigusr1_handler) - self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size assert self.world_size == tensor_parallel_size, ( @@ -81,12 +64,25 @@ def sigusr1_handler(signum, frame): scheduler_output_handle = self.rpc_broadcast_mq.export_handle() # Create workers - self.workers: List[WorkerProcHandle] = [] + unready_workers: List[UnreadyWorkerProcHandle] = [] for rank in range(self.world_size): - worker = WorkerProc.make_worker_process(self.vllm_config, rank, - rank, - distributed_init_method, - scheduler_output_handle) + unready_worker = WorkerProc.make_worker_process( + vllm_config=self.vllm_config, + local_rank=rank, + rank=rank, + distributed_init_method=distributed_init_method, + input_shm_handle=scheduler_output_handle, + ) + unready_workers.append(unready_worker) + + # All workers are created before wait_for_ready, since + # initialization calls self.init_device(), which does a sync. + self.workers: List[WorkerProcHandle] = [] + for unready_worker in unready_workers: + # NOTE: the WorkerProc wraps startup in a try ... catch + # so if there are any issues in loading in a WorkerProcess + # (e.g. OOM), an Exception will be raised here. + worker = WorkerProc.wait_for_ready(unready_worker) self.workers.append(worker) # Ensure message queues are ready. Will deadlock if re-ordered @@ -178,13 +174,30 @@ def check_health(self) -> None: return +@dataclass +class UnreadyWorkerProcHandle: + """WorkerProcess handle before READY.""" + proc: BaseProcess + rank: int + ready_pipe: Tuple[Connection, Connection] + + @dataclass class WorkerProcHandle: proc: BaseProcess rank: int - ready_pipe: Connection worker_response_mq: MessageQueue # The worker process writes to this MQ + @classmethod + def from_unready_handle( + cls, unready_handle: UnreadyWorkerProcHandle, + worker_response_mq: MessageQueue) -> "WorkerProcHandle": + return cls( + proc=unready_handle.proc, + rank=unready_handle.rank, + worker_response_mq=worker_response_mq, + ) + class WorkerProc: """Wrapper that runs one Worker in a separate process.""" @@ -228,20 +241,15 @@ def __init__( self.worker_response_mq = MessageQueue(1, 1) worker_response_mq_handle = self.worker_response_mq.export_handle() - # Load model before we send readiness signal, such that - # we can catch any errors. - print("ABOUT TO INIT DEVICE") + # Initialize device and loads weights self.worker.init_device() - print("ABOUT TO LOAD MODEL") self.worker.load_model() - print("SENDING TO READINESS PIPE") - # Send Readiness signal to Executor. 
+ # Send READY once we know everything is loaded ready_pipe.send({ "status": "READY", "handle": pickle.dumps(worker_response_mq_handle) }) - print("SENT TO READINESS PIPE") except Exception as e: logger.exception("WorkerProc got error at startup:", exc_info=e) @@ -254,9 +262,10 @@ def make_worker_process( rank: int, distributed_init_method: str, input_shm_handle, # Receive SchedulerOutput - ) -> WorkerProcHandle: + ) -> UnreadyWorkerProcHandle: context = get_mp_context() - reader, writer = context.Pipe(duplex=False) + # (reader, writer) + pipe_tuple = context.Pipe(duplex=False) process_kwargs = { "vllm_config": vllm_config, @@ -264,7 +273,7 @@ def make_worker_process( "rank": rank, "distributed_init_method": distributed_init_method, "input_shm_handle": input_shm_handle, - "ready_pipe": writer, + "ready_pipe": pipe_tuple[1], } # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, @@ -272,13 +281,38 @@ def make_worker_process( daemon=True) proc.start() - # Wait for startup - worker_response_mq_handle = WorkerProc.wait_for_startup(proc, reader) + return UnreadyWorkerProcHandle(proc, rank, pipe_tuple) - worker_response_mq = MessageQueue.create_from_handle( - worker_response_mq_handle, 0) + @staticmethod + def wait_for_ready( + unready_proc_handle: UnreadyWorkerProcHandle) -> WorkerProcHandle: + + e = Exception("WorkerProc initialization failed due to " + "an exception in a background process. " + "See stack trace for root cause.") + + ready_pipe = unready_proc_handle.ready_pipe[0] + try: + response = ready_pipe.recv() + if getattr(response, "status", None) != "READY_TO_LOAD": + raise e + + assert hasattr(response, "handle") + mq_handle = pickle.loads(response["handle"]) + assert isinstance(mq_handle, Handle) - return WorkerProcHandle(proc, rank, reader, worker_response_mq) + worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) + return WorkerProcHandle.from_unready_handle( + unready_proc_handle, worker_response_mq) + + except EOFError: + e.__suppress_context__ = True + raise e from None + + finally: + # Close connection. + unready_proc_handle.ready_pipe[0].close() + unready_proc_handle.ready_pipe[1].close() def shutdown(self): self.rpc_broadcast_mq = None @@ -341,30 +375,6 @@ def signal_handler(signum, frame): worker.shutdown() worker = None - @staticmethod - def wait_for_startup( - process_name: str, - reader: Connection, - ) -> WorkerProcHandle: - """Wait until the Worker is ready.""" - - e = Exception(f"{process_name} initialization failed due to " - "an exception in a background process. 
See stack trace " - "for root cause.") - - try: - response = reader.recv() - if getattr(response, "status", None) != "READY": - raise e - assert hasattr(response, "handle") - handle = pickle.loads(response["handle"]) - assert isinstance(handle, WorkerProcHandle) - return handle - - except EOFError: - e.__suppress_context__ = True - raise e from None - class ResponseStatus(Enum): SUCCESS = auto() FAILURE = auto() From 23342d76fc77e4f2d46eb58bdac13c8d5d9cec65 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:14:19 +0000 Subject: [PATCH 058/130] remove signal handler Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 24 +++++++++++------------- vllm/v1/worker/gpu_worker.py | 5 ----- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d36a4d57153f..02b503fae7d5 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import cloudpickle -import psutil from vllm.config import VllmConfig from vllm.distributed import (destroy_distributed_environment, @@ -355,20 +354,19 @@ def signal_handler(signum, frame): logger.debug("Worker interrupted.") except Exception as e: - # Log rather than raise so the stack trace is in order of - # WorkerProc -> EngineCore -> AsyncLLM. + # NOTE: if an Exception arises in busy_loop, we send + # a FAILURE message over the MQ RPC to notify the Executor, + # which triggers system shutdown. + # TODO(rob): handle case where the MQ itself breaks. + + # Log so stack trace order is: Worker -> EngineCore -> AsyncLLM logger.exception("WorkerProc got an Exception:", exc_info=e) - # The parent will send a SIGTERM to all worker processes - # after we send SIGUSR. Set this value so we don't re-throw - # SystemExit(), to avoid zmq exceptions during __del__. + # The parent sends a SIGTERM to all worker processes if + # any worker dies. Set this value so we don't re-throw + # SystemExit() to avoid zmq exceptions in __del__. shutdown_requested = True - # worker_busy_loop sends exceptions exceptons to Executor - # for shutdown, but if there is an error in startup or an - # error with IPC itself, we need to alert the parent. - psutil.Process().parent().send_signal(signal.SIGUSR1) - finally: # Clean up once worker exits busy loop if worker is not None: @@ -381,9 +379,9 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" - while True: - method, args, kwargs = self.rpc_broadcast_mq.dequeue() + method, args, kwargs = self.rpc_broadcast_mq.dequeue() + while True: try: if isinstance(method, str): func = getattr(self.worker, method) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 985f86bd3e26..0adb69073397 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -98,7 +98,6 @@ def wake_up(self) -> None: allocator.wake_up() def init_device(self): - print("init_device") if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until # the synchronization point. This causes the memory usage to grow @@ -120,19 +119,15 @@ def init_device(self): else: raise RuntimeError( f"Not support device type: {self.device_config.device}") - print("init_worker_distributed_environment") # Initialize the distributed environment. 
init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. - print("set_random_seed") set_random_seed(self.model_config.seed) # Construct the model runner - print("model_runner") self.model_runner = GPUModelRunner(self.vllm_config, self.device) - print("done") def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: From ebdf8f90b85d5b5577d87c0336484b6e7b1d169f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:18:23 +0000 Subject: [PATCH 059/130] remove signal handler Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 02b503fae7d5..d6edab688a28 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -254,6 +254,9 @@ def __init__( logger.exception("WorkerProc got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) + finally: + ready_pipe.close() + @staticmethod def make_worker_process( vllm_config: VllmConfig, From 6a37020fcb275bd64da7ed009cf6e7bd43b7a1d2 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 15:20:02 +0000 Subject: [PATCH 060/130] update comment Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/executor/multiproc_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index d6edab688a28..5a5c29a3eb82 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -74,8 +74,8 @@ def _init_executor(self) -> None: ) unready_workers.append(unready_worker) - # All workers are created before wait_for_ready, since - # initialization calls self.init_device(), which does a sync. + # Workers must be created before wait_for_ready to avoid + # deadlock, since worker.init_device() does a device sync. self.workers: List[WorkerProcHandle] = [] for unready_worker in unready_workers: # NOTE: the WorkerProc wraps startup in a try ... 
catch From 2ed3349de296e22f57da8e2af5137f4f0c0da42d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 16:53:51 +0000 Subject: [PATCH 061/130] avoid sigusr1 Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/__init__.py | 7 ++++ vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 54 +++++++++++++++++--------- vllm/v1/engine/core_client.py | 43 ++++++-------------- vllm/v1/executor/multiproc_executor.py | 1 - 5 files changed, 55 insertions(+), 52 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index b05ef3cc8c74..0b44baf4828d 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -93,6 +93,13 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] scheduler_stats: SchedulerStats + @classmethod + def make_empty(cls): + cls(outputs=[], scheduler_stats=SchedulerStats()) + + +ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' + @dataclass class EngineCoreProfile: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 70cdefc31196..d80dedae5d85 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -384,7 +384,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core.engine_core_errored + return self.engine_core.is_engine_dead @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0bf0ecb9112d..170645249d73 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -6,9 +6,8 @@ import threading import time from multiprocessing.connection import Connection -from typing import List, Tuple, Type +from typing import List, Tuple, Type, Union -import psutil import zmq import zmq.asyncio @@ -16,10 +15,10 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import get_exception_traceback, zmq_socket_ctx +from vllm.utils import zmq_socket_ctx from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, +from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion, EngineCoreResetPrefixCache) @@ -132,7 +131,8 @@ def step(self) -> EngineCoreOutputs: return engine_core_outputs def shutdown(self): - self.model_executor.shutdown() + if model_executor := getattr(self, "model_executor", None): + model_executor.shutdown() def profile(self, is_start: bool = True): self.model_executor.profile(is_start) @@ -165,8 +165,8 @@ def __init__( # Threads handle Socket <-> Queues and core_busy_loop uses Queue. self.input_queue: queue.Queue[ EngineCoreRequestUnion] = queue.Queue() - self.output_queue: queue.Queue[ - List[EngineCoreOutput]] = queue.Queue() + self.output_queue: queue.Queue[Union[ + bytes, EngineCoreOutputs]] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -174,12 +174,16 @@ def __init__( args=(output_path, ), daemon=True).start() + self.errored_sent_event = threading.Event() + # Send Readiness signal to EngineClient. 
ready_pipe.send({"status": "READY"}) except Exception as e: logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) + finally: + ready_pipe.close() @staticmethod def run_engine_core(*args, **kwargs): @@ -203,20 +207,12 @@ def signal_handler(signum, frame): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) - parent_process = psutil.Process().parent() - engine_core = None + engine_core = EngineCoreProc(*args, **kwargs) try: - engine_core = EngineCoreProc(*args, **kwargs) engine_core.run_busy_loop() - - except SystemExit: - logger.debug("EngineCore interrupted.") - - except Exception: - traceback = get_exception_traceback() - logger.error("EngineCore hit an exception: %s", traceback) - parent_process.send_signal(signal.SIGUSR1) - + except Exception as e: + logger.exception("EngineCore got an Exception:", exc_info=e) + engine_core._send_engine_dead() finally: if engine_core is not None: engine_core.shutdown() @@ -266,6 +262,19 @@ def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: assert isinstance(request, list) self.abort_requests(request) + def _send_engine_dead(self): + """Send EngineDead status to the EngineCoreClient.""" + + # Put ENGINE_CORE_DEAD to the front of the queue. + with self.output_queue.mutex: + self.output_queue.queue.clear() + self.output_queue.put_nowait(ENGINE_CORE_DEAD) + + # Wait until msg sent by the daemon before shutdown. + if not self.errored_sent_event.wait(timeout=10): + logger.fatal("vLLM shutdown signal from EngineCore failed " + "to send. Please report this issue.") + def process_input_socket(self, input_path: str): """Input socket IO thread.""" @@ -306,5 +315,12 @@ def process_output_socket(self, output_path: str): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: outputs = self.output_queue.get() + if outputs == ENGINE_CORE_DEAD: + socket.send_multipart((ENGINE_CORE_DEAD, ), copy=False) + break + encoder.encode_into(outputs, buffer) socket.send_multipart((buffer, ), copy=False) + + # Signal to main thread that ENGINE_CORE_DEAD was sent. + self.errored_sent_event.set() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 2fe1873b6440..51b27082dfc4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -import signal import weakref from abc import ABC, abstractmethod from typing import List, Optional, Type, Union @@ -11,9 +10,10 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket -from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, - EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion, EngineCoreResetPrefixCache) +from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, + EngineCoreProfile, EngineCoreRequest, + EngineCoreRequestType, EngineCoreRequestUnion, + EngineCoreResetPrefixCache) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor @@ -167,7 +167,7 @@ def __init__( zmq.constants.PUSH) # Start EngineCore in background process. 
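# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): the
# in-band "dead" sentinel that replaces the earlier SIGUSR1 signalling. The
# busy loop catches any exception and enqueues a sentinel instead of
# signalling the parent; the consumer turns the sentinel into a dedicated
# exception type.
import queue
import threading

TOY_DEAD_SENTINEL = b"ENGINE_CORE_DEAD"


class ToyEngineDeadError(Exception):
    pass


def _toy_busy_loop(out_q: queue.Queue) -> None:
    try:
        for step in range(3):
            out_q.put(f"outputs for step {step}")
        raise RuntimeError("simulated failure inside the core loop")
    except Exception:
        out_q.put(TOY_DEAD_SENTINEL)  # report death in-band, no signals


if __name__ == "__main__":
    q: queue.Queue = queue.Queue()
    threading.Thread(target=_toy_busy_loop, args=(q, ), daemon=True).start()
    try:
        while True:
            item = q.get()
            if item == TOY_DEAD_SENTINEL:
                raise ToyEngineDeadError("core reported that it is dead")
            print(item)
    except ToyEngineDeadError as err:
        print(f"client observed: {err!r}")
# ---------------------------------------------------------------------------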
- self.engine_core_errored = False + self.is_engine_dead = False self.proc_handle = BackgroundProcHandle( input_path=input_path, output_path=output_path, @@ -186,22 +186,13 @@ def shutdown(self): self.proc_handle.shutdown() self._finalizer() - def _sigusr1_handler(self): - """ - EngineCoreProc sends SIGUSR1 if it encounters an Exception. - Set self in errored state and begin shutdown. - """ - logger.fatal("Got fatal signal from EngineCore, shutting down.") - self.engine_core_errored = True - self.shutdown() - def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" return (EngineDeadError( "EngineCore encountered an issue. See stack trace " "for the root cause.", - suppress_context=True) if self.engine_core_errored else e) + suppress_context=True) if self.is_engine_dead else e) class SyncMPClient(MPClient): @@ -210,12 +201,6 @@ class SyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - # Setup EngineCore signal handler. - def sigusr1_handler(signum, frame): - self._sigusr1_handler() - - signal.signal(signal.SIGUSR1, sigusr1_handler) - super().__init__( asyncio_mode=False, vllm_config=vllm_config, @@ -227,6 +212,9 @@ def get_output(self) -> EngineCoreOutputs: try: (frame, ) = self.output_socket.recv_multipart(copy=False) + if frame == ENGINE_CORE_DEAD: + self.is_engine_dead = True + raise EngineDeadError engine_core_outputs = self.decoder.decode(frame.buffer) return engine_core_outputs except Exception as e: @@ -266,16 +254,6 @@ class AsyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - # EngineCore sends SIGUSR1 when it gets an Exception. - # NOTE: super().__init__ blocks the event loop until - # background procs are setup. This handler allows us - # to catch issues during startup (e.g. OOM). We switch - # to a signal handler in the event loop __init__. - def sigusr1_handler(signum, frame): - self._sigusr1_handler() - - # signal.signal(signal.SIGUSR1, sigusr1_handler) - # Initialize EngineCore + all background processes. super().__init__( asyncio_mode=True, @@ -300,6 +278,9 @@ async def _process_outputs_socket_loop(self): try: while True: (frame, ) = await self.output_socket.recv_multipart(copy=False) + if frame == ENGINE_CORE_DEAD: + self.is_engine_dead = True + raise EngineDeadError outputs = self.decoder.decode(frame.buffer) self.outputs_queue.put_nowait(outputs) except Exception as e: diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 5a5c29a3eb82..1daf71830c00 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -362,7 +362,6 @@ def signal_handler(signum, frame): # which triggers system shutdown. # TODO(rob): handle case where the MQ itself breaks. 
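# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): the
# (status, payload) reply convention used by the worker busy loop, so that a
# per-call failure travels back over the response queue rather than killing
# the worker or signalling the parent process.
import queue
from enum import Enum, auto


class ToyStatus(Enum):
    SUCCESS = auto()
    FAILURE = auto()


def toy_worker_step(method, response_q: queue.Queue) -> None:
    try:
        result = method()
    except Exception as err:
        response_q.put((ToyStatus.FAILURE, err))  # parent decides what to do
        return
    response_q.put((ToyStatus.SUCCESS, result))


if __name__ == "__main__":
    q: queue.Queue = queue.Queue()
    toy_worker_step(lambda: 40 + 2, q)
    toy_worker_step(lambda: 1 / 0, q)
    for _ in range(2):
        status, payload = q.get()
        print(status.name, payload)
# ---------------------------------------------------------------------------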
- # Log so stack trace order is: Worker -> EngineCore -> AsyncLLM logger.exception("WorkerProc got an Exception:", exc_info=e) # The parent sends a SIGTERM to all worker processes if From f9ef3d811b6a9a3411bee188f7bbe72002156fca Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:24:52 +0000 Subject: [PATCH 062/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/__init__.py | 4 -- vllm/v1/engine/core.py | 6 +++ vllm/v1/engine/core_client.py | 54 ++++++++++++-------------- vllm/v1/executor/multiproc_executor.py | 11 ++---- 4 files changed, 35 insertions(+), 40 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0b44baf4828d..bfba0c7f6a0a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -93,10 +93,6 @@ class EngineCoreOutputs( outputs: List[EngineCoreOutput] scheduler_stats: SchedulerStats - @classmethod - def make_empty(cls): - cls(outputs=[], scheduler_stats=SchedulerStats()) - ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 170645249d73..2f0351ca5688 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -174,6 +174,10 @@ def __init__( args=(output_path, ), daemon=True).start() + # Signal from process_output_socket that EngineDead + # message was sent. Since process_output_socket is a + # daemon thread, we need to ensure this message is + # sent before we exit from the main thread. self.errored_sent_event = threading.Event() # Send Readiness signal to EngineClient. @@ -182,6 +186,8 @@ def __init__( except Exception as e: logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) + raise e + finally: ready_pipe.close() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 51b27082dfc4..8a813463761a 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,7 +2,7 @@ import asyncio import weakref from abc import ABC, abstractmethod -from typing import List, Optional, Type, Union +from typing import Any, List, Optional, Type, Union import zmq import zmq.asyncio @@ -186,6 +186,11 @@ def shutdown(self): self.proc_handle.shutdown() self._finalizer() + def _validate_alive(self, frame: Any): + if frame == ENGINE_CORE_DEAD: + self.is_engine_dead = True + raise EngineDeadError + def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" @@ -212,11 +217,8 @@ def get_output(self) -> EngineCoreOutputs: try: (frame, ) = self.output_socket.recv_multipart(copy=False) - if frame == ENGINE_CORE_DEAD: - self.is_engine_dead = True - raise EngineDeadError - engine_core_outputs = self.decoder.decode(frame.buffer) - return engine_core_outputs + self._validate_alive(frame) + return self.decoder.decode(frame.buffer) except Exception as e: raise self._format_exception(e) from None @@ -253,8 +255,6 @@ class AsyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - - # Initialize EngineCore + all background processes. super().__init__( asyncio_mode=True, vllm_config=vllm_config, @@ -262,39 +262,35 @@ def __init__(self, vllm_config: VllmConfig, log_stats=True, ) - # ZMQ IO. Run it in background task so that we can - # overlap with AsyncLLM.output_handler_loop. This - # works because ZMQ IO releases the GIL. 
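# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, no ZMQ dependency): the
# shape used by the async client here -- a background task reads from the
# transport and forwards both results and exceptions through an asyncio.Queue,
# so the caller re-raises failures in its own task.
import asyncio


async def _toy_main() -> None:
    out_q: asyncio.Queue = asyncio.Queue()

    async def reader() -> None:
        try:
            for i in range(3):
                await asyncio.sleep(0)  # stand-in for awaiting a socket recv
                out_q.put_nowait(f"outputs {i}")
            raise ConnectionError("simulated transport failure")
        except Exception as err:
            out_q.put_nowait(err)  # forward instead of swallowing

    reader_task = asyncio.create_task(reader())
    while True:
        item = await out_q.get()
        if isinstance(item, Exception):
            print(f"consumer re-raises: {item!r}")
            break
        print(item)
    await reader_task  # already finished; its exception was handled inside


if __name__ == "__main__":
    asyncio.run(_toy_main())
# ---------------------------------------------------------------------------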
- self.queue_task: Optional[asyncio.Task] = None self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, Exception]] = asyncio.Queue() + self.queue_task: Optional[asyncio.Task] = None def shutdown(self): super().shutdown() if queue_task := getattr(self, "queue_task", None): queue_task.cancel() - async def _process_outputs_socket_loop(self): - try: - while True: - (frame, ) = await self.output_socket.recv_multipart(copy=False) - if frame == ENGINE_CORE_DEAD: - self.is_engine_dead = True - raise EngineDeadError - outputs = self.decoder.decode(frame.buffer) - self.outputs_queue.put_nowait(outputs) - except Exception as e: - self.outputs_queue.put_nowait(e) - async def get_output_async(self) -> EngineCoreOutputs: - # Start output loop on the first call. if self.queue_task is None: - self.queue_task = asyncio.create_task( - self._process_outputs_socket_loop()) - # NOTE: if an exception arises processing the socket, - # the exception is forwarded to the queue. + async def process_outputs_socket(): + try: + (frame, ) = await self.output_socket.recv_multipart( + copy=False) + self._validate_alive(frame) + self.outputs_queue.put_nowait(frame.buffer) + except Exception as e: + self.outputs_queue.put_nowait(e) + + # Run ZMQ IO (which releases the GIL) in a background task + # to overlap with this task (run_output_handler). + self.queue_task = asyncio.create_task(process_outputs_socket()) + + # If an exception arises in process_outputs_socket task, + # it is forwarded to the outputs_queue so we can raise it + # from this (run_output_handler) task to shut down the server. outputs = await self.outputs_queue.get() if isinstance(outputs, Exception): raise self._format_exception(outputs) from None diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 1daf71830c00..ce0522b81137 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -295,14 +295,14 @@ def wait_for_ready( ready_pipe = unready_proc_handle.ready_pipe[0] try: + # Wait until the WorkerProc is ready. response = ready_pipe.recv() - if getattr(response, "status", None) != "READY_TO_LOAD": + if response["status"] != "READY": raise e - assert hasattr(response, "handle") + # Extract the message queue handle. 
mq_handle = pickle.loads(response["handle"]) - assert isinstance(mq_handle, Handle) - + print(f"{mq_handle=}") worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) return WorkerProcHandle.from_unready_handle( unready_proc_handle, worker_response_mq) @@ -353,9 +353,6 @@ def signal_handler(signum, frame): worker.worker_busy_loop() - except SystemExit: - logger.debug("Worker interrupted.") - except Exception as e: # NOTE: if an Exception arises in busy_loop, we send # a FAILURE message over the MQ RPC to notify the Executor, From 95c249f76a516684a7ef3b5bb6114ef6aeb198f6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:25:37 +0000 Subject: [PATCH 063/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 8a813463761a..48afca8de536 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -272,7 +272,6 @@ def shutdown(self): queue_task.cancel() async def get_output_async(self) -> EngineCoreOutputs: - if self.queue_task is None: async def process_outputs_socket(): From 030c671efb8076ec28307e99c0cae893f7385040 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:25:58 +0000 Subject: [PATCH 064/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 48afca8de536..644c32213458 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -273,7 +273,8 @@ def shutdown(self): async def get_output_async(self) -> EngineCoreOutputs: if self.queue_task is None: - + # Run ZMQ IO (which releases the GIL) in a background task + # to overlap with this task (run_output_handler). async def process_outputs_socket(): try: (frame, ) = await self.output_socket.recv_multipart( @@ -283,8 +284,6 @@ async def process_outputs_socket(): except Exception as e: self.outputs_queue.put_nowait(e) - # Run ZMQ IO (which releases the GIL) in a background task - # to overlap with this task (run_output_handler). 
self.queue_task = asyncio.create_task(process_outputs_socket()) # If an exception arises in process_outputs_socket task, From 1bdb212d1ac80856f9fe31572f3d6b7665a6d058 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:26:50 +0000 Subject: [PATCH 065/130] cleanup Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 644c32213458..fdd3e28b724e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -205,7 +205,6 @@ class SyncMPClient(MPClient): def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor]): - super().__init__( asyncio_mode=False, vllm_config=vllm_config, From 25412a0a3f08716c91cff6f2d72a5026cd496f55 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:33:12 +0000 Subject: [PATCH 066/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core_client.py | 4 +--- vllm/v1/engine/exceptions.py | 5 ++++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index fdd3e28b724e..7e0dba5c4ec4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -189,14 +189,12 @@ def shutdown(self): def _validate_alive(self, frame: Any): if frame == ENGINE_CORE_DEAD: self.is_engine_dead = True - raise EngineDeadError + raise EngineDeadError() def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" return (EngineDeadError( - "EngineCore encountered an issue. See stack trace " - "for the root cause.", suppress_context=True) if self.is_engine_dead else e) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 34ec1f6b0cd0..ff74556cc160 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,7 +8,10 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - super().__init__(args, kwargs) + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. \ + See stack trace for the root cause issue." + + super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by # silencing irrelevant ZMQError. 
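# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): giving a
# custom exception a fixed default message and optionally suppressing the
# implicit "During handling of the above exception..." chaining, which is the
# purpose of the suppress_context flag in the diff above.
class ToyDeadError(Exception):

    def __init__(self, *args, suppress_context: bool = False, **kwargs):
        message = "Toy engine died. See stack trace for the root cause."
        super().__init__(message, *args, **kwargs)
        self.__suppress_context__ = suppress_context


if __name__ == "__main__":
    try:
        try:
            raise ValueError("irrelevant transport error")
        except ValueError:
            # If this propagated, the ValueError context would be hidden
            # from the printed traceback.
            raise ToyDeadError(suppress_context=True)
    except ToyDeadError as err:
        print(err, "| context suppressed:", err.__suppress_context__)
# ---------------------------------------------------------------------------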
self.__suppress_context__ = suppress_context From 7cf0647ab0352578c9ac08a265db4ea8e7ff1680 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 17:38:35 +0000 Subject: [PATCH 067/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/__init__.py | 3 --- vllm/v1/engine/core.py | 15 ++++++++------- vllm/v1/engine/core_client.py | 9 ++++----- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index bfba0c7f6a0a..b05ef3cc8c74 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -94,9 +94,6 @@ class EngineCoreOutputs( scheduler_stats: SchedulerStats -ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' - - @dataclass class EngineCoreProfile: is_start: bool diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2f0351ca5688..9dadc0eb5c19 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -18,10 +18,9 @@ from vllm.utils import zmq_socket_ctx from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion, - EngineCoreResetPrefixCache) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -144,6 +143,8 @@ def reset_prefix_cache(self): class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" + ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' + def __init__( self, input_path: str, @@ -274,7 +275,7 @@ def _send_engine_dead(self): # Put ENGINE_CORE_DEAD to the front of the queue. with self.output_queue.mutex: self.output_queue.queue.clear() - self.output_queue.put_nowait(ENGINE_CORE_DEAD) + self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. 
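# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): the role
# of the "sent" Event used here. The socket writer runs in a daemon thread, so
# the main thread waits for an explicit confirmation before exiting, otherwise
# the final dead-engine message could be dropped during process teardown.
import queue
import threading

_SENTINEL = b"DEAD"


def _toy_writer(out_q: queue.Queue, sent_event: threading.Event) -> None:
    while True:
        msg = out_q.get()
        if msg == _SENTINEL:
            print("writer: flushed dead-engine message")
            sent_event.set()  # confirm delivery before the process exits
            return
        print(f"writer: sent {msg!r}")


if __name__ == "__main__":
    q: queue.Queue = queue.Queue()
    sent = threading.Event()
    threading.Thread(target=_toy_writer, args=(q, sent), daemon=True).start()
    q.put("normal outputs")
    q.put(_SENTINEL)
    if not sent.wait(timeout=10.0):
        print("dead-engine message may not have been delivered")
# ---------------------------------------------------------------------------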
if not self.errored_sent_event.wait(timeout=10): @@ -321,8 +322,8 @@ def process_output_socket(self, output_path: str): with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: outputs = self.output_queue.get() - if outputs == ENGINE_CORE_DEAD: - socket.send_multipart((ENGINE_CORE_DEAD, ), copy=False) + if outputs == EngineCoreProc.ENGINE_CORE_DEAD: + socket.send_multipart((outputs, ), copy=False) break encoder.encode_into(outputs, buffer) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7e0dba5c4ec4..b52a5400c5b1 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -10,10 +10,9 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket -from vllm.v1.engine import (ENGINE_CORE_DEAD, EngineCoreOutputs, - EngineCoreProfile, EngineCoreRequest, - EngineCoreRequestType, EngineCoreRequestUnion, - EngineCoreResetPrefixCache) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, + EngineCoreRequest, EngineCoreRequestType, + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor @@ -187,7 +186,7 @@ def shutdown(self): self._finalizer() def _validate_alive(self, frame: Any): - if frame == ENGINE_CORE_DEAD: + if frame == EngineCoreProc.ENGINE_CORE_DEAD: self.is_engine_dead = True raise EngineDeadError() From 352da94dc7cfcbf8d2d0a5a860ab40910c98312c Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:27:35 +0000 Subject: [PATCH 068/130] it starts? Signed-off-by: rshaw@neuralmagic.com --- examples/online_serving/openai_completion_client.py | 7 ++++--- vllm/v1/engine/async_llm.py | 8 ++++++-- vllm/v1/engine/core.py | 8 ++------ vllm/v1/engine/core_client.py | 2 +- vllm/v1/engine/exceptions.py | 3 +-- vllm/v1/executor/multiproc_executor.py | 7 +++---- vllm/v1/utils.py | 7 +++++-- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 06b93d7d1931..20fa043d9670 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -16,14 +16,15 @@ model = models.data[0].id # Completion API -stream = False +stream = True completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - n=2, + # n=2, stream=stream, - logprobs=3) +) +# logprobs=3) print("Completion results:") if stream: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d80dedae5d85..22e882d49976 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -376,7 +376,11 @@ async def reset_prefix_cache(self) -> None: @property def is_running(self) -> bool: - return not self.errored + # Have not started the loop yet. 
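# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition; toy names, not vLLM code): deriving
# liveness from the background task handle, in the spirit of the properties
# here. Before the handler has started, the client optimistically reports
# "running"; once the task finishes (normally or with an error) it reports
# errored. The real properties also consult the engine-core client.
import asyncio
from typing import Optional


class ToyClient:

    def __init__(self):
        self.output_handler: Optional[asyncio.Task] = None

    @property
    def is_running(self) -> bool:
        if self.output_handler is None:  # loop not started yet
            return True
        return not self.output_handler.done()

    @property
    def errored(self) -> bool:
        return not self.is_running


async def _toy_main() -> None:
    client = ToyClient()
    print("before start:", client.is_running, client.errored)

    async def handler() -> None:
        raise RuntimeError("simulated output-handler crash")

    client.output_handler = asyncio.create_task(handler())
    await asyncio.sleep(0)  # let the handler run and fail
    print("after crash:", client.is_running, client.errored)
    client.output_handler.exception()  # retrieve it to avoid a gc warning


if __name__ == "__main__":
    asyncio.run(_toy_main())
# ---------------------------------------------------------------------------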
+ if self.output_handler is None: + return True + + return not self.output_handler.done() @property def is_stopped(self) -> bool: @@ -384,7 +388,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core.is_engine_dead + return (self.engine_core.is_engine_dead or not self.is_running) @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9dadc0eb5c19..dacd95f96fd5 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -47,6 +47,7 @@ def __init__( # Setup Model. self.model_executor = executor_class(vllm_config) + print("EXECUTOR_READY") # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( @@ -168,6 +169,7 @@ def __init__( EngineCoreRequestUnion] = queue.Queue() self.output_queue: queue.Queue[Union[ bytes, EngineCoreOutputs]] = queue.Queue() + self.errored_sent_event = threading.Event() threading.Thread(target=self.process_input_socket, args=(input_path, ), daemon=True).start() @@ -175,12 +177,6 @@ def __init__( args=(output_path, ), daemon=True).start() - # Signal from process_output_socket that EngineDead - # message was sent. Since process_output_socket is a - # daemon thread, we need to ensure this message is - # sent before we exit from the main thread. - self.errored_sent_event = threading.Event() - # Send Readiness signal to EngineClient. ready_pipe.send({"status": "READY"}) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b52a5400c5b1..e1b9d65e3340 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -289,7 +289,7 @@ async def process_outputs_socket(): if isinstance(outputs, Exception): raise self._format_exception(outputs) from None - return outputs + return self.decoder.decode(outputs) async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index ff74556cc160..266745124f37 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,8 +8,7 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. \ - See stack trace for the root cause issue." + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause issue." # noqa: E501 super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ce0522b81137..00dbb510b382 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -80,7 +80,7 @@ def _init_executor(self) -> None: for unready_worker in unready_workers: # NOTE: the WorkerProc wraps startup in a try ... catch # so if there are any issues in loading in a WorkerProcess - # (e.g. OOM), an Exception will be raised here. + # (e.g. OOM), an Exception will be caught here. worker = WorkerProc.wait_for_ready(unready_worker) self.workers.append(worker) @@ -302,7 +302,6 @@ def wait_for_ready( # Extract the message queue handle. 
mq_handle = pickle.loads(response["handle"]) - print(f"{mq_handle=}") worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) return WorkerProcHandle.from_unready_handle( unready_proc_handle, worker_response_mq) @@ -378,9 +377,9 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" - method, args, kwargs = self.rpc_broadcast_mq.dequeue() - while True: + method, args, kwargs = self.rpc_broadcast_mq.dequeue() + try: if isinstance(method, str): func = getattr(self.worker, method) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 2f5168296e67..396f4bac75a2 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -102,13 +102,13 @@ def __init__( process_kwargs: Dict[Any, Any], ): context = get_mp_context() - self.reader, writer = context.Pipe(duplex=False) + self.reader, self.writer = context.Pipe(duplex=False) self.process_name = process_name assert ("ready_pipe" not in process_kwargs and "input_path" not in process_kwargs and "output_path" not in process_kwargs) - process_kwargs["ready_pipe"] = writer + process_kwargs["ready_pipe"] = self.writer process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path @@ -143,6 +143,9 @@ def wait_for_startup(self): except EOFError: e.__suppress_context__ = True raise e from None + finally: + self.reader.close() + self.writer.close() # Note(rob): shutdown function cannot be a bound method, From a69e04096a7114b7f4293482cce7acf014c60734 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:45:44 +0000 Subject: [PATCH 069/130] updated Signed-off-by: rshaw@neuralmagic.com --- .../online_serving/openai_completion_client.py | 7 +++---- vllm/v1/engine/core.py | 3 +-- vllm/v1/engine/core_client.py | 15 ++++++++------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index 20fa043d9670..06b93d7d1931 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -16,15 +16,14 @@ model = models.data[0].id # Completion API -stream = True +stream = False completion = client.completions.create( model=model, prompt="A robot may not injure a human being", echo=False, - # n=2, + n=2, stream=stream, -) -# logprobs=3) + logprobs=3) print("Completion results:") if stream: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index dacd95f96fd5..ae9a33110216 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -47,7 +47,6 @@ def __init__( # Setup Model. self.model_executor = executor_class(vllm_config) - print("EXECUTOR_READY") # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( @@ -248,7 +247,7 @@ def run_busy_loop(self): # 3) Step the engine core. outputs = self.step() - # 5) Put EngineCoreOutputs into the output queue. + # 4) Put EngineCoreOutputs into the output queue. 
self.output_queue.put_nowait(outputs) def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e1b9d65e3340..33eaa2c211ca 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -185,8 +185,8 @@ def shutdown(self): self.proc_handle.shutdown() self._finalizer() - def _validate_alive(self, frame: Any): - if frame == EngineCoreProc.ENGINE_CORE_DEAD: + def _validate_alive(self, buffer: Any): + if buffer == EngineCoreProc.ENGINE_CORE_DEAD: self.is_engine_dead = True raise EngineDeadError() @@ -213,7 +213,7 @@ def get_output(self) -> EngineCoreOutputs: try: (frame, ) = self.output_socket.recv_multipart(copy=False) - self._validate_alive(frame) + self._validate_alive(frame.buffer) return self.decoder.decode(frame.buffer) except Exception as e: raise self._format_exception(e) from None @@ -273,10 +273,11 @@ async def get_output_async(self) -> EngineCoreOutputs: # to overlap with this task (run_output_handler). async def process_outputs_socket(): try: - (frame, ) = await self.output_socket.recv_multipart( - copy=False) - self._validate_alive(frame) - self.outputs_queue.put_nowait(frame.buffer) + while True: + (frame, ) = await self.output_socket.recv_multipart( + copy=False) + self._validate_alive(frame.buffer) + self.outputs_queue.put_nowait(frame.buffer) except Exception as e: self.outputs_queue.put_nowait(e) From 8dddc206a5ff2c1262b9d84885c57f4f782cebb0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:47:47 +0000 Subject: [PATCH 070/130] updated Signed-off-by: rshaw@neuralmagic.com --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7ef40564c5bd..3a94511e3992 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -187,6 +187,7 @@ steps: commands: # split the test to avoid interference - VLLM_USE_V1=1 pytest -v -s v1/core + - VLLM_USE_V1=1 pytest -v -s v1/shutdown - VLLM_USE_V1=1 pytest -v -s v1/engine - VLLM_USE_V1=1 pytest -v -s v1/sample - VLLM_USE_V1=1 pytest -v -s v1/worker From 7b48b87a2ffb52c2f0e352235ebadbf4ad22d58b Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 18:58:14 +0000 Subject: [PATCH 071/130] updated Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core.py | 6 ++---- vllm/v1/engine/exceptions.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ae9a33110216..93b433e67f36 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -268,12 +268,10 @@ def _send_engine_dead(self): """Send EngineDead status to the EngineCoreClient.""" # Put ENGINE_CORE_DEAD to the front of the queue. - with self.output_queue.mutex: - self.output_queue.queue.clear() - self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) + self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. - if not self.errored_sent_event.wait(timeout=10): + if not self.errored_sent_event.wait(timeout=10.): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. Please report this issue.") diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 266745124f37..aa8a1227420e 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,7 +8,7 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. 
Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause issue." # noqa: E501 + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause." # noqa: E501 super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by From 74008529a3307414ea059309423981557e50bf54 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 19:12:26 +0000 Subject: [PATCH 072/130] updated Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_forward_error.py | 3 ++- tests/v1/shutdown/test_processor_error.py | 1 + tests/v1/shutdown/test_startup_error.py | 1 + vllm/v1/engine/async_llm.py | 2 ++ vllm/v1/executor/multiproc_executor.py | 2 +- 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 5017bc21ac71..6cc143bfaddd 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that we handle an Error in model forward and shutdown.""" import asyncio @@ -84,7 +85,7 @@ async def generate(request_id: str): timeout_s=60, ) - # NOTE: shutdown is handled by the API Server. If an exception + # NOTE: shutdown is handled by the API Server if an exception # occurs, so it is expected that we would need to call this. async_llm.shutdown() diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 056851025eca..a98ed6f12324 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test error handling in Processor. 
Should not impact other reqs.""" import asyncio diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 25f2b77b2f3d..0516e22b31f0 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Test that we handle a startup Error and shutdown.""" import pytest diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 22e882d49976..dcf6c78930a4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -388,6 +388,8 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: + print(f"{self.engine_core.is_engine_dead=}") + print(f"{self.is_running=}") return (self.engine_core.is_engine_dead or not self.is_running) @property diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 00dbb510b382..8b6777f769db 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -389,7 +389,7 @@ def worker_busy_loop(self): except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) - logger.exception("WorkerProc hit an exception: %s", exc_info=e) + logger.exception("WorkerProc hit an exception:", exc_info=e) continue self.worker_response_mq.enqueue( From 80317a0fb62ed9a01ea00b36b03773dc8d855965 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:18:11 +0000 Subject: [PATCH 073/130] updated Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_processor_error.py | 6 ++++-- vllm/v1/engine/core_client.py | 15 ++++++--------- vllm/v1/utils.py | 15 +++++---------- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index a98ed6f12324..2d0a382ba99b 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -46,12 +46,14 @@ async def generate(request_id: str): assert not async_llm.errored # This should be no problem. + EXPECTED_TOKENS = 5 outputs = [] async for out in async_llm.generate( "Hello my name is", request_id="abc", - sampling_params=SamplingParams(max_tokens=5)): + sampling_params=SamplingParams(max_tokens=EXPECTED_TOKENS)): outputs.append(out) - assert len(outputs) == 5 + print(f"{outputs=}") + assert len(outputs) == EXPECTED_TOKENS async_llm.shutdown() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 33eaa2c211ca..b2911abcfd1d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -import weakref from abc import ABC, abstractmethod from typing import Any, List, Optional, Type, Union @@ -152,11 +151,6 @@ def __init__( zmq.asyncio.Context() # type: ignore[attr-defined] if asyncio_mode else zmq.Context()) # type: ignore[attr-defined] - # Note(rob): shutdown function cannot be a bound method, - # else the gc cannot collect the object. - self._finalizer = weakref.finalize(self, lambda x: x.destroy(linger=0), - self.ctx) - # Paths and sockets for IPC. 
output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() @@ -177,13 +171,16 @@ def __init__( "executor_class": executor_class, "log_stats": log_stats, }) - self.proc_handle.wait_for_startup() + self.proc_handle.wait_for_startup(self.shutdown) def shutdown(self): """Clean up background resources.""" - self.proc_handle.shutdown() - self._finalizer() + if ctx := getattr(self, "ctx", None): + ctx.destroy(linger=0) + + if proc_handle := getattr(self, "proc_handle", None): + proc_handle.shutdown() def _validate_alive(self, buffer: Any): if buffer == EngineCoreProc.ENGINE_CORE_DEAD: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 396f4bac75a2..b40d36858f03 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -112,11 +112,6 @@ def __init__( process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path - # Flag for shutdown state. BackgroundProcs send signals - # when errors occur which calls shutdown(). If we are in - # startup loop when signaled, this flag breaks us out. - self.shutting_down = False - # Run busy loop in background process. self.proc = context.Process(target=target_fn, kwargs=process_kwargs) self._finalizer = weakref.finalize(self, shutdown, self.proc, @@ -124,24 +119,24 @@ def __init__( self.proc.start() def shutdown(self): - self.shutting_down = True self._finalizer() - def wait_for_startup(self): + def wait_for_startup(self, shutdown_callback: Callable): """Wait until the background process is ready.""" e = Exception(f"{self.process_name} initialization failed due to " "an exception in a background process. See stack trace " "for root cause.") - while not self.reader.poll(timeout=1): - if self.shutting_down: - raise e try: if self.reader.recv()["status"] != "READY": raise e except EOFError: e.__suppress_context__ = True + shutdown_callback() + raise e from None + except Exception: + shutdown_callback() raise e from None finally: self.reader.close() From ca3796021d0ea6810940e4d1ea49c6816b7fc640 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:28:48 +0000 Subject: [PATCH 074/130] nits Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_processor_error.py | 6 ++++-- vllm/v1/engine/async_llm.py | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 2d0a382ba99b..7f387b6693ad 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -8,6 +8,7 @@ from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt +from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineGenerateError @@ -51,9 +52,10 @@ async def generate(request_id: str): async for out in async_llm.generate( "Hello my name is", request_id="abc", - sampling_params=SamplingParams(max_tokens=EXPECTED_TOKENS)): + sampling_params=SamplingParams( + max_tokens=EXPECTED_TOKENS, + output_kind=RequestOutputKind.DELTA)): outputs.append(out) - print(f"{outputs=}") assert len(outputs) == EXPECTED_TOKENS async_llm.shutdown() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dcf6c78930a4..22e882d49976 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -388,8 +388,6 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - print(f"{self.engine_core.is_engine_dead=}") - 
print(f"{self.is_running=}") return (self.engine_core.is_engine_dead or not self.is_running) @property From 2d41499119b6085b051225fb50a57b1d3fad9129 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:35:28 +0000 Subject: [PATCH 075/130] fix test for bunched streaming Signed-off-by: rshaw@neuralmagic.com --- tests/v1/shutdown/test_processor_error.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 7f387b6693ad..681b9bb19162 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -56,6 +56,10 @@ async def generate(request_id: str): max_tokens=EXPECTED_TOKENS, output_kind=RequestOutputKind.DELTA)): outputs.append(out) - assert len(outputs) == EXPECTED_TOKENS + + generated_tokens = [] + for out in outputs: + generated_tokens.extend(out.outputs[0].token_ids) + assert len(generated_tokens) == EXPECTED_TOKENS async_llm.shutdown() From 4a39d39dac7a831e01d46dd3268367c3014c8946 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Sat, 8 Feb 2025 20:37:15 +0000 Subject: [PATCH 076/130] tweak typing Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/async_llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 22e882d49976..f7d7232da5e2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -215,13 +215,13 @@ async def generate( # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() if q.qsize() > 0 else await q.get() - if isinstance(out, EngineDeadError): + if isinstance(out, Exception): raise out # Coalesce any additional queued outputs while not q.empty(): next_out = q.get_nowait() - if isinstance(next_out, EngineDeadError): + if isinstance(next_out, Exception): raise out if sampling_params.output_kind == RequestOutputKind.DELTA: out.add(next_out) From 43360f032d9d7861cd124ec1b83f292c53844557 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 10:31:35 -0500 Subject: [PATCH 077/130] Update tests/v1/shutdown/test_forward_error.py Co-authored-by: Cyrus Leung --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 6cc143bfaddd..a8f4b6097c90 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -16,7 +16,7 @@ def evil_forward(self, *args, **kwargs): - """Evil forward method that raise an exception after 5 calls.""" + """Evil forward method that raise an exception after 10 calls.""" NUMBER_OF_GOOD_PASSES = 10 if not hasattr(self, "num_calls"): From 218d095af6a7462439b51c084810449bf0ef20fd Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Mon, 10 Feb 2025 16:30:37 +0000 Subject: [PATCH 078/130] pre commit Signed-off-by: rshaw@neuralmagic.com --- vllm/v1/engine/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f9bd1dfe93ec..beb994a37456 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -164,7 +164,8 @@ def __init__( # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
self.input_queue: queue.Queue[Tuple[EngineCoreRequestType, Any]] = queue.Queue() - self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() + self.output_queue: queue.Queue[Union[EngineCoreOutputs, + bytes]] = queue.Queue() self.errored_sent_event = threading.Event() threading.Thread(target=self.process_input_socket, args=(input_path, ), @@ -175,7 +176,7 @@ def __init__( # Send Readiness signal to EngineClient. ready_pipe.send({"status": "READY"}) - + except Exception as e: logger.exception("EngineCore got error at startup:", exc_info=e) ready_pipe.send({"status": "FAILED"}) From c395634dd1a86e2f93bc6dc0a1ae9475617b3f1b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:38:14 -0500 Subject: [PATCH 079/130] Update tests/v1/shutdown/test_forward_error.py Co-authored-by: Russell Bryant --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index a8f4b6097c90..e0d27c47a8ec 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -105,7 +105,7 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + m.setattr(LlamaForCausalLM, "forward", evil_forward) llm = LLM(model="meta-llama/Llama-3.2-1B", enforce_eager=True, From 042c486b27b065d6d493f1ca78f2ba4ac4dee1fe Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:42:55 -0500 Subject: [PATCH 080/130] Update vllm/v1/engine/core.py Co-authored-by: Russell Bryant --- vllm/v1/engine/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index beb994a37456..e012f6a00a67 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -128,7 +128,7 @@ def step(self) -> EngineCoreOutputs: return engine_core_outputs def shutdown(self): - if model_executor := getattr(self, "model_executor", None): + if self.model_executor: model_executor.shutdown() def profile(self, is_start: bool = True): From b5a7b6f26ff03a384d532fad164db5cd86737698 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:45:34 -0500 Subject: [PATCH 081/130] Update vllm/v1/engine/core.py Co-authored-by: Russell Bryant --- vllm/v1/engine/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e012f6a00a67..c53144f0bd18 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -234,8 +234,6 @@ def run_busy_loop(self): # Break out the loop so we can log_stats in step(). if self.log_stats: break - except Exception: - raise # 2) Handle any new client requests. 
while not self.input_queue.empty(): From dab77cf6fceae25d2a74249a1457e31daa1fbc6f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 10 Feb 2025 17:47:50 -0500 Subject: [PATCH 082/130] Update tests/v1/shutdown/test_forward_error.py Co-authored-by: Russell Bryant --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index e0d27c47a8ec..217d21957e24 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -63,7 +63,7 @@ async def generate(request_id: str): tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] outputs = await asyncio.gather(*tasks) - # Every request should have get an EngineDeadError. + # Every request should get an EngineDeadError. for output in outputs: assert isinstance(output, EngineDeadError) From a0102812625a4038201bc3db54db496c791a718e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 00:48:41 +0000 Subject: [PATCH 083/130] intermed tensors Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 217d21957e24..5204764cdf44 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -27,7 +27,7 @@ def evil_forward(self, *args, **kwargs): raise Exception("Simulated illegal memory access on Rank 0!") self.num_calls += 1 - return self.model(*args, **kwargs, intermediate_tensors=None) + return self.model(*args, **kwargs) @pytest.mark.asyncio @@ -115,8 +115,8 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, llm.generate("Hello my name is Robert and I") # Confirm all the processes are cleaned up. 
- wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # wait_for_gpu_memory_to_clear( + # devices=list(range(tensor_parallel_size)), + # threshold_bytes=2 * 2**30, + # timeout_s=60, + # ) From adebbe3bdb0863f327053b08b3a49c57c3e66f31 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 20:25:19 +0000 Subject: [PATCH 084/130] added multiproc on/off tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 20 +++++++++++--------- tests/v1/shutdown/test_processor_error.py | 6 ++++-- tests/v1/shutdown/test_startup_error.py | 8 +++++--- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 5204764cdf44..e3bdca0b0356 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -32,13 +32,16 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.asyncio @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_model_error(monkeypatch, tensor_parallel_size): +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size, + enable_multiprocessing): if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) @@ -99,7 +102,6 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") MP_VALUE = "1" if enable_multiprocessing else "0" m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) @@ -114,9 +116,9 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, with pytest.raises(EngineDeadError): llm.generate("Hello my name is Robert and I") - # Confirm all the processes are cleaned up. - # wait_for_gpu_memory_to_clear( - # devices=list(range(tensor_parallel_size)), - # threshold_bytes=2 * 2**30, - # timeout_s=60, - # ) + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 681b9bb19162..8fde7e619585 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -14,10 +14,12 @@ @pytest.mark.asyncio -async def test_async_llm_processor_error(monkeypatch): +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +async def test_async_llm_processor_error(monkeypatch, enable_multiprocessing): with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", enforce_eager=True) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 0516e22b31f0..2756a68d36e0 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -29,13 +29,16 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size): +@pytest.mark.parametrize("enable_multiprocessing", [True, False]) +def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size, + enable_multiprocessing): if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) @@ -67,7 +70,6 @@ def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") MP_VALUE = "1" if enable_multiprocessing else "0" m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) From f23bc2557d8bebb773de5cfa021062870ddea039 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 21:06:40 +0000 Subject: [PATCH 085/130] wip sync Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index efeda75d8113..99f9fbe41ab9 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -329,7 +329,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats=log_stats, ) - self.outputs_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() + self.outputs_queue: queue.Queue[Union[EngineCoreOutputs, + Exception]] = queue.Queue() # Ensure that the outputs socket processing thread does not have # a ref to the client which prevents gc. 
@@ -363,7 +364,7 @@ def process_outputs_socket(): self._validate_alive(frame.buffer) outputs = decoder.decode(frame.buffer) except Exception as e: - raise self._format_exception(e) from None + self.outputs_queue.put_nowait(e) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) @@ -381,7 +382,13 @@ def process_outputs_socket(): self.output_queue_thread.start() def get_output(self) -> EngineCoreOutputs: - return self.outputs_queue.get() + # If an exception arises in process_outputs_socket task, + # it is forwarded to the outputs_queue so we can raise it + # from this (run_output_handler) task to shut down the server. + outputs = self.outputs_queue.get() + if isinstance(outputs, Exception): + raise self._format_exception(outputs) from None + return outputs def _send_input(self, request_type: EngineCoreRequestType, request: Any) -> None: From ae1dc32f4b9eabc1334022cf3ae68e0eafd07605 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 21:41:03 +0000 Subject: [PATCH 086/130] check for correct exception Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index e3bdca0b0356..801a99d1bb02 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -113,12 +113,13 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - with pytest.raises(EngineDeadError): + with pytest.raises( + EngineDeadError if enable_multiprocessing else Exception): llm.generate("Hello my name is Robert and I") - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) From c2afedc980eaa10c7695816a904d95c360eb6b67 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 25 Mar 2025 21:52:48 +0000 Subject: [PATCH 087/130] wip llm tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_startup_error.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 2756a68d36e0..0c373ffc3cde 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -77,7 +77,10 @@ def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, # Monkeypatch an error in the model. 
monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - with pytest.raises(Exception, match="initialization failed"): + with pytest.raises( + Exception, + match="initialization failed" + if enable_multiprocessing else "Simulated Error in startup!"): _ = LLM(model="meta-llama/Llama-3.2-1B", enforce_eager=True, tensor_parallel_size=tensor_parallel_size) From 89a5461676137dfade0ec8db678ddb23c7f1b54e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 27 Mar 2025 23:54:34 +0000 Subject: [PATCH 088/130] removed tests of LLM engine without MP Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 139 +++++++++++----------- tests/v1/shutdown/test_processor_error.py | 88 +++++++------- tests/v1/shutdown/test_startup_error.py | 57 ++++----- 3 files changed, 144 insertions(+), 140 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 801a99d1bb02..b8e7f76c32b3 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -14,6 +14,8 @@ from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError +MODELS = ["meta-llama/Llama-3.2-1B"] + def evil_forward(self, *args, **kwargs): """Evil forward method that raise an exception after 10 calls.""" @@ -32,72 +34,73 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.asyncio @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -async def test_async_llm_model_error(monkeypatch, tensor_parallel_size, - enable_multiprocessing): +@pytest.mark.parametrize("model", MODELS) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, + model: str) -> None: + """Test that AsyncLLM propagates a forward pass error and frees memory. + + AsyncLLM always uses an MP client. + """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") - with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - - # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - - engine_args = AsyncEngineArgs( - model="meta-llama/Llama-3.2-1B", - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) - async_llm = AsyncLLM.from_engine_args(engine_args) - - async def generate(request_id: str): - generator = async_llm.generate("Hello my name is", - request_id=request_id, - sampling_params=SamplingParams()) - try: - async for _ in generator: - pass - except Exception as e: - return e - - NUM_REQS = 3 - tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] - outputs = await asyncio.gather(*tasks) - - # Every request should get an EngineDeadError. - for output in outputs: - assert isinstance(output, EngineDeadError) - - # AsyncLLM should be errored. - assert async_llm.errored - - # We should not be able to make another request. - with pytest.raises(EngineDeadError): - async for _ in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams()): - raise Exception("We should not get here.") - - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - # NOTE: shutdown is handled by the API Server if an exception - # occurs, so it is expected that we would need to call this. 
- async_llm.shutdown() - - -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + generator = async_llm.generate("Hello my name is", + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should get an EngineDeadError. + for output in outputs: + assert isinstance(output, EngineDeadError) + + # AsyncLLM should be errored. + assert async_llm.errored + + # We should not be able to make another request. + with pytest.raises(EngineDeadError): + async for _ in async_llm.generate("Hello my name is", + request_id="abc", + sampling_params=SamplingParams()): + raise Exception("We should not get here.") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + # NOTE: shutdown is handled by the API Server if an exception + # occurs, so it is expected that we would need to call this. + async_llm.shutdown() + + +@pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_llm_model_error(monkeypatch, tensor_parallel_size, - enable_multiprocessing): - +@pytest.mark.parametrize("model", MODELS) +def test_llm_model_error(monkeypatch, tensor_parallel_size: int, + enable_multiprocessing: bool, model: str) -> None: + """Test that LLM propagates a forward pass error and frees memory. + TODO(andy) - LLM without multiprocessing. + """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -109,7 +112,7 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, # Monkeypatch an error in the model. m.setattr(LlamaForCausalLM, "forward", evil_forward) - llm = LLM(model="meta-llama/Llama-3.2-1B", + llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) @@ -117,9 +120,9 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size, EngineDeadError if enable_multiprocessing else Exception): llm.generate("Hello my name is Robert and I") - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 8fde7e619585..eb1341cbddf3 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -12,56 +12,56 @@ from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineGenerateError +MODELS = ["meta-llama/Llama-3.2-1B"] -@pytest.mark.asyncio -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -async def test_async_llm_processor_error(monkeypatch, enable_multiprocessing): - - with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - engine_args = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", - enforce_eager=True) - async_llm = AsyncLLM.from_engine_args(engine_args) +@pytest.mark.asyncio +@pytest.mark.parametrize("model", MODELS) +async def test_async_llm_processor_error(model: str) -> None: + """Test that AsyncLLM propagates a processor error. + Test empty tokens prompt (failure) and non-empty prompt (no failure.) + AsyncLLM always uses an MP client. + """ + engine_args = AsyncEngineArgs(model=model, enforce_eager=True) + async_llm = AsyncLLM.from_engine_args(engine_args) - async def generate(request_id: str): - # [] is not allowed and will raise a ValueError in Processor. - generator = async_llm.generate(TokensPrompt([]), - request_id=request_id, - sampling_params=SamplingParams()) - try: - async for _ in generator: - pass - except Exception as e: - return e + async def generate(request_id: str): + # [] is not allowed and will raise a ValueError in Processor. + generator = async_llm.generate(TokensPrompt([]), + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e - NUM_REQS = 3 - tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] - outputs = await asyncio.gather(*tasks) + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) - # Every request should have get an EngineGenerateError. - for output in outputs: - with pytest.raises(EngineGenerateError): - raise output + # Every request should have get an EngineGenerateError. + for output in outputs: + with pytest.raises(EngineGenerateError): + raise output - # AsyncLLM should be errored. - assert not async_llm.errored + # AsyncLLM should be errored. + assert not async_llm.errored - # This should be no problem. - EXPECTED_TOKENS = 5 - outputs = [] - async for out in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams( - max_tokens=EXPECTED_TOKENS, - output_kind=RequestOutputKind.DELTA)): - outputs.append(out) + # This should be no problem. 
+ EXPECTED_TOKENS = 5 + outputs = [] + async for out in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=EXPECTED_TOKENS, + output_kind=RequestOutputKind.DELTA)): + outputs.append(out) - generated_tokens = [] - for out in outputs: - generated_tokens.extend(out.outputs[0].token_ids) - assert len(generated_tokens) == EXPECTED_TOKENS + generated_tokens = [] + for out in outputs: + generated_tokens.extend(out.outputs[0].token_ids) + assert len(generated_tokens) == EXPECTED_TOKENS - async_llm.shutdown() + async_llm.shutdown() diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 0c373ffc3cde..62f6c3186339 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -29,43 +29,44 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -def test_async_llm_startup_error(monkeypatch, model, tensor_parallel_size, - enable_multiprocessing): - +def test_async_llm_startup_error(monkeypatch, model: str, + tensor_parallel_size: int) -> None: + """Test that AsyncLLM propagates an __init__ error & frees memory. + + AsyncLLM always uses an MP client. + """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") - with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - - # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - engine_args = AsyncEngineArgs( - model=model, - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) - # Confirm we get an exception. - with pytest.raises(Exception, match="initialization failed"): - _ = AsyncLLM.from_engine_args(engine_args) + # Confirm we get an exception. + with pytest.raises(Exception, match="initialization failed"): + _ = AsyncLLM.from_engine_args(engine_args) - # Confirm all the processes are cleaned up. - wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("enable_multiprocessing", [True, False]) -def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, - enable_multiprocessing): - +@pytest.mark.parametrize("enable_multiprocessing", [True]) +def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, + enable_multiprocessing: bool) -> None: + """Test that LLM propagates an __init__ error and frees memory. + TODO(andy) - LLM without multiprocessing. 
+ """ + if model != "meta-llama/Llama-3.2-1B": + pytest.skip(reason="Only test meta-llama/Llama-3.2-1B") if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -81,7 +82,7 @@ def test_llm_startup_error(monkeypatch, model, tensor_parallel_size, Exception, match="initialization failed" if enable_multiprocessing else "Simulated Error in startup!"): - _ = LLM(model="meta-llama/Llama-3.2-1B", + _ = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) From f60c8b5f73609aea746fd8d5fbdbda140f94776e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 28 Mar 2025 06:13:47 +0000 Subject: [PATCH 089/130] SyncMPClient & MPClient finalizers works Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 69 ++++++++++++++++++++++++++++++++ vllm/v1/engine/core_client.py | 19 ++++++--- 2 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 tests/v1/shutdown/test_delete.py diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py new file mode 100644 index 000000000000..3e45b8736188 --- /dev/null +++ b/tests/v1/shutdown/test_delete.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Test that we handle a startup Error and shutdown.""" + +import pytest + +from tests.utils import wait_for_gpu_memory_to_clear +from vllm import LLM +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.utils import cuda_device_count_stateless +from vllm.v1.engine.async_llm import AsyncLLM + +MODELS = [ + "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: + """Test that AsyncLLM frees GPU memory upon deletion. + AsyncLLM always uses an MP client. + """ + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + + # Instantiate & delete AsyncLLM + inst = AsyncLLM.from_engine_args(engine_args) + del inst + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +@pytest.mark.parametrize("enable_multiprocessing", [True]) +def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, + enable_multiprocessing: bool) -> None: + """Test that LLM frees GPU memory upon deletion. + TODO(andy) - LLM without multiprocessing. + """ + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + with monkeypatch.context() as m: + + MP_VALUE = "1" if enable_multiprocessing else "0" + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) + + # Instantiate and delete LLM + inst = LLM(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + del inst + + # Confirm all the processes are cleaned up. 
+ wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 99f9fbe41ab9..62acb2ea4303 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,6 +2,7 @@ import asyncio import queue import uuid +import weakref from abc import ABC, abstractmethod from concurrent.futures import Future from dataclasses import dataclass @@ -265,7 +266,9 @@ def __init__( # This will ensure resources created so far are closed # when the client is garbage collected, even if an # exception is raised mid-construction. - self.resources = BackgroundResources(ctx=sync_ctx) + resources = BackgroundResources(ctx=sync_ctx) + self.resources = resources + self._finalizer = weakref.finalize(self, resources) # Paths for IPC. self.output_path = get_open_zmq_ipc_path() @@ -293,7 +296,7 @@ def __init__( def shutdown(self): # Terminate background resources - self.resources() + self._finalizer() def _validate_alive(self, buffer: Any): if buffer == EngineCoreProc.ENGINE_CORE_DEAD: @@ -343,10 +346,13 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], shutdown_path = get_open_zmq_inproc_path() self.resources.shutdown_path = shutdown_path + self_weakref = weakref.ref(self) + def process_outputs_socket(): shutdown_socket = ctx.socket(zmq.PAIR) shutdown_socket.bind(shutdown_path) out_socket = make_zmq_socket(ctx, output_path, zmq.constants.PULL) + local_self = None try: poller = zmq.Poller() poller.register(shutdown_socket) @@ -358,13 +364,16 @@ def process_outputs_socket(): if len(socks) == 2 or socks[0][0] == shutdown_socket: # shutdown signal, exit thread. break - + local_self = self_weakref() + if local_self is None: + # Instance is being gc'd, exit loop + break try: (frame, ) = out_socket.recv_multipart(copy=False) - self._validate_alive(frame.buffer) + local_self._validate_alive(frame.buffer) outputs = decoder.decode(frame.buffer) except Exception as e: - self.outputs_queue.put_nowait(e) + local_self.outputs_queue.put_nowait(e) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) From 9aed319944a1337f430eba4328a36eb2c6b7b498 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 31 Mar 2025 12:47:21 +0000 Subject: [PATCH 090/130] wip delete tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 35 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 3e45b8736188..2c9cb374a4cc 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,8 +4,9 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from vllm import LLM +from vllm import LLM, SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.sampling_params import RequestOutputKind from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM @@ -14,9 +15,10 @@ ] +@pytest.mark.asyncio @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. 
""" @@ -27,9 +29,16 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - # Instantiate & delete AsyncLLM - inst = AsyncLLM.from_engine_args(engine_args) - del inst + # Instantiate AsyncLLM; make request to complete any deferred + # initialization; then delete instance + async_llm = AsyncLLM.from_engine_args(engine_args) + async for _ in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=1, output_kind=RequestOutputKind.DELTA)): + pass + del async_llm # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( @@ -51,15 +60,19 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: - MP_VALUE = "1" if enable_multiprocessing else "0" m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) - # Instantiate and delete LLM - inst = LLM(model=model, - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) - del inst + # Instantiate LLM; make request to complete any deferred + # initialization; then delete instance + llm = LLM(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + # llm.generate( + # "Hello my name is", + # sampling_params=SamplingParams( + # max_tokens=1)) + del llm # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( From be1a23df492cdbfd25fac8e004ca7a0ef54835ab Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 31 Mar 2025 13:40:31 +0000 Subject: [PATCH 091/130] rollback Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 2c9cb374a4cc..5f07984790c2 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,9 +4,8 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from vllm import LLM, SamplingParams +from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.sampling_params import RequestOutputKind from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM @@ -15,10 +14,9 @@ ] -@pytest.mark.asyncio @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. """ @@ -32,12 +30,6 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Instantiate AsyncLLM; make request to complete any deferred # initialization; then delete instance async_llm = AsyncLLM.from_engine_args(engine_args) - async for _ in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams( - max_tokens=1, output_kind=RequestOutputKind.DELTA)): - pass del async_llm # Confirm all the processes are cleaned up. 
From 9f672d8868bc8ab1e07e58195dcc96d15a94a9ef Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 31 Mar 2025 18:22:07 +0000 Subject: [PATCH 092/130] async fix Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 1d82becde41a..9852a0d59125 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -588,6 +588,7 @@ async def process_outputs_socket(): try: while True: (frame, ) = await output_socket.recv_multipart(copy=False) + self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, From 79c4e1933300f240fc362cdd3fe7ac0913acefa6 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 1 Apr 2025 18:23:24 +0000 Subject: [PATCH 093/130] remove strong refs Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 9852a0d59125..987518bf442f 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -292,7 +292,7 @@ class BackgroundResources: def __call__(self): """Clean up background resources.""" - + print("\n\n\nHYPERBANANA\n\n\n") for core_engine in self.core_engines: core_engine.close() @@ -578,7 +578,7 @@ def _ensure_output_queue_task(self): utility_results = self.utility_results outputs_queue = self.outputs_queue output_handler = self.outputs_handler - _self_ref = weakref.ref(self) if output_handler else None + _self_ref = weakref.ref(self) output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) @@ -586,9 +586,13 @@ def _ensure_output_queue_task(self): async def process_outputs_socket(): try: + _self = _self_ref() + if not _self: + # Client has been garbage collected, abort. 
+ return while True: (frame, ) = await output_socket.recv_multipart(copy=False) - self._validate_alive(frame.buffer) + _self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, @@ -606,7 +610,8 @@ async def process_outputs_socket(): if outputs.outputs or outputs.scheduler_stats: outputs_queue.put_nowait(outputs) except Exception as e: - self.outputs_queue.put_nowait(e) + if _self: + _self.outputs_queue.put_nowait(e) self.queue_task = asyncio.create_task(process_outputs_socket(), name="EngineCoreOutputQueueTask") From 07824d5bea9df15357f2bb5ba22fa29425fbabb0 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 1 Apr 2025 18:41:57 +0000 Subject: [PATCH 094/130] add back strong refs Signed-off-by: Andrew Feldman --- vllm/v1/engine/core_client.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 987518bf442f..9852a0d59125 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -292,7 +292,7 @@ class BackgroundResources: def __call__(self): """Clean up background resources.""" - print("\n\n\nHYPERBANANA\n\n\n") + for core_engine in self.core_engines: core_engine.close() @@ -578,7 +578,7 @@ def _ensure_output_queue_task(self): utility_results = self.utility_results outputs_queue = self.outputs_queue output_handler = self.outputs_handler - _self_ref = weakref.ref(self) + _self_ref = weakref.ref(self) if output_handler else None output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) @@ -586,13 +586,9 @@ def _ensure_output_queue_task(self): async def process_outputs_socket(): try: - _self = _self_ref() - if not _self: - # Client has been garbage collected, abort. 
- return while True: (frame, ) = await output_socket.recv_multipart(copy=False) - _self._validate_alive(frame.buffer) + self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, @@ -610,8 +606,7 @@ async def process_outputs_socket(): if outputs.outputs or outputs.scheduler_stats: outputs_queue.put_nowait(outputs) except Exception as e: - if _self: - _self.outputs_queue.put_nowait(e) + self.outputs_queue.put_nowait(e) self.queue_task = asyncio.create_task(process_outputs_socket(), name="EngineCoreOutputQueueTask") From 74d8e8fa95828b43553d0fa0380b15655a6f13bb Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 2 Apr 2025 13:54:04 +0000 Subject: [PATCH 095/130] removed async forward error test Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 67 +------------------------ 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index b8e7f76c32b3..5ec5bd1cee53 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 """Test that we handle an Error in model forward and shutdown.""" -import asyncio - import pytest from tests.utils import wait_for_gpu_memory_to_clear -from vllm import LLM, SamplingParams +from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank -from vllm.engine.arg_utils import AsyncEngineArgs from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.utils import cuda_device_count_stateless -from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError MODELS = ["meta-llama/Llama-3.2-1B"] @@ -32,67 +28,6 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) -@pytest.mark.asyncio -@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("model", MODELS) -async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, - model: str) -> None: - """Test that AsyncLLM propagates a forward pass error and frees memory. - - AsyncLLM always uses an MP client. - """ - - if cuda_device_count_stateless() < tensor_parallel_size: - pytest.skip(reason="Not enough CUDA devices") - - # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) - - engine_args = AsyncEngineArgs(model=model, - enforce_eager=True, - tensor_parallel_size=tensor_parallel_size) - async_llm = AsyncLLM.from_engine_args(engine_args) - - async def generate(request_id: str): - generator = async_llm.generate("Hello my name is", - request_id=request_id, - sampling_params=SamplingParams()) - try: - async for _ in generator: - pass - except Exception as e: - return e - - NUM_REQS = 3 - tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] - outputs = await asyncio.gather(*tasks) - - # Every request should get an EngineDeadError. - for output in outputs: - assert isinstance(output, EngineDeadError) - - # AsyncLLM should be errored. - assert async_llm.errored - - # We should not be able to make another request. - with pytest.raises(EngineDeadError): - async for _ in async_llm.generate("Hello my name is", - request_id="abc", - sampling_params=SamplingParams()): - raise Exception("We should not get here.") - - # Confirm all the processes are cleaned up. 
- wait_for_gpu_memory_to_clear( - devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - # NOTE: shutdown is handled by the API Server if an exception - # occurs, so it is expected that we would need to call this. - async_llm.shutdown() - - @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) From d66844fa391acf3b043b0402f55a8a8f3226e26d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 2 Apr 2025 14:07:27 +0000 Subject: [PATCH 096/130] removed sync delete dummy request Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 5f07984790c2..31bbabc1c232 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -60,10 +60,6 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - # llm.generate( - # "Hello my name is", - # sampling_params=SamplingParams( - # max_tokens=1)) del llm # Confirm all the processes are cleaned up. From 2ee74b6f7201951f94ea3a56c9b2959e63652da4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 4 Apr 2025 15:03:42 +0000 Subject: [PATCH 097/130] temporarily removed test case Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_startup_error.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 62f6c3186339..cc7087e10c79 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -23,7 +23,6 @@ def evil_forward(self, *args, **kwargs): MODELS = [ "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. - "mistralai/Mixtral-8x22B-Instruct-v0.1" # Causes OOM. ] From 86263dc5c5bd966a0251dcf436e04e5b6c6b0a9f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 4 Apr 2025 16:11:00 +0000 Subject: [PATCH 098/130] test load weights failure Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_startup_error.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index cc7087e10c79..a40823dbd45b 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -12,8 +12,8 @@ from vllm.v1.engine.async_llm import AsyncLLM -def evil_forward(self, *args, **kwargs): - """Evil forward method that raise an exception.""" +def evil_method(self, *args, **kwargs): + """Evil method that raises an exception.""" if get_tensor_model_parallel_rank() == 0: raise Exception("Simulated Error in startup!") @@ -28,17 +28,19 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +@pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) def test_async_llm_startup_error(monkeypatch, model: str, - tensor_parallel_size: int) -> None: + tensor_parallel_size: int, + failing_method: str) -> None: """Test that AsyncLLM propagates an __init__ error & frees memory. - + Test profiling (forward()) and load weights failures. AsyncLLM always uses an MP client. """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") # Monkeypatch an error in the model. 
- monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + monkeypatch.setattr(LlamaForCausalLM, failing_method, evil_method) engine_args = AsyncEngineArgs(model=model, enforce_eager=True, @@ -59,9 +61,12 @@ def test_async_llm_startup_error(monkeypatch, model: str, @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) +@pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, - enable_multiprocessing: bool) -> None: + enable_multiprocessing: bool, + failing_method: str) -> None: """Test that LLM propagates an __init__ error and frees memory. + Test profiling (forward()) and load weights failures. TODO(andy) - LLM without multiprocessing. """ if model != "meta-llama/Llama-3.2-1B": @@ -75,7 +80,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", MP_VALUE) # Monkeypatch an error in the model. - monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + monkeypatch.setattr(LlamaForCausalLM, failing_method, evil_method) with pytest.raises( Exception, From f824c154df143f7b88c2aed4e314cd7cb0345cd9 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Fri, 4 Apr 2025 12:14:22 -0400 Subject: [PATCH 099/130] Update vllm/v1/engine/exceptions.py Co-authored-by: Cyrus Leung --- vllm/v1/engine/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index aa8a1227420e..97dd31d5e521 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -8,7 +8,7 @@ class EngineDeadError(Exception): """Raised when the EngineCore dies. Unrecoverable.""" def __init__(self, *args, suppress_context: bool = False, **kwargs): - ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace for the root cause." # noqa: E501 + ENGINE_DEAD_MESSAGE = "EngineCore encountered an issue. See stack trace (above) for the root cause." # noqa: E501 super().__init__(ENGINE_DEAD_MESSAGE, *args, **kwargs) # Make stack trace clearer when using with LLMEngine by From 7dc02fa03da17d29e0353bcbb8979fd1b294aa94 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 8 Apr 2025 17:00:20 -0700 Subject: [PATCH 100/130] Post main-merge cleanup/fixes Signed-off-by: Nick Hill --- vllm/entrypoints/launcher.py | 2 +- vllm/v1/engine/async_llm.py | 24 +++++----- vllm/v1/engine/core.py | 70 +++++++++++++----------------- vllm/v1/engine/core_client.py | 68 +++++++++++++++-------------- vllm/v1/engine/output_processor.py | 29 ++++++++----- 5 files changed, 95 insertions(+), 98 deletions(-) diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index 1056ffec93c9..a4f70a51ebaf 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -103,7 +103,7 @@ def terminate_if_errored(server: uvicorn.Server, engine: EngineClient): for this request. 
""" engine_errored = engine.errored and not engine.is_running - if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored): + if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored: server.should_exit = True diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0db3d057d5b5..6e3a3c658ea8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,7 +26,7 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.core_client import AsyncMPClient +from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.output_processor import (OutputProcessor, RequestOutputCollector) @@ -96,7 +96,11 @@ def __init__( log_stats=self.log_stats) # EngineCore (starts the engine in background process). - self.engine_core = AsyncMPClient( + core_client_class = AsyncMPClient if ( + vllm_config.parallel_config.data_parallel_size + == 1) else DPAsyncMPClient + + self.engine_core = core_client_class( vllm_config=vllm_config, executor_class=executor_class, log_stats=self.log_stats, @@ -280,8 +284,6 @@ async def generate( # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() or await q.get() - if isinstance(out, Exception): - raise out # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. @@ -355,9 +357,8 @@ async def _run_output_handler(self): iteration_stats=iteration_stats, ) - except Exception as e: - logger.error("AsyncLLM output_handler got an Exception:", - exc_info=e) + except Exception: + logger.exception("AsyncLLM output_handler failed.") self.output_processor.propagate_error(EngineDeadError()) async def abort(self, request_id: str) -> None: @@ -463,11 +464,8 @@ async def pin_lora(self, lora_id: int) -> bool: @property def is_running(self) -> bool: - # Have not started the loop yet. - if self.output_handler is None: - return True - - return not self.output_handler.done() + # Is None before the loop is started. + return self.output_handler is None or not self.output_handler.done() @property def is_stopped(self) -> bool: @@ -475,7 +473,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return (self.engine_core.is_engine_dead or not self.is_running) + return self.engine_core.is_engine_dead or not self.is_running @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index dc75a17b4248..014b5431d39e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -317,32 +317,28 @@ def __init__( log_stats: bool, engine_index: int = 0, ): - try: - super().__init__(vllm_config, executor_class, log_stats) - - self.errored_sent_event = threading.Event() - self.step_fn = (self.step if self.batch_queue is None else - self.step_with_batch_queue) - self.global_unfinished_reqs = False - - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
- self.input_queue: queue.Queue[tuple[EngineCoreRequestType, - Any]] = queue.Queue() - self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() - threading.Thread(target=self.process_input_socket, - args=(input_path, engine_index), - daemon=True).start() - threading.Thread(target=self.process_output_socket, - args=(output_path, engine_index), - daemon=True).start() - - except Exception as e: - logger.exception("Error during EngineCore initialization.") - raise e + super().__init__(vllm_config, executor_class, log_stats) + + self.errored_sent_event = threading.Event() + self.step_fn = (self.step if self.batch_queue is None else + self.step_with_batch_queue) + self.global_unfinished_reqs = False + + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + self.input_queue: queue.Queue[tuple[EngineCoreRequestType, + Any]] = queue.Queue() + self.output_queue: queue.Queue[Union[EngineCoreOutputs, + bytes]] = queue.Queue() + threading.Thread(target=self.process_input_socket, + args=(input_path, engine_index), + daemon=True).start() + threading.Thread(target=self.process_output_socket, + args=(output_path, engine_index), + daemon=True).start() @staticmethod def run_engine_core(*args, @@ -370,7 +366,6 @@ def signal_handler(signum, frame): signal.signal(signal.SIGINT, signal_handler) engine_core: Optional[EngineCoreProc] = None - startup_failed = True try: parallel_config: ParallelConfig = kwargs[ "vllm_config"].parallel_config @@ -382,21 +377,18 @@ def signal_handler(signum, frame): else: engine_core = EngineCoreProc(*args, **kwargs) - startup_failed = False engine_core.run_busy_loop() except SystemExit: - logger.debug("EngineCore interrupted.") + logger.debug("EngineCore exiting.") + except Exception as e: - if startup_failed: - logger.exception("EngineCore got error at startup:", - exc_info=e) - # TODO need to send failure here - raise e + if engine_core is None: + logger.exception("EngineCore failed to start.") else: - assert engine_core is not None - logger.exception("EngineCore got an Exception:", exc_info=e) + logger.exception("EngineCore encountered a fatal error.") engine_core._send_engine_dead() + raise e finally: if engine_core is not None: engine_core.shutdown() @@ -486,11 +478,11 @@ def _convert_msgspec_args(method, args): def _send_engine_dead(self): """Send EngineDead status to the EngineCoreClient.""" - # Put ENGINE_CORE_DEAD to the front of the queue. + # Put ENGINE_CORE_DEAD in the queue. self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. - if not self.errored_sent_event.wait(timeout=10.): + if not self.errored_sent_event.wait(timeout=5.): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. 
Please report this issue.") @@ -536,7 +528,7 @@ def process_output_socket(self, output_path: str, engine_index: int): while True: outputs = self.output_queue.get() if outputs == EngineCoreProc.ENGINE_CORE_DEAD: - socket.send_multipart((outputs, ), copy=False) + socket.send(outputs, copy=False) break assert not isinstance(outputs, bytes) outputs.engine_index = engine_index diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 441be3b4e377..62bf7f9fe809 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -354,34 +354,38 @@ def __init__( self.ctx = zmq.asyncio.Context(sync_ctx) if asyncio_mode else sync_ctx # This will ensure resources created so far are closed - # when the client is garbage collected, even if an + # when the client is garbage collected, even if an # exception is raised mid-construction. - resources = BackgroundResources(ctx=sync_ctx) - self.resources = resources - self._finalizer = weakref.finalize(self, resources) - - # Paths and sockets for IPC. - self.output_path = get_open_zmq_ipc_path() - input_path = get_open_zmq_ipc_path() - self.input_socket = make_zmq_socket(self.ctx, - input_path, - zmq.ROUTER, - bind=True) - self.resources.input_socket = self.input_socket - - self.is_engine_dead = False - new_core_engine = lambda index, local_dp_rank=None: CoreEngine( - vllm_config, executor_class, log_stats, input_path, self. - output_path, index, local_dp_rank) - - # Start engine core process(es). - self._init_core_engines(vllm_config, new_core_engine, - self.resources.core_engines) - - # Wait for engine core process(es) to start. - self._wait_for_engine_startup() - - self.utility_results: dict[int, AnyFuture] = {} + self.resources = BackgroundResources(ctx=sync_ctx) + self._finalizer = weakref.finalize(self, self.resources) + success = False + try: + # Paths and sockets for IPC. + self.output_path = get_open_zmq_ipc_path() + input_path = get_open_zmq_ipc_path() + self.input_socket = make_zmq_socket(self.ctx, + input_path, + zmq.ROUTER, + bind=True) + self.resources.input_socket = self.input_socket + + self.is_engine_dead = False + new_core_engine = lambda index, local_dp_rank=None: CoreEngine( + vllm_config, executor_class, log_stats, input_path, self. + output_path, index, local_dp_rank) + + # Start engine core process(es). + self._init_core_engines(vllm_config, new_core_engine, + self.resources.core_engines) + + # Wait for engine core process(es) to start. + self._wait_for_engine_startup() + + self.utility_results: dict[int, AnyFuture] = {} + success = True + finally: + if not success: + self._finalizer() def _wait_for_engine_startup(self): # Get a sync handle to the socket which can be sync or async. @@ -429,7 +433,7 @@ def _init_core_engines( self.core_engine = core_engine def shutdown(self): - # Terminate background resources + # Terminate background resources. 
self._finalizer() def _validate_alive(self, buffer: Any): @@ -440,8 +444,8 @@ def _validate_alive(self, buffer: Any): def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" - return (EngineDeadError( - suppress_context=True) if self.is_engine_dead else e) + return EngineDeadError( + suppress_context=True) if self.is_engine_dead else e def _process_utility_output(output: UtilityOutput, @@ -485,7 +489,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], def process_outputs_socket(): shutdown_socket = ctx.socket(zmq.PAIR) out_socket = make_zmq_socket(ctx, output_path, zmq.constants.PULL) - local_self = None try: shutdown_socket.bind(shutdown_path) poller = zmq.Poller() @@ -631,7 +634,6 @@ def _ensure_output_queue_task(self): # Perform IO in separate task to parallelize as much as possible. # Avoid task having direct reference back to the client. - self.outputs_queue = asyncio.Queue() decoder = self.decoder utility_results = self.utility_results outputs_queue = self.outputs_queue @@ -645,7 +647,7 @@ def _ensure_output_queue_task(self): async def process_outputs_socket(): try: while True: - (frame, ) = await output_socket.recv_multipart(copy=False) + frame = await output_socket.recv(copy=False) self._validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 917a4c52afd3..21e2a1aee4e2 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -28,35 +28,40 @@ class RequestOutputCollector: def __init__(self, output_kind: RequestOutputKind): self.aggregate = output_kind == RequestOutputKind.DELTA - self.output: Optional[RequestOutput] = None + self.output: Optional[Union[RequestOutput, Exception]] = None self.ready = asyncio.Event() - def put(self, output: RequestOutput) -> None: - '''Non-blocking put operation''' - if self.output is None: + def put(self, output: Union[RequestOutput, Exception]) -> None: + """Non-blocking put operation.""" + if self.output is None or isinstance(output, Exception): self.output = output self.ready.set() - elif self.aggregate: - # Coalesce the outputs in delta case. - self.output.add(output) - else: - # Just replace latest in non-delta case. - self.output = output + elif isinstance(self.output, RequestOutput): + if self.aggregate: + # Coalesce the outputs in delta case. + self.output.add(output) + else: + # Just replace latest in non-delta case. 
+ self.output = output async def get(self) -> RequestOutput: - '''Get operation blocks on put event''' + """Get operation blocks on put event.""" while (output := self.output) is None: await self.ready.wait() self.output = None self.ready.clear() + if isinstance(output, Exception): + raise output return output def get_nowait(self) -> Optional[RequestOutput]: - '''Non-blocking get operation''' + """Non-blocking get operation.""" output = self.output if output is not None: self.output = None self.ready.clear() + if isinstance(output, Exception): + raise output return output From f1bce10c936e41fafbd0997dea6057f00a93812a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 8 Apr 2025 20:43:25 -0700 Subject: [PATCH 101/130] Some updates to MultiprocExecutor Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 135 ++++++++++++++----------- 1 file changed, 74 insertions(+), 61 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 3cab18d94042..a146cad8332b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 - +import multiprocessing import os import pickle import signal @@ -65,31 +65,33 @@ def _init_executor(self) -> None: # Create workers unready_workers: list[UnreadyWorkerProcHandle] = [] - for rank in range(self.world_size): - unready_worker = WorkerProc.make_worker_process( - vllm_config=self.vllm_config, - local_rank=rank, - rank=rank, - distributed_init_method=distributed_init_method, - input_shm_handle=scheduler_output_handle, - ) - unready_workers.append(unready_worker) - - # Workers must be created before wait_for_ready to avoid - # deadlock, since worker.init_device() does a device sync. - self.workers: list[WorkerProcHandle] = [] - for unready_worker in unready_workers: - # NOTE: the WorkerProc wraps startup in a try ... catch - # so if there are any issues in loading in a WorkerProcess - # (e.g. OOM), an Exception will be caught here. - worker = WorkerProc.wait_for_ready(unready_worker) - self.workers.append(worker) - - # Ensure message queues are ready. Will deadlock if re-ordered - # Must be kept consistent with the WorkerProc - self.rpc_broadcast_mq.wait_until_ready() - for w in self.workers: - w.worker_response_mq.wait_until_ready() + success = False + try: + for rank in range(self.world_size): + unready_worker = WorkerProc.make_worker_process( + vllm_config=self.vllm_config, + local_rank=rank, + rank=rank, + distributed_init_method=distributed_init_method, + input_shm_handle=scheduler_output_handle, + ) + unready_workers.append(unready_worker) + + # Workers must be created before wait_for_ready to avoid + # deadlock, since worker.init_device() does a device sync. + self.workers = WorkerProc.wait_for_ready(unready_workers) + + # Ensure message queues are ready. Will deadlock if re-ordered + # Must be kept consistent with the WorkerProc. + self.rpc_broadcast_mq.wait_until_ready() + for w in self.workers: + w.worker_response_mq.wait_until_ready() + success = True + finally: + if not success: + # Clean up the worker procs if there was a failure. 
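                # NOTE: `unready_workers` only contains processes that were
                # actually spawned before the failure, so this kills exactly
                # the partially started workers instead of leaking them.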
+ for handle in unready_workers: + handle.proc.kill() def collective_rpc(self, method: Union[str, Callable], @@ -178,7 +180,7 @@ class UnreadyWorkerProcHandle: """WorkerProcess handle before READY.""" proc: BaseProcess rank: int - ready_pipe: tuple[Connection, Connection] + ready_pipe: Connection @dataclass @@ -211,8 +213,10 @@ def __init__( rank: int, distributed_init_method: str, input_shm_handle: Handle, - ready_pipe: Connection, + ready_pipe: tuple[Connection, Connection], ): + reader, writer = ready_pipe + reader.close() try: self.rank = rank wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) @@ -244,21 +248,20 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send READY once we know everything is loaded - ready_pipe.send({ + writer.send({ "status": WorkerProc.READY_STR, - "handle": pickle.dumps(worker_response_mq_handle) + "handle": worker_response_mq_handle, }) # Initialize device and loads weights self.worker.init_device() self.worker.load_model() - except Exception as e: - logger.exception("WorkerProc got error at startup:", exc_info=e) - ready_pipe.send({"status": WorkerProc.FAILED_STR}) + except Exception: + logger.exception("WorkerProc startup failed.") finally: - ready_pipe.close() + writer.close() @staticmethod def make_worker_process( @@ -270,7 +273,7 @@ def make_worker_process( ) -> UnreadyWorkerProcHandle: context = get_mp_context() # (reader, writer) - pipe_tuple = context.Pipe(duplex=False) + reader, writer = context.Pipe(duplex=False) process_kwargs = { "vllm_config": vllm_config, @@ -278,7 +281,7 @@ def make_worker_process( "rank": rank, "distributed_init_method": distributed_init_method, "input_shm_handle": input_shm_handle, - "ready_pipe": pipe_tuple[1], + "ready_pipe": (reader, writer), } # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, @@ -286,37 +289,47 @@ def make_worker_process( daemon=True) proc.start() - return UnreadyWorkerProcHandle(proc, rank, pipe_tuple) + writer.close() + return UnreadyWorkerProcHandle(proc, rank, reader) @staticmethod def wait_for_ready( - unready_proc_handle: UnreadyWorkerProcHandle) -> WorkerProcHandle: + unready_proc_handles: list[UnreadyWorkerProcHandle] + ) -> list[WorkerProcHandle]: e = Exception("WorkerProc initialization failed due to " "an exception in a background process. " "See stack trace for root cause.") - ready_pipe = unready_proc_handle.ready_pipe[0] - try: - # Wait until the WorkerProc is ready. - response = ready_pipe.recv() - if response["status"] != "READY": - raise e - - # Extract the message queue handle. - mq_handle = pickle.loads(response["handle"]) - worker_response_mq = MessageQueue.create_from_handle(mq_handle, 0) - return WorkerProcHandle.from_unready_handle( - unready_proc_handle, worker_response_mq) - - except EOFError: - e.__suppress_context__ = True - raise e from None - - finally: - # Close connection. - unready_proc_handle.ready_pipe[0].close() - unready_proc_handle.ready_pipe[1].close() + pipes = {handle.ready_pipe: handle for handle in unready_proc_handles} + ready_proc_handles = [] + while pipes: + ready = multiprocessing.connection.wait(pipes.keys()) + for pipe in ready: + assert isinstance(pipe, Connection) + try: + # Wait until the WorkerProc is ready. + unready_proc_handle = pipes.pop(pipe) + response: dict[str, Any] = pipe.recv() + if response["status"] != "READY": + raise e + + # Extract the message queue handle. 
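                    # NOTE: multiprocessing.Connection.send() already pickles
                    # its argument, so the handle arrives ready to use and the
                    # previous explicit pickle.dumps()/pickle.loads() round
                    # trip is no longer needed.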
+ worker_response_mq = MessageQueue.create_from_handle( + response["handle"], 0) + ready_proc_handles.append( + WorkerProcHandle.from_unready_handle( + unready_proc_handle, worker_response_mq)) + + except EOFError: + e.__suppress_context__ = True + raise e from None + + finally: + # Close connection. + pipe.close() + + return ready_proc_handles def shutdown(self): self.rpc_broadcast_mq = None @@ -355,13 +368,13 @@ def signal_handler(signum, frame): worker.worker_busy_loop() - except Exception as e: + except Exception: # NOTE: if an Exception arises in busy_loop, we send # a FAILURE message over the MQ RPC to notify the Executor, # which triggers system shutdown. # TODO(rob): handle case where the MQ itself breaks. - logger.exception("WorkerProc got an Exception:", exc_info=e) + logger.exception("WorkerProc got an Exception.") # The parent sends a SIGTERM to all worker processes if # any worker dies. Set this value so we don't re-throw From d014a6bc0920b030550199b2ba2a2315ac76fb69 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 9 Apr 2025 13:39:18 -0700 Subject: [PATCH 102/130] More multiproc_executor.py streamlining Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 120 ++++++++++++------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index a146cad8332b..772d4aeb54c4 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -68,14 +68,14 @@ def _init_executor(self) -> None: success = False try: for rank in range(self.world_size): - unready_worker = WorkerProc.make_worker_process( - vllm_config=self.vllm_config, - local_rank=rank, - rank=rank, - distributed_init_method=distributed_init_method, - input_shm_handle=scheduler_output_handle, - ) - unready_workers.append(unready_worker) + unready_workers.append( + WorkerProc.make_worker_process( + vllm_config=self.vllm_config, + local_rank=rank, + rank=rank, + distributed_init_method=distributed_init_method, + input_shm_handle=scheduler_output_handle, + )) # Workers must be created before wait_for_ready to avoid # deadlock, since worker.init_device() does a device sync. @@ -90,8 +90,8 @@ def _init_executor(self) -> None: finally: if not success: # Clean up the worker procs if there was a failure. - for handle in unready_workers: - handle.proc.kill() + self._ensure_worker_termination( + [w.proc for w in unready_workers]) def collective_rpc(self, method: Union[str, Callable], @@ -133,7 +133,8 @@ def collective_rpc(self, # Re-raise any other exceptions raise e - def _ensure_worker_termination(self): + @staticmethod + def _ensure_worker_termination(worker_procs: list[BaseProcess]): """Ensure that all worker processes are terminated. Assumes workers have received termination requests. 
Waits for processing, then sends termination and kill signals if needed.""" @@ -151,7 +152,7 @@ def wait_for_termination(procs, timeout): return False # Send SIGTERM if still running - active_procs = [w.proc for w in self.workers if w.proc.is_alive()] + active_procs = [proc for proc in worker_procs if proc.is_alive()] for p in active_procs: p.terminate() if not wait_for_termination(active_procs, 4): @@ -166,7 +167,7 @@ def shutdown(self): self.shutting_down = True for w in self.workers: w.worker_response_mq = None - self._ensure_worker_termination() + self._ensure_worker_termination([w.proc for w in self.workers]) self.rpc_broadcast_mq = None @@ -204,7 +205,6 @@ class WorkerProc: """Wrapper that runs one Worker in a separate process.""" READY_STR = "READY" - FAILED_STR = "FAILED" def __init__( self, @@ -213,55 +213,38 @@ def __init__( rank: int, distributed_init_method: str, input_shm_handle: Handle, - ready_pipe: tuple[Connection, Connection], ): - reader, writer = ready_pipe - reader.close() - try: - self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) - # TODO: move `init_worker` to executor level as a collective rpc - # call - all_kwargs: list[dict] = [ - {} for _ in range(vllm_config.parallel_config.world_size) - ] - all_kwargs[rank] = { - "vllm_config": vllm_config, - "local_rank": local_rank, - "rank": rank, - "distributed_init_method": distributed_init_method, - "is_driver_worker": rank == 0, - } - wrapper.init_worker(all_kwargs) - self.worker = wrapper - - pid = os.getpid() - _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) - _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) - - # Initialize MessageQueue for receiving SchedulerOutput - self.rpc_broadcast_mq = MessageQueue.create_from_handle( - input_shm_handle, self.worker.rank) - - # Initializes a message queue for sending the model output - self.worker_response_mq = MessageQueue(1, 1) - worker_response_mq_handle = self.worker_response_mq.export_handle() + self.rank = rank + wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) + # TODO: move `init_worker` to executor level as a collective rpc + # call + all_kwargs: list[dict] = [ + {} for _ in range(vllm_config.parallel_config.world_size) + ] + all_kwargs[rank] = { + "vllm_config": vllm_config, + "local_rank": local_rank, + "rank": rank, + "distributed_init_method": distributed_init_method, + "is_driver_worker": rank == 0, + } + wrapper.init_worker(all_kwargs) + self.worker = wrapper - # Send READY once we know everything is loaded - writer.send({ - "status": WorkerProc.READY_STR, - "handle": worker_response_mq_handle, - }) + pid = os.getpid() + _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) + _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) - # Initialize device and loads weights - self.worker.init_device() - self.worker.load_model() + # Initialize MessageQueue for receiving SchedulerOutput + self.rpc_broadcast_mq = MessageQueue.create_from_handle( + input_shm_handle, self.worker.rank) - except Exception: - logger.exception("WorkerProc startup failed.") + # Initializes a message queue for sending the model output + self.worker_response_mq = MessageQueue(1, 1) - finally: - writer.close() + # Initialize device and loads weights + self.worker.init_device() + self.worker.load_model() @staticmethod def make_worker_process( @@ -358,13 +341,26 @@ def signal_handler(signum, frame): signal.signal(signal.SIGINT, signal_handler) worker = None + # tuple[Connection, Connection] + reader, ready_writer = 
kwargs.pop("ready_pipe") try: + reader.close() worker = WorkerProc(*args, **kwargs) + # Send READY once we know everything is loaded + ready_writer.send({ + "status": + WorkerProc.READY_STR, + "handle": + worker.worker_response_mq.export_handle(), + }) + # Ensure message queues are ready. Will deadlock if re-ordered. # Must be kept consistent with the Executor worker.rpc_broadcast_mq.wait_until_ready() worker.worker_response_mq.wait_until_ready() + ready_writer.close() + ready_writer = None worker.worker_busy_loop() @@ -374,7 +370,10 @@ def signal_handler(signum, frame): # which triggers system shutdown. # TODO(rob): handle case where the MQ itself breaks. - logger.exception("WorkerProc got an Exception.") + if ready_writer is not None: + logger.exception("WorkerProc failed to start.") + else: + logger.exception("WorkerProc failed.") # The parent sends a SIGTERM to all worker processes if # any worker dies. Set this value so we don't re-throw @@ -382,10 +381,11 @@ def signal_handler(signum, frame): shutdown_requested = True finally: + if ready_writer is not None: + ready_writer.close() # Clean up once worker exits busy loop if worker is not None: worker.shutdown() - worker = None class ResponseStatus(Enum): SUCCESS = auto() From c9941da5455db3bda8190d0ac46b6458ac57bd6c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 9 Apr 2025 15:56:19 -0700 Subject: [PATCH 103/130] core_client.py streamlining Signed-off-by: Nick Hill --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core_client.py | 81 ++++++++++++-------------- vllm/v1/executor/multiproc_executor.py | 1 + 3 files changed, 40 insertions(+), 44 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6e3a3c658ea8..82a7fc9abf4b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -473,7 +473,7 @@ def is_stopped(self) -> bool: @property def errored(self) -> bool: - return self.engine_core.is_engine_dead or not self.is_running + return self.engine_core.resources.engine_dead or not self.is_running @property def dead_error(self) -> BaseException: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 62bf7f9fe809..631467705f2d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -304,6 +304,10 @@ class BackgroundResources: input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None shutdown_path: Optional[str] = None + # Set if any of the engines are dead. Here so that the output + # processing threads can access it without holding a ref to the client. + engine_dead: bool = False + def __call__(self): """Clean up background resources.""" @@ -324,6 +328,11 @@ def __call__(self): # Send shutdown signal. shutdown_sender.send(b'') + def validate_alive(self, buffer: Any): + if buffer == EngineCoreProc.ENGINE_CORE_DEAD: + self.engine_dead = True + raise EngineDeadError() + class MPClient(EngineCoreClient): """ @@ -369,7 +378,6 @@ def __init__( bind=True) self.resources.input_socket = self.input_socket - self.is_engine_dead = False new_core_engine = lambda index, local_dp_rank=None: CoreEngine( vllm_config, executor_class, log_stats, input_path, self. output_path, index, local_dp_rank) @@ -436,16 +444,14 @@ def shutdown(self): # Terminate background resources. 
self._finalizer() - def _validate_alive(self, buffer: Any): - if buffer == EngineCoreProc.ENGINE_CORE_DEAD: - self.is_engine_dead = True - raise EngineDeadError() - def _format_exception(self, e: Exception) -> Exception: """If errored, use EngineDeadError so root cause is clear.""" - return EngineDeadError( - suppress_context=True) if self.is_engine_dead else e + suppress_context=True) if self.resources.engine_dead else e + + def ensure_alive(self): + if self.resources.engine_dead: + raise EngineDeadError() def _process_utility_output(output: UtilityOutput, @@ -482,9 +488,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], outputs_queue = self.outputs_queue shutdown_path = get_open_zmq_inproc_path() - self.resources.shutdown_path = shutdown_path - - self_weakref = weakref.ref(self) + resources = self.resources + resources.shutdown_path = shutdown_path def process_outputs_socket(): shutdown_socket = ctx.socket(zmq.PAIR) @@ -501,21 +506,16 @@ def process_outputs_socket(): if len(socks) == 2 or socks[0][0] == shutdown_socket: # shutdown signal, exit thread. break - local_self = self_weakref() - if local_self is None: - # Instance is being gc'd, exit loop - break - try: - frame = out_socket.recv(copy=False) - local_self._validate_alive(frame.buffer) - outputs = decoder.decode(frame.buffer) - except Exception as e: - local_self.outputs_queue.put_nowait(e) + frame = out_socket.recv(copy=False) + resources.validate_alive(frame.buffer) + outputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) else: outputs_queue.put_nowait(outputs) + except Exception as e: + outputs_queue.put_nowait(e) finally: # Close sockets. shutdown_socket.close(linger=0) @@ -537,13 +537,11 @@ def get_output(self) -> EngineCoreOutputs: return outputs def _send_input(self, request_type: EngineCoreRequestType, request: Any): - try: - # (Identity, RequestType, SerializedRequest) - msg = (self.core_engine.identity, request_type.value, - self.encoder.encode(request)) - self.input_socket.send_multipart(msg, copy=False) - except Exception as e: - raise self._format_exception(e) from None + self.ensure_alive() + # (Identity, RequestType, SerializedRequest) + msg = (self.core_engine.identity, request_type.value, + self.encoder.encode(request)) + self.input_socket.send_multipart(msg, copy=False) def call_utility(self, method: str, *args) -> Any: call_id = uuid.uuid1().int >> 64 @@ -642,13 +640,14 @@ def _ensure_output_queue_task(self): output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) - self.resources.output_socket = output_socket + resources = self.resources + resources.output_socket = output_socket async def process_outputs_socket(): try: while True: frame = await output_socket.recv(copy=False) - self._validate_alive(frame.buffer) + resources.validate_alive(frame.buffer) outputs: EngineCoreOutputs = decoder.decode(frame.buffer) if outputs.utility_output: _process_utility_output(outputs.utility_output, @@ -666,7 +665,7 @@ async def process_outputs_socket(): if outputs.outputs or outputs.scheduler_stats: outputs_queue.put_nowait(outputs) except Exception as e: - self.outputs_queue.put_nowait(e) + outputs_queue.put_nowait(e) self.queue_task = asyncio.create_task(process_outputs_socket(), name="EngineCoreOutputQueueTask") @@ -691,22 +690,18 @@ def _send_input(self, request_type: EngineCoreRequestType, request: Any, engine: Optional[CoreEngine] = None) -> Awaitable[None]: - try: - 
if engine is None: - engine = self.core_engine + self.ensure_alive() + if engine is None: + engine = self.core_engine - message = (request_type.value, self.encoder.encode(request)) - return self._send_input_message(message, engine) - except Exception as e: - raise self._format_exception(e) from None + message = (request_type.value, self.encoder.encode(request)) + return self._send_input_message(message, engine) def _send_input_message(self, message: tuple[bytes, bytes], engine: CoreEngine) -> Awaitable[None]: - try: - message = (engine.identity, ) + message # type: ignore[assignment] - return self.input_socket.send_multipart(message, copy=False) - except Exception as e: - raise self._format_exception(e) from None + self.ensure_alive() + message = (engine.identity, ) + message # type: ignore[assignment] + return self.input_socket.send_multipart(message, copy=False) async def call_utility_async(self, method: str, *args) -> Any: return await self._call_utility_async(method, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 772d4aeb54c4..64ae03587001 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -269,6 +269,7 @@ def make_worker_process( # Run EngineCore busy loop in background process. proc = context.Process(target=WorkerProc.worker_main, kwargs=process_kwargs, + name=f"VllmWorker-{rank}", daemon=True) proc.start() From 72740ca2c04e9f53e71857c271f8fe0a1abd4c5d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 00:04:53 +0000 Subject: [PATCH 104/130] timeout Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 3 +++ tests/v1/shutdown/test_forward_error.py | 2 ++ tests/v1/shutdown/test_processor_error.py | 2 ++ tests/v1/shutdown/test_startup_error.py | 3 +++ tests/v1/shutdown/util.py | 4 ++++ 5 files changed, 14 insertions(+) create mode 100644 tests/v1/shutdown/util.py diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 31bbabc1c232..639ecf0c5727 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,6 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import cuda_device_count_stateless @@ -14,6 +15,7 @@ ] +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: @@ -40,6 +42,7 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: ) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 5ec5bd1cee53..9851524724fa 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -4,6 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM @@ -28,6 +29,7 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) 
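The shutdown tests in this series share one recipe: monkeypatch a single method
to raise, drive a request through the engine, and assert that the failure
surfaces and memory is reclaimed, now under a hard per-test timeout. A
stripped-down, self-contained sketch of that recipe, with illustrative names
only (not part of the patch; the timeout marker requires the pytest-timeout
plugin):

import pytest


class FakeEngine:
    def forward(self):
        return "ok"


def evil_forward(self):
    raise RuntimeError("simulated forward error")


@pytest.mark.timeout(5)  # enforced by the pytest-timeout plugin
def test_forward_error_propagates(monkeypatch):
    monkeypatch.setattr(FakeEngine, "forward", evil_forward)
    with pytest.raises(RuntimeError, match="simulated forward error"):
        FakeEngine().forward()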
+@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index eb1341cbddf3..84328500202b 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -5,6 +5,7 @@ import pytest +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt @@ -16,6 +17,7 @@ @pytest.mark.asyncio +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) async def test_async_llm_processor_error(model: str) -> None: """Test that AsyncLLM propagates a processor error. diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index a40823dbd45b..62b14cd5f66f 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -4,6 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear +from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs @@ -26,6 +27,7 @@ def evil_method(self, *args, **kwargs): ] +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) @@ -58,6 +60,7 @@ def test_async_llm_startup_error(monkeypatch, model: str, ) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) diff --git a/tests/v1/shutdown/util.py b/tests/v1/shutdown/util.py new file mode 100644 index 000000000000..ab6111cde348 --- /dev/null +++ b/tests/v1/shutdown/util.py @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Shutdown test utils""" + +SHUTDOWN_TEST_TIMEOUT = 60 From 1a76f3655c7931d2bf5e89fcac59160ec6a876be Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 00:33:59 +0000 Subject: [PATCH 105/130] refactor Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 2 +- tests/v1/shutdown/test_forward_error.py | 7 ++++--- tests/v1/shutdown/test_processor_error.py | 2 +- tests/v1/shutdown/test_startup_error.py | 2 +- tests/v1/shutdown/{util.py => utils.py} | 0 5 files changed, 7 insertions(+), 6 deletions(-) rename tests/v1/shutdown/{util.py => utils.py} (100%) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 639ecf0c5727..c01447b2e137 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,7 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import cuda_device_count_stateless diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 9851524724fa..e9b6022774f6 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -4,7 +4,7 @@ import pytest from tests.utils 
import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM @@ -31,12 +31,13 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) @pytest.mark.parametrize("enable_multiprocessing", [True]) -@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) +@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("model", MODELS) def test_llm_model_error(monkeypatch, tensor_parallel_size: int, enable_multiprocessing: bool, model: str) -> None: """Test that LLM propagates a forward pass error and frees memory. - TODO(andy) - LLM without multiprocessing. + TODO(andy) - LLM without multiprocessing; LLM with multiprocessing + and >1 rank """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 84328500202b..78221c0f173a 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -5,7 +5,7 @@ import pytest -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 62b14cd5f66f..c7abf03bc9a6 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -4,7 +4,7 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.util import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs diff --git a/tests/v1/shutdown/util.py b/tests/v1/shutdown/utils.py similarity index 100% rename from tests/v1/shutdown/util.py rename to tests/v1/shutdown/utils.py From 5bde29d1c4fd6969d014e6f01c705e1887395618 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 00:42:32 +0000 Subject: [PATCH 106/130] refactor Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 17 +++++++---------- tests/v1/shutdown/test_forward_error.py | 8 ++++---- tests/v1/shutdown/test_processor_error.py | 4 ++-- tests/v1/shutdown/test_startup_error.py | 20 ++++++++------------ tests/v1/shutdown/utils.py | 3 ++- 5 files changed, 23 insertions(+), 29 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index c01447b2e137..e11f267f15d9 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -4,18 +4,17 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, + SHUTDOWN_TEST_TIMEOUT_SEC) from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM -MODELS = [ - "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. 
-] +MODELS = ["meta-llama/Llama-3.2-1B"] -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: @@ -37,12 +36,11 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) @@ -68,6 +66,5 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index e9b6022774f6..04e585046461 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -4,7 +4,8 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, + SHUTDOWN_TEST_TIMEOUT_SEC) from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM @@ -29,7 +30,7 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("model", MODELS) @@ -61,6 +62,5 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 78221c0f173a..0fe48da475c6 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -5,7 +5,7 @@ import pytest -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs.data import TokensPrompt @@ -17,7 +17,7 @@ @pytest.mark.asyncio -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) async def test_async_llm_processor_error(model: str) -> None: """Test that AsyncLLM propagates a processor error. 
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index c7abf03bc9a6..1bba19102ec6 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -4,7 +4,8 @@ import pytest from tests.utils import wait_for_gpu_memory_to_clear -from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT +from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, + SHUTDOWN_TEST_TIMEOUT_SEC) from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs @@ -12,6 +13,8 @@ from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM +MODELS = ["meta-llama/Llama-3.2-1B"] + def evil_method(self, *args, **kwargs): """Evil method that raises an exception.""" @@ -22,12 +25,7 @@ def evil_method(self, *args, **kwargs): return self.model(*args, **kwargs, intermediate_tensors=None) -MODELS = [ - "meta-llama/Llama-3.2-1B", # Raises on first fwd pass. -] - - -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("failing_method", ["forward", "load_weights"]) @@ -55,12 +53,11 @@ def test_async_llm_startup_error(monkeypatch, model: str, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) -@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT) +@pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) @@ -96,6 +93,5 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int, # Confirm all the processes are cleaned up. wait_for_gpu_memory_to_clear( devices=list(range(tensor_parallel_size)), - threshold_bytes=2 * 2**30, - timeout_s=60, + threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES, ) diff --git a/tests/v1/shutdown/utils.py b/tests/v1/shutdown/utils.py index ab6111cde348..8f7c0380d407 100644 --- a/tests/v1/shutdown/utils.py +++ b/tests/v1/shutdown/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 """Shutdown test utils""" -SHUTDOWN_TEST_TIMEOUT = 60 +SHUTDOWN_TEST_TIMEOUT_SEC = 120 +SHUTDOWN_TEST_THRESHOLD_BYTES = 2 * 2**30 From 1a0a217207593133bac1def2efa95eb47d1fbca5 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 9 Apr 2025 18:23:23 -0700 Subject: [PATCH 107/130] Process monitor for TP workers Signed-off-by: Nick Hill --- vllm/v1/engine/__init__.py | 2 + vllm/v1/engine/async_llm.py | 11 +++++- vllm/v1/engine/core.py | 51 +++++++++++++++++--------- vllm/v1/engine/core_client.py | 19 ++++++++-- vllm/v1/executor/abstract.py | 9 ++++- vllm/v1/executor/multiproc_executor.py | 46 +++++++++++++++++++++-- 6 files changed, 110 insertions(+), 28 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 1264e43c79d9..af4122a51077 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -156,3 +156,5 @@ class EngineCoreRequestType(enum.Enum): ABORT = b'\x01' START_DP = b'\x02' UTILITY = b'\x03' + # Sentinel used within EngineCoreProc. 
+ EXECUTOR_FAILED = b'\x04' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 82a7fc9abf4b..44ead7b22cfa 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -107,6 +107,13 @@ def __init__( ) self.output_handler: Optional[asyncio.Task] = None + try: + # Start output handler eagerly if we are in the asyncio eventloop. + asyncio.get_running_loop() + self.output_handler = asyncio.create_task( + self._run_output_handler()) + except RuntimeError: + pass @classmethod def from_vllm_config( @@ -357,9 +364,9 @@ async def _run_output_handler(self): iteration_stats=iteration_stats, ) - except Exception: + except Exception as e: logger.exception("AsyncLLM output_handler failed.") - self.output_processor.propagate_error(EngineDeadError()) + self.output_processor.propagate_error(e) async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 4c92cd8d86e1..8b2b2311d199 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -48,12 +48,11 @@ class EngineCore: """Inner loop of vLLM's Engine.""" - def __init__( - self, - vllm_config: VllmConfig, - executor_class: type[Executor], - log_stats: bool, - ): + def __init__(self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + executor_fail_callback: Optional[Callable] = None): assert vllm_config.model_config.runner_type != "pooling" logger.info("Initializing a V1 LLM engine (v%s) with config: %s", @@ -63,6 +62,9 @@ def __init__( # Setup Model. self.model_executor = executor_class(vllm_config) + if executor_fail_callback is not None: + self.model_executor.register_failure_callback( + executor_fail_callback) # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks, kv_cache_config = \ @@ -317,9 +319,15 @@ def __init__( log_stats: bool, engine_index: int = 0, ): - super().__init__(vllm_config, executor_class, log_stats) + input_queue: queue.Queue[tuple[EngineCoreRequestType, + Any]] = queue.Queue() + + executor_fail_callback = lambda: input_queue.put_nowait( + (EngineCoreRequestType.EXECUTOR_FAILED, b'')) + + super().__init__(vllm_config, executor_class, log_stats, + executor_fail_callback) - self.errored_sent_event = threading.Event() self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) self.global_unfinished_reqs = False @@ -329,16 +337,17 @@ def __init__( # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
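        # NOTE: the executor failure callback defined above simply enqueues an
        # EXECUTOR_FAILED sentinel onto this same input queue, so worker-death
        # notifications are handled on the busy loop thread like any other
        # client request (see _handle_client_request below).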
- self.input_queue: queue.Queue[tuple[EngineCoreRequestType, - Any]] = queue.Queue() + self.input_queue = input_queue self.output_queue: queue.Queue[Union[EngineCoreOutputs, bytes]] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, engine_index), daemon=True).start() - threading.Thread(target=self.process_output_socket, - args=(output_path, engine_index), - daemon=True).start() + self.output_thread = threading.Thread( + target=self.process_output_socket, + args=(output_path, engine_index), + daemon=True) + self.output_thread.start() @staticmethod def run_engine_core(*args, @@ -460,6 +469,11 @@ def _handle_client_request(self, request_type: EngineCoreRequestType, f" failed: {str(e)}") self.output_queue.put_nowait( EngineCoreOutputs(utility_output=output)) + elif request_type == EngineCoreRequestType.EXECUTOR_FAILED: + raise RuntimeError("Executor failed.") + else: + logger.error("Unrecognized input request type encountered: %s", + request_type) @staticmethod def _convert_msgspec_args(method, args): @@ -482,7 +496,8 @@ def _send_engine_dead(self): self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) # Wait until msg sent by the daemon before shutdown. - if not self.errored_sent_event.wait(timeout=5.): + self.output_thread.join(timeout=5.0) + if self.output_thread.is_alive(): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. Please report this issue.") @@ -524,7 +539,10 @@ def process_output_socket(self, output_path: str, engine_index: int): # Reuse send buffer. buffer = bytearray() - with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: + # We must set linger to ensure the ENGINE_CORE_DEAD + # message is sent prior to closing the socket. + with zmq_socket_ctx(output_path, zmq.constants.PUSH, + linger=4000) as socket: while True: outputs = self.output_queue.get() if outputs == EngineCoreProc.ENGINE_CORE_DEAD: @@ -535,9 +553,6 @@ def process_output_socket(self, output_path: str, engine_index: int): encoder.encode_into(outputs, buffer) socket.send(buffer, copy=False) - # Signal to main thread that ENGINE_CORE_DEAD was sent. - self.errored_sent_event.set() - ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 631467705f2d..342cbf5d47e4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -626,6 +626,16 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_handler: Optional[Callable[ [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None + try: + # If we are running in an asyncio event loop, start the queue task. + # Otherwise, it will be started lazily. If it is not started here, + # we could miss EXECUTOR_FAILED messages from engine core if they + # occur prior to any requests being sent. 
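            # NOTE: asyncio.get_running_loop() raises RuntimeError when no
            # event loop is running in the current thread, which is what
            # selects the lazy path in the except branch below.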
+ asyncio.get_running_loop() + self._ensure_output_queue_task() + except RuntimeError: + pass + def _ensure_output_queue_task(self): if self.queue_task is not None: return @@ -648,7 +658,7 @@ async def process_outputs_socket(): while True: frame = await output_socket.recv(copy=False) resources.validate_alive(frame.buffer) - outputs: EngineCoreOutputs = decoder.decode(frame.buffer) + outputs: EngineCoreOutputs = decoder.decode(frame) if outputs.utility_output: _process_utility_output(outputs.utility_output, utility_results) @@ -783,9 +793,6 @@ class DPAsyncMPClient(AsyncMPClient): def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): - super().__init__(vllm_config, executor_class, log_stats) - - assert len(self.core_engines) > 1 # Control message used for triggering dp idle mode loop. self.start_dp_msg = (EngineCoreRequestType.START_DP.value, @@ -796,6 +803,10 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_handler = DPAsyncMPClient.process_engine_outputs # type: ignore[assignment] + super().__init__(vllm_config, executor_class, log_stats) + + assert len(self.core_engines) > 1 + def _init_core_engines( self, vllm_config: VllmConfig, diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index e3a4cd98c1f8..dae6ca613080 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from concurrent.futures import Future -from typing import Union +from typing import Callable, Union import torch import torch.distributed as dist @@ -62,6 +62,13 @@ def initialize_from_config(self, args=(kv_cache_configs, )) self.collective_rpc("compile_or_warm_up_model") + def register_failure_callback(self, callback: Callable): + """ + Register a function to be called if the executor enters a permanent + failed state. + """ + pass + def determine_available_memory(self) -> list[int]: # in bytes output = self.collective_rpc("determine_available_memory") return output diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 64ae03587001..3734d431292c 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -12,6 +12,7 @@ from functools import partial from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess +from threading import Thread from typing import Any, Callable, Optional, Union import cloudpickle @@ -41,6 +42,8 @@ def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) + self.is_failed = False + self.failure_callback: Optional[Callable] = None self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -86,6 +89,8 @@ def _init_executor(self) -> None: self.rpc_broadcast_mq.wait_until_ready() for w in self.workers: w.worker_response_mq.wait_until_ready() + + self.start_worker_monitor() success = True finally: if not success: @@ -93,6 +98,41 @@ def _init_executor(self) -> None: self._ensure_worker_termination( [w.proc for w in unready_workers]) + def start_worker_monitor(self): + workers = self.workers + self_ref = weakref.ref(self) + + # Monitors worker process liveness. If any die unexpectedly, + # logs an error, shuts down the executor and invokes the failure + # callback to inform the engine. 
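        # NOTE: each multiprocessing.Process exposes a `sentinel` handle that
        # becomes ready when the process exits, and
        # multiprocessing.connection.wait() blocks until at least one of the
        # given sentinels is ready, so this thread sleeps until a worker dies.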
+ def monitor_workers(): + sentinels = [h.proc.sentinel for h in workers] + died = multiprocessing.connection.wait(sentinels) + _self = self_ref() + if not _self or getattr(_self, 'shutting_down', False): + return + _self.is_failed = True + proc_name = next(h.proc.name for h in workers + if h.proc.sentinel == died[0]) + logger.error( + "Worker proc %s died unexpectedly, " + "shutting down executor.", proc_name) + _self.shutdown() + callback = _self.failure_callback + if callback is not None: + _self.failure_callback = None + callback() + + Thread(target=monitor_workers, + daemon=True, + name="MultiprocWorkerMonitor").start() + + def register_failure_callback(self, callback: Callable): + if self.is_failed: + callback() + else: + self.failure_callback = callback + def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, @@ -101,6 +141,9 @@ def collective_rpc(self, start_time = time.monotonic() kwargs = kwargs or {} + if self.is_failed: + raise RuntimeError("Executor failed.") + # NOTE: If the args are heterogeneous, then we pack them into a list, # and unpack them in the method of every worker, because every worker # knows their own rank. @@ -129,9 +172,6 @@ def collective_rpc(self, return responses except TimeoutError as e: raise TimeoutError(f"RPC call to {method} timed out.") from e - except Exception as e: - # Re-raise any other exceptions - raise e @staticmethod def _ensure_worker_termination(worker_procs: list[BaseProcess]): From 1abcac38c9ab10da8f0b0e537d4007176df17022 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 15:00:10 +0000 Subject: [PATCH 108/130] ValueError exception Signed-off-by: Andrew Feldman --- vllm/v1/engine/async_llm.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 44ead7b22cfa..84be8e83b8d2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -311,6 +311,12 @@ async def generate( logger.info("Request %s failed.", request_id) raise + # Bad request discovered in generate() + except ValueError: + if self.log_requests: + logger.info("Request %s failed.", request_id) + raise + # Error in the generate() task (possibly recoverable). 
except Exception as e: await self.abort(request_id) From 1a4b6a097e2bca986a5a48ade70f80a2723b55cd Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 15:18:01 +0000 Subject: [PATCH 109/130] added llm 2-rank forward error test back Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 04e585046461..7340065b4a4f 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -32,7 +32,7 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("enable_multiprocessing", [True]) -@pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) def test_llm_model_error(monkeypatch, tensor_parallel_size: int, enable_multiprocessing: bool, model: str) -> None: From 863aa08babdaec0df37d89be156a79d09734b88c Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 15:41:42 +0000 Subject: [PATCH 110/130] added back async test Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_forward_error.py | 66 ++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 7340065b4a4f..4d0c72771908 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,15 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 """Test that we handle an Error in model forward and shutdown.""" +import asyncio + import pytest from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, SHUTDOWN_TEST_TIMEOUT_SEC) -from vllm import LLM +from vllm import LLM, AsyncEngineArgs, SamplingParams from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.utils import cuda_device_count_stateless +from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError MODELS = ["meta-llama/Llama-3.2-1B"] @@ -30,6 +33,67 @@ def evil_forward(self, *args, **kwargs): return self.model(*args, **kwargs) +@pytest.mark.asyncio +@pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("model", MODELS) +async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, + model: str) -> None: + """Test that AsyncLLM propagates a forward pass error and frees memory. + + AsyncLLM always uses an MP client. + """ + + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip(reason="Not enough CUDA devices") + + # Monkeypatch an error in the model. + monkeypatch.setattr(LlamaForCausalLM, "forward", evil_forward) + + engine_args = AsyncEngineArgs(model=model, + enforce_eager=True, + tensor_parallel_size=tensor_parallel_size) + async_llm = AsyncLLM.from_engine_args(engine_args) + + async def generate(request_id: str): + generator = async_llm.generate("Hello my name is", + request_id=request_id, + sampling_params=SamplingParams()) + try: + async for _ in generator: + pass + except Exception as e: + return e + + NUM_REQS = 3 + tasks = [generate(f"request-{idx}") for idx in range(NUM_REQS)] + outputs = await asyncio.gather(*tasks) + + # Every request should get an EngineDeadError. 
+ for output in outputs: + assert isinstance(output, EngineDeadError) + + # AsyncLLM should be errored. + assert async_llm.errored + + # We should not be able to make another request. + with pytest.raises(EngineDeadError): + async for _ in async_llm.generate("Hello my name is", + request_id="abc", + sampling_params=SamplingParams()): + raise Exception("We should not get here.") + + # Confirm all the processes are cleaned up. + wait_for_gpu_memory_to_clear( + devices=list(range(tensor_parallel_size)), + threshold_bytes=2 * 2**30, + timeout_s=60, + ) + + # NOTE: shutdown is handled by the API Server if an exception + # occurs, so it is expected that we would need to call this. + async_llm.shutdown() + + @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("enable_multiprocessing", [True]) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) From be9d3560cc797dcd9fab00e46ab1c6c9c99c7406 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 12:10:32 -0700 Subject: [PATCH 111/130] Adjust per request failure log messages Signed-off-by: Nick Hill --- vllm/v1/engine/async_llm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 84be8e83b8d2..51617fdbb28e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -308,16 +308,16 @@ async def generate( # Engine is dead. Do not abort since we shut down. except EngineDeadError: if self.log_requests: - logger.info("Request %s failed.", request_id) + logger.info("Request %s failed (engine dead).", request_id) raise - # Bad request discovered in generate() + # Request validation error. except ValueError: if self.log_requests: - logger.info("Request %s failed.", request_id) + logger.info("Request %s failed (bad request).", request_id) raise - # Error in the generate() task (possibly recoverable). + # Unexpected error in the generate() task (possibly recoverable). except Exception as e: await self.abort(request_id) if self.log_requests: From 95a45baace0224fca3350380df2208f1d9f0bd34 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 13:18:03 -0700 Subject: [PATCH 112/130] Move output queue task ref / cleanup to BackgroundResource Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 397526af32ce..f359339994eb 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +import contextlib import queue import uuid import weakref @@ -302,6 +303,7 @@ class BackgroundResources: core_engines: list[CoreEngine] = field(default_factory=list) output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None + output_queue_task: Optional[asyncio.Task] = None shutdown_path: Optional[str] = None # Set if any of the engines are dead. Here so that the output @@ -314,6 +316,10 @@ def __call__(self): for core_engine in self.core_engines: core_engine.close() + if self.output_queue_task is not None: + with contextlib.suppress(Exception): + self.output_queue_task.cancel() + # ZMQ context termination can hang if the sockets # aren't explicitly closed first. 
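        # NOTE: the asyncio output-queue task is cancelled above (errors
        # suppressed) before the sockets are closed, so the reader task is not
        # left awaiting a recv on a socket that is being torn down beneath it.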
if self.output_socket is not None: @@ -622,8 +628,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, Exception]] = asyncio.Queue() - self.queue_task: Optional[asyncio.Task] = None - self.outputs_handler: Optional[Callable[ [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None @@ -638,7 +642,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], pass def _ensure_output_queue_task(self): - if self.queue_task is not None: + resources = self.resources + if resources.output_queue_task is not None: return # Perform IO in separate task to parallelize as much as possible. @@ -651,7 +656,6 @@ def _ensure_output_queue_task(self): output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, zmq.constants.PULL) - resources = self.resources resources.output_socket = output_socket async def process_outputs_socket(): @@ -678,13 +682,8 @@ async def process_outputs_socket(): except Exception as e: outputs_queue.put_nowait(e) - self.queue_task = asyncio.create_task(process_outputs_socket(), - name="EngineCoreOutputQueueTask") - - def shutdown(self): - super().shutdown() - if queue_task := getattr(self, "queue_task", None): - queue_task.cancel() + resources.output_queue_task = asyncio.create_task( + process_outputs_socket(), name="EngineCoreOutputQueueTask") async def get_output_async(self) -> EngineCoreOutputs: self._ensure_output_queue_task() From cb70c37d62ffcdecfde738292c7e86e8d539da4d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 20:21:43 +0000 Subject: [PATCH 113/130] added tests back Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 14 ++++++++++++-- tests/v1/shutdown/test_forward_error.py | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index e11f267f15d9..9a3145502a3a 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -6,18 +6,20 @@ from tests.utils import wait_for_gpu_memory_to_clear from tests.v1.shutdown.utils import (SHUTDOWN_TEST_THRESHOLD_BYTES, SHUTDOWN_TEST_TIMEOUT_SEC) -from vllm import LLM +from vllm import LLM, SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.sampling_params import RequestOutputKind from vllm.utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM MODELS = ["meta-llama/Llama-3.2-1B"] +@pytest.mark.asyncio @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. """ @@ -31,6 +33,12 @@ def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Instantiate AsyncLLM; make request to complete any deferred # initialization; then delete instance async_llm = AsyncLLM.from_engine_args(engine_args) + async for _ in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=1, output_kind=RequestOutputKind.DELTA)): + pass del async_llm # Confirm all the processes are cleaned up. 
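The delete tests above lean on garbage-collection driven cleanup: after one
request forces any deferred initialization, simply dropping the last reference
to the engine is expected to release its background resources (MPClient
registers exactly this kind of hook with weakref.finalize earlier in the
series). A minimal, self-contained illustration of that mechanism, with
made-up names rather than vLLM classes; under CPython the finalizer fires as
soon as the reference count reaches zero:

import weakref


class Resources:
    """Stands in for BackgroundResources: a callable cleanup hook."""

    def __init__(self) -> None:
        self.closed = False

    def __call__(self) -> None:
        self.closed = True


class Client:
    def __init__(self) -> None:
        self.resources = Resources()
        # Runs the cleanup hook when this Client is garbage collected,
        # even if shutdown() is never called explicitly.
        self._finalizer = weakref.finalize(self, self.resources)

    def shutdown(self) -> None:
        # weakref.finalize callbacks run at most once, so explicit shutdown
        # and GC-triggered cleanup can share the same path.
        self._finalizer()


client = Client()
resources = client.resources
del client                # drop the last reference
assert resources.closed   # finalizer already ran (CPython reference counting)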
@@ -61,6 +69,8 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) + llm.generate("Hello my name is", + sampling_params=SamplingParams(max_tokens=1)) del llm # Confirm all the processes are cleaned up. diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 4d0c72771908..558e2cccb8af 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -34,7 +34,7 @@ def evil_forward(self, *args, **kwargs): @pytest.mark.asyncio -@pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("model", MODELS) async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, model: str) -> None: From 775e0c3aaf30a1d96308fd9b84eca3023358cd04 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 10 Apr 2025 20:42:39 +0000 Subject: [PATCH 114/130] knobs for tests Signed-off-by: Andrew Feldman --- tests/v1/shutdown/test_delete.py | 37 ++++++++++++++++++------- tests/v1/shutdown/test_forward_error.py | 3 +- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 9a3145502a3a..2acb175571e5 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -19,9 +19,16 @@ @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: +@pytest.mark.parametrize("send_one_request", [False]) +async def test_async_llm_delete(model: str, tensor_parallel_size: int, + send_one_request: bool) -> None: """Test that AsyncLLM frees GPU memory upon deletion. AsyncLLM always uses an MP client. + + Args: + model: model under test + tensor_parallel_size: degree of tensor parallelism + send_one_request: send one request to engine before deleting """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -33,12 +40,13 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: # Instantiate AsyncLLM; make request to complete any deferred # initialization; then delete instance async_llm = AsyncLLM.from_engine_args(engine_args) - async for _ in async_llm.generate( - "Hello my name is", - request_id="abc", - sampling_params=SamplingParams( - max_tokens=1, output_kind=RequestOutputKind.DELTA)): - pass + if send_one_request: + async for _ in async_llm.generate( + "Hello my name is", + request_id="abc", + sampling_params=SamplingParams( + max_tokens=1, output_kind=RequestOutputKind.DELTA)): + pass del async_llm # Confirm all the processes are cleaned up. @@ -52,10 +60,18 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int) -> None: @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) +@pytest.mark.parametrize("send_one_request", [False]) def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, - enable_multiprocessing: bool) -> None: + enable_multiprocessing: bool, + send_one_request: bool) -> None: """Test that LLM frees GPU memory upon deletion. TODO(andy) - LLM without multiprocessing. 
+ + Args: + model: model under test + tensor_parallel_size: degree of tensor parallelism + enable_multiprocessing: enable workers in separate process(es) + send_one_request: send one request to engine before deleting """ if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") @@ -69,8 +85,9 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, llm = LLM(model=model, enforce_eager=True, tensor_parallel_size=tensor_parallel_size) - llm.generate("Hello my name is", - sampling_params=SamplingParams(max_tokens=1)) + if send_one_request: + llm.generate("Hello my name is", + sampling_params=SamplingParams(max_tokens=1)) del llm # Confirm all the processes are cleaned up. diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 558e2cccb8af..b9d0cd8b0b5f 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -42,7 +42,8 @@ async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, AsyncLLM always uses an MP client. """ - + if tensor_parallel_size > 1: + pytest.skip(reason="Parallelism > 1 not yet supported for this test.") if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") From 35241157325c78eb2084b39711a61e94bfb63d63 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 14:27:44 -0700 Subject: [PATCH 115/130] Fix rebase bug Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f359339994eb..813320dd8ca5 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -335,7 +335,8 @@ def __call__(self): shutdown_sender.send(b'') def validate_alive(self, frames: Sequence[zmq.Frame]): - if len(frames) == 1 and frames[0] == EngineCoreProc.ENGINE_CORE_DEAD: + if len(frames) == 1 and (frames[0].buffer + == EngineCoreProc.ENGINE_CORE_DEAD): self.engine_dead = True raise EngineDeadError() From de51ec11b9b10cc2593380c9b871e6bc8848c271 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 15:22:50 -0700 Subject: [PATCH 116/130] Fix AsyncLLM garbage collection cleanup issue Signed-off-by: Nick Hill --- tests/v1/engine/test_async_llm.py | 10 +-- vllm/v1/engine/async_llm.py | 130 ++++++++++++++++-------------- vllm/v1/engine/core_client.py | 4 +- 3 files changed, 75 insertions(+), 69 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index da0639678af8..6ff5e082032b 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -from contextlib import ExitStack from typing import Optional import pytest @@ -86,11 +85,10 @@ async def test_load(monkeypatch: pytest.MonkeyPatch, # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. 
- with monkeypatch.context() as m, ExitStack() as after: + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) - after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 10 @@ -129,11 +127,10 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m, ExitStack() as after: + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) - after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 @@ -195,11 +192,10 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m, ExitStack() as after: + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) - after.callback(engine.shutdown) sampling_params = SamplingParams(max_tokens=100, output_kind=RequestOutputKind.DELTA, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 51617fdbb28e..46e004ef939c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -110,8 +110,7 @@ def __init__( try: # Start output handler eagerly if we are in the asyncio eventloop. asyncio.get_running_loop() - self.output_handler = asyncio.create_task( - self._run_output_handler()) + self._run_output_handler() except RuntimeError: pass @@ -171,6 +170,9 @@ def from_engine_args( usage_context=usage_context, ) + def __del__(self): + self.shutdown() + def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" @@ -270,9 +272,7 @@ async def generate( # We start the output_handler on the first call to generate() so # we can call __init__ before the event loop, which enables us # to handle startup failure gracefully in the OpenAI server. - if self.output_handler is None: - self.output_handler = asyncio.create_task( - self._run_output_handler()) + self._run_output_handler() q = await self.add_request( request_id, @@ -324,55 +324,69 @@ async def generate( logger.info("Request %s failed.", request_id) raise EngineGenerateError() from e - async def _run_output_handler(self): + def _run_output_handler(self): """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" - try: - while True: - # 1) Pull EngineCoreOutputs from the EngineCore. - outputs = await self.engine_core.get_output_async() - num_outputs = len(outputs.outputs) - - iteration_stats = IterationStats() if ( - self.log_stats and num_outputs) else None - - # Split outputs into chunks of at most - # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the - # event loop for too long. - if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: - slices = (outputs.outputs, ) - else: - slices = np.array_split( - outputs.outputs, - cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) - - for i, outputs_slice in enumerate(slices): - # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - outputs_slice, outputs.timestamp, iteration_stats) - # NOTE: RequestOutputs are pushed to their queues. - assert not processed_outputs.request_outputs - - # Allow other asyncio tasks to run between chunks - if i + 1 < len(slices): - await asyncio.sleep(0) - - # 3) Abort any reqs that finished due to stop strings. 
- await self.engine_core.abort_requests_async(
- processed_outputs.reqs_to_abort)
-
- # 4) Logging.
- # TODO(rob): make into a coroutine and launch it in
- # background thread once Prometheus overhead is non-trivial.
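The rewrite in this patch hinges on the circular-reference note earlier in the hunk: if the output-handler coroutine closes over self, the task keeps the AsyncLLM instance alive and garbage-collection-driven shutdown never runs. A stripped-down sketch of the pattern with illustrative names (not the real vLLM classes):

import asyncio
from typing import Optional


class Client:

    def __init__(self) -> None:
        self.queue: asyncio.Queue[str] = asyncio.Queue()
        self.handler: Optional[asyncio.Task] = None

    def start_handler(self) -> None:
        # Must be called from within a running event loop.
        if self.handler is not None:
            return
        # Bind only what the loop needs to a local so the closure holds
        # no reference back to self; the task then cannot keep this
        # object alive after the caller drops it.
        queue = self.queue

        async def handle() -> None:
            while True:
                item = await queue.get()
                print("handled", item)

        self.handler = asyncio.create_task(handle())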
+ if stat_loggers: + assert outputs.scheduler_stats is not None + AsyncLLM._record_stats( + stat_loggers[outputs.engine_index], + scheduler_stats=outputs.scheduler_stats, + iteration_stats=iteration_stats, + ) + except Exception as e: + logger.exception("AsyncLLM output_handler failed.") + output_processor.propagate_error(e) + + self.output_handler = asyncio.create_task(output_handler()) async def abort(self, request_id: str) -> None: """Abort RequestId in OutputProcessor and EngineCore.""" @@ -383,17 +397,15 @@ async def abort(self, request_id: str) -> None: if self.log_requests: logger.info("Aborted request %s.", request_id) + @staticmethod def _record_stats( - self, - scheduler_stats: Optional[SchedulerStats], + stat_loggers: list[StatLoggerBase], + scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats], - engine_index: int = 0, ): - if not self.log_stats: - return - - assert scheduler_stats is not None - for stat_logger in self.stat_loggers[engine_index]: + """static so that it can be used from the output_handler task + without a circular ref to AsyncLLM.""" + for stat_logger in stat_loggers: stat_logger.record(scheduler_stats=scheduler_stats, iteration_stats=iteration_stats) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 813320dd8ca5..3ed3d92290a8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -import contextlib import queue import uuid import weakref @@ -317,8 +316,7 @@ def __call__(self): core_engine.close() if self.output_queue_task is not None: - with contextlib.suppress(Exception): - self.output_queue_task.cancel() + self.output_queue_task.cancel() # ZMQ context termination can hang if the sockets # aren't explicitly closed first. From a0536c453e9f58e090c67063b0c44e50062c3aac Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 17:44:59 -0700 Subject: [PATCH 117/130] Re-enable failing test (seems to work now) Signed-off-by: Nick Hill --- tests/v1/shutdown/test_forward_error.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index b9d0cd8b0b5f..9fedbe4f9a01 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -42,8 +42,6 @@ async def test_async_llm_model_error(monkeypatch, tensor_parallel_size: int, AsyncLLM always uses an MP client. 
""" - if tensor_parallel_size > 1: - pytest.skip(reason="Parallelism > 1 not yet supported for this test.") if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") From 76494dce61371c01cacaba8721d880fda8760204 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 10 Apr 2025 17:55:53 -0700 Subject: [PATCH 118/130] Re-enable other failing test (also seems to work now) Signed-off-by: Nick Hill --- tests/v1/shutdown/test_delete.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index 2acb175571e5..ed368fe828d0 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -19,7 +19,7 @@ @pytest.mark.timeout(SHUTDOWN_TEST_TIMEOUT_SEC) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) -@pytest.mark.parametrize("send_one_request", [False]) +@pytest.mark.parametrize("send_one_request", [False, True]) async def test_async_llm_delete(model: str, tensor_parallel_size: int, send_one_request: bool) -> None: """Test that AsyncLLM frees GPU memory upon deletion. @@ -60,7 +60,7 @@ async def test_async_llm_delete(model: str, tensor_parallel_size: int, @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tensor_parallel_size", [2, 1]) @pytest.mark.parametrize("enable_multiprocessing", [True]) -@pytest.mark.parametrize("send_one_request", [False]) +@pytest.mark.parametrize("send_one_request", [False, True]) def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int, enable_multiprocessing: bool, send_one_request: bool) -> None: From b5d870294d71c79a2b3eef3fbc7086fc2f1132aa Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 11 Apr 2025 02:01:17 +0000 Subject: [PATCH 119/130] CUDA_VISIBLE_DEVICES for shutdown tests in buildkite Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8e75470ca5ef..a435c9a4ec0b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -201,7 +201,7 @@ steps: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/engine - - pytest -v -s v1/shutdown + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/entrypoints - pytest -v -s v1/sample - pytest -v -s v1/worker From b067f8dd4b98d71a4a8006c76609ad2d89d5a42f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 11 Apr 2025 02:07:58 +0000 Subject: [PATCH 120/130] temporarily enabled v1 fastcheck test Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a435c9a4ec0b..99674dce905a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -197,6 +197,7 @@ steps: source_file_dependencies: - vllm/ - tests/v1 + fast_check: true commands: # split the test to avoid interference - pytest -v -s v1/core From e94c89e9f567b6766632dfad86d3d09753b69509 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Fri, 11 Apr 2025 03:37:43 +0000 Subject: [PATCH 121/130] moved shutdown tests to 2 GPU section Signed-off-by: Andrew Feldman --- .buildkite/test-pipeline.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 99674dce905a..3c0d18103731 100644 --- 
a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -197,12 +197,10 @@ steps: source_file_dependencies: - vllm/ - tests/v1 - fast_check: true commands: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/engine - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/entrypoints - pytest -v -s v1/sample - pytest -v -s v1/worker @@ -540,6 +538,7 @@ steps: # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - label: Plugin Tests (2 GPUs) # 40min working_dir: "/vllm-workspace/tests" From 6de94aa9a4a386e34b866f63de017948b9c8de17 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 07:12:51 -0700 Subject: [PATCH 122/130] Fix breakage to DP case Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 3ed3d92290a8..edd5788eec57 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -793,10 +793,6 @@ class DPAsyncMPClient(AsyncMPClient): def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): - # Control message used for triggering dp idle mode loop. - self.start_dp_msg = (EngineCoreRequestType.START_DP.value, - *self.encoder.encode(None)) - self.num_engines_running = 0 self.reqs_in_flight: dict[str, CoreEngine] = {} @@ -804,6 +800,10 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], super().__init__(vllm_config, executor_class, log_stats) + # Control message used for triggering dp idle mode loop. + self.start_dp_msg = (EngineCoreRequestType.START_DP.value, + *self.encoder.encode(None)) + assert len(self.core_engines) > 1 def _init_core_engines( From 060ecd931c42a0eff10fa8b6c44876d4f2650441 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 12:56:43 -0700 Subject: [PATCH 123/130] Properly fix DP breakage Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index edd5788eec57..b63821bb56e2 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -627,9 +627,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, Exception]] = asyncio.Queue() - self.outputs_handler: Optional[Callable[ - [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None - try: # If we are running in an asyncio event loop, start the queue task. # Otherwise, it will be started lazily. 
If it is not started here, @@ -650,7 +647,10 @@ def _ensure_output_queue_task(self): decoder = self.decoder utility_results = self.utility_results outputs_queue = self.outputs_queue - output_handler = self.outputs_handler + output_handler: Optional[Callable[[AsyncMPClient, EngineCoreOutputs], + Awaitable[None]]] = getattr( + self.__class__, + "process_engine_outputs", None) _self_ref = weakref.ref(self) if output_handler else None output_path = self.output_path output_socket = make_zmq_socket(self.ctx, output_path, From 4228bb4ad2b7b07853d0aa6a533210a5445c5795 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 17:23:30 -0700 Subject: [PATCH 124/130] Add timeout to TP execute_model, reply only from rank0 Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 45 ++++++++++++++++++-------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 3734d431292c..b23bdc5ffd4c 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -7,13 +7,14 @@ import time import traceback import weakref +from concurrent.futures import Future from dataclasses import dataclass from enum import Enum, auto from functools import partial from multiprocessing.connection import Connection from multiprocessing.process import BaseProcess from threading import Thread -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, Union, cast import cloudpickle @@ -28,6 +29,7 @@ from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port) from vllm.v1.executor.abstract import Executor +from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -35,6 +37,8 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 +EXECUTE_MODEL_TIMEOUT_S = 30 + class MultiprocExecutor(Executor): @@ -133,11 +137,22 @@ def register_failure_callback(self, callback: Callable): else: self.failure_callback = callback + def execute_model( + self, + scheduler_output, + ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]: + (output, ) = self.collective_rpc("execute_model", + args=(scheduler_output, ), + rank0_reply_only=True, + timeout=EXECUTE_MODEL_TIMEOUT_S) + return output + def collective_rpc(self, method: Union[str, Callable], - timeout: Optional[float] = None, + timeout: Optional[float] = 180.0, args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: + kwargs: Optional[dict] = None, + rank0_reply_only: bool = False) -> list[Any]: start_time = time.monotonic() kwargs = kwargs or {} @@ -153,10 +168,11 @@ def collective_rpc(self, else: send_method = cloudpickle.dumps( method, protocol=pickle.HIGHEST_PROTOCOL) - self.rpc_broadcast_mq.enqueue((send_method, args, kwargs)) + self.rpc_broadcast_mq.enqueue( + (send_method, args, kwargs, rank0_reply_only)) responses = [None] * self.world_size - for w in self.workers: + for w in (self.workers[0], ) if rank0_reply_only else self.workers: dequeue_timeout = timeout - (time.monotonic() - start_time ) if timeout is not None else None status, result = w.worker_response_mq.dequeue( @@ -326,7 +342,8 @@ def wait_for_ready( "See stack trace for root cause.") pipes = {handle.ready_pipe: handle for handle in unready_proc_handles} - ready_proc_handles = [] + ready_proc_handles: list[Optional[WorkerProcHandle]] = ( + [None] * len(unready_proc_handles)) while pipes: ready = 
multiprocessing.connection.wait(pipes.keys()) for pipe in ready: @@ -341,7 +358,7 @@ def wait_for_ready( # Extract the message queue handle. worker_response_mq = MessageQueue.create_from_handle( response["handle"], 0) - ready_proc_handles.append( + ready_proc_handles[unready_proc_handle.rank] = ( WorkerProcHandle.from_unready_handle( unready_proc_handle, worker_response_mq)) @@ -353,7 +370,7 @@ def wait_for_ready( # Close connection. pipe.close() - return ready_proc_handles + return cast(list[WorkerProcHandle], ready_proc_handles) def shutdown(self): self.rpc_broadcast_mq = None @@ -435,7 +452,7 @@ class ResponseStatus(Enum): def worker_busy_loop(self): """Main busy loop for Multiprocessing Workers""" while True: - method, args, kwargs = self.rpc_broadcast_mq.dequeue() + method, args, kwargs, rank0_only = self.rpc_broadcast_mq.dequeue() try: if isinstance(method, str): @@ -450,9 +467,11 @@ def worker_busy_loop(self): logger.exception("WorkerProc hit an exception.") # exception might not be serializable, so we convert it to # string, only for logging purpose. - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.FAILURE, str(e))) + if not rank0_only or self.rank == 0: + self.worker_response_mq.enqueue( + (WorkerProc.ResponseStatus.FAILURE, str(e))) continue - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.SUCCESS, output)) + if not rank0_only or self.rank == 0: + self.worker_response_mq.enqueue( + (WorkerProc.ResponseStatus.SUCCESS, output)) From 6c540c3cc8f166facfac12d7b15517494dd5209d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 18:07:21 -0700 Subject: [PATCH 125/130] Cancel shm dequeue on shutdown Signed-off-by: Nick Hill --- .../device_communicators/shm_broadcast.py | 31 ++++++++++++++----- vllm/v1/executor/multiproc_executor.py | 5 ++- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 11ed7c084377..49a65bd0d118 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -7,11 +7,13 @@ from contextlib import contextmanager from dataclasses import dataclass, field from multiprocessing import shared_memory -from typing import List, Optional, Tuple, Union +from threading import Event +from typing import Any, List, Optional, Tuple, Union from unittest.mock import patch import torch import torch.distributed as dist +import zmq from torch.distributed import ProcessGroup from zmq import IPV6 # type: ignore from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore @@ -400,7 +402,9 @@ def acquire_write(self, timeout: Optional[float] = None): break @contextmanager - def acquire_read(self, timeout: Optional[float] = None): + def acquire_read(self, + timeout: Optional[float] = None, + cancel: Optional[Event] = None): assert self._is_local_reader, "Only readers can acquire read" start_time = time.monotonic() n_warning = 1 @@ -430,6 +434,9 @@ def acquire_read(self, timeout: Optional[float] = None): ) n_warning += 1 + if cancel is not None and cancel.is_set(): + raise RuntimeError("cancelled") + # if we time out, raise an exception if (timeout is not None and time.monotonic() - start_time > timeout): @@ -464,10 +471,12 @@ def enqueue(self, obj, timeout: Optional[float] = None): if self.n_remote_reader > 0: self.remote_socket.send(serialized_obj) - def dequeue(self, timeout: Optional[float] = None): + def dequeue(self, + timeout: 
Optional[float] = None, + cancel: Optional[Event] = None): """ Read from message queue with optional timeout (in seconds) """ if self._is_local_reader: - with self.acquire_read(timeout) as buf: + with self.acquire_read(timeout, cancel) as buf: overflow = buf[0] == 1 if not overflow: # no need to know the size of serialized object @@ -475,15 +484,21 @@ def dequeue(self, timeout: Optional[float] = None): # see https://docs.python.org/3/library/pickle.html obj = pickle.loads(buf[1:]) if overflow: - recv = self.local_socket.recv() - obj = pickle.loads(recv) + obj = MessageQueue.recv(self.local_socket, timeout) elif self._is_remote_reader: - recv = self.remote_socket.recv() - obj = pickle.loads(recv) + obj = MessageQueue.recv(self.remote_socket, timeout) else: raise RuntimeError("Only readers can dequeue") return obj + @staticmethod + def recv(socket: zmq.Socket, timeout: Optional[float]) -> Any: + timeout_ms = None if timeout is None else int(timeout * 1000) + if not socket.poll(timeout=timeout_ms): + raise TimeoutError + recv = socket.recv(copy=False) + return pickle.loads(recv.buffer) + def broadcast_object(self, obj=None): if self._is_writer: self.enqueue(obj) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index b23bdc5ffd4c..aeafe33adcd8 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -4,6 +4,7 @@ import pickle import signal import sys +import threading import time import traceback import weakref @@ -47,6 +48,7 @@ def _init_executor(self) -> None: # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) self.is_failed = False + self.shutdown_event = threading.Event() self.failure_callback: Optional[Callable] = None self.world_size = self.parallel_config.world_size @@ -176,7 +178,7 @@ def collective_rpc(self, dequeue_timeout = timeout - (time.monotonic() - start_time ) if timeout is not None else None status, result = w.worker_response_mq.dequeue( - timeout=dequeue_timeout) + timeout=dequeue_timeout, cancel=self.shutdown_event) if status != WorkerProc.ResponseStatus.SUCCESS: raise RuntimeError( @@ -221,6 +223,7 @@ def shutdown(self): """Properly shut down the executor and its workers""" if not getattr(self, 'shutting_down', False): self.shutting_down = True + self.shutdown_event.set() for w in self.workers: w.worker_response_mq = None self._ensure_worker_termination([w.proc for w in self.workers]) From da8c253c842230d5f19b58d402e811fd49bff664 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 11 Apr 2025 18:42:29 -0700 Subject: [PATCH 126/130] fix Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index aeafe33adcd8..fd4ea4b4421a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -173,8 +173,9 @@ def collective_rpc(self, self.rpc_broadcast_mq.enqueue( (send_method, args, kwargs, rank0_reply_only)) - responses = [None] * self.world_size - for w in (self.workers[0], ) if rank0_reply_only else self.workers: + workers = (self.workers[0], ) if rank0_reply_only else self.workers + responses = [None] * len(workers) + for w in workers: dequeue_timeout = timeout - (time.monotonic() - start_time ) if timeout is not None else None status, result = w.worker_response_mq.dequeue( From 27d7d82f0e7bc5d0bb56b9c7553b60fea34fb8e5 Mon Sep 17 00:00:00 2001 From: 
Nick Hill Date: Sat, 12 Apr 2025 00:25:33 -0700 Subject: [PATCH 127/130] Fix exception message Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index fd4ea4b4421a..b3dfd0755da8 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -183,8 +183,8 @@ def collective_rpc(self, if status != WorkerProc.ResponseStatus.SUCCESS: raise RuntimeError( - "Worker failed with error %s, please check the" - " stack trace above for the root cause", result) + f"Worker failed with error '{result}', please check the" + " stack trace above for the root cause") responses[w.rank] = result From 444a446ba45bf57bcab89933e1c9b94247437bee Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 12 Apr 2025 00:35:45 -0700 Subject: [PATCH 128/130] Cleanup Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b63821bb56e2..7b7505423ff6 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -796,8 +796,6 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], self.num_engines_running = 0 self.reqs_in_flight: dict[str, CoreEngine] = {} - self.outputs_handler = DPAsyncMPClient.process_engine_outputs # type: ignore[assignment] - super().__init__(vllm_config, executor_class, log_stats) # Control message used for triggering dp idle mode loop. From e33000e4e2fc173570ad85e646fbc129e03c8032 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Mon, 14 Apr 2025 15:30:57 +0000 Subject: [PATCH 129/130] revert Signed-off-by: Andrew Feldman --- tests/v1/engine/test_async_llm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 6ff5e082032b..da0639678af8 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +from contextlib import ExitStack from typing import Optional import pytest @@ -85,10 +86,11 @@ async def test_load(monkeypatch: pytest.MonkeyPatch, # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. 
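The hunk below reinstates the ExitStack pattern so that engine.shutdown() still runs when an assertion fails mid-test. In isolation the pattern looks like this; FakeEngine is a stand-in, not vLLM code:

from contextlib import ExitStack


class FakeEngine:

    def shutdown(self) -> None:
        print("engine shut down")


def run_test() -> None:
    with ExitStack() as after:
        engine = FakeEngine()
        # Callbacks run in reverse order when the block exits,
        # even if an assertion below raises.
        after.callback(engine.shutdown)
        assert engine is not None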
- with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 10 @@ -127,10 +129,11 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, output_kind: RequestOutputKind, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 @@ -192,10 +195,11 @@ async def test_abort(monkeypatch: pytest.MonkeyPatch, async def test_finished_flag(monkeypatch: pytest.MonkeyPatch, n: int, engine_args: AsyncEngineArgs, prompt: PromptType): - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) sampling_params = SamplingParams(max_tokens=100, output_kind=RequestOutputKind.DELTA, From 7cf6b6f170c5a717a908487d72c1e1530ca1d7f8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 16 Apr 2025 11:17:13 -0700 Subject: [PATCH 130/130] Address review comments from @DarkLight1337 Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 7 ++----- vllm/v1/engine/core_client.py | 7 +++---- vllm/v1/executor/abstract.py | 4 +++- vllm/v1/executor/multiproc_executor.py | 9 ++++----- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c6b6febbfd5c..ba5e5050abbb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -12,7 +12,6 @@ import msgspec import zmq -import zmq.asyncio from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group @@ -320,8 +319,7 @@ def __init__( log_stats: bool, engine_index: int = 0, ): - input_queue: queue.Queue[tuple[EngineCoreRequestType, - Any]] = queue.Queue() + input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() executor_fail_callback = lambda: input_queue.put_nowait( (EngineCoreRequestType.EXECUTOR_FAILED, b'')) @@ -339,8 +337,7 @@ def __init__( # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. self.input_queue = input_queue - self.output_queue: queue.Queue[Union[EngineCoreOutputs, - bytes]] = queue.Queue() + self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]() threading.Thread(target=self.process_input_socket, args=(input_path, engine_index), daemon=True).start() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7b7505423ff6..f54b3546f06d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -481,8 +481,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats=log_stats, ) - self.outputs_queue: queue.Queue[Union[EngineCoreOutputs, - Exception]] = queue.Queue() + self.outputs_queue = queue.Queue[Union[EngineCoreOutputs, Exception]]() # Ensure that the outputs socket processing thread does not have # a ref to the client which prevents gc. 
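The comment at the end of the previous hunk names the same idiom used for the outputs thread: hand the worker a weakref.ref to the client instead of the client itself, so the thread never pins the object and __del__-based cleanup can still fire. A minimal sketch of that idiom, with illustrative names:

import queue
import threading
import time
import weakref


class Client:

    def __init__(self) -> None:
        self.outputs: queue.Queue[bytes] = queue.Queue()
        # Capture only a weak reference; a strong one would keep the
        # client alive for as long as the daemon thread runs.
        self_ref = weakref.ref(self)

        def process_outputs() -> None:
            while True:
                client = self_ref()
                if client is None:
                    # Client was garbage collected; let the thread exit.
                    return
                client.outputs.put(b"output")
                del client  # drop the strong ref before sleeping
                time.sleep(0.1)

        threading.Thread(target=process_outputs, daemon=True).start()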
@@ -625,8 +624,8 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats=log_stats, ) - self.outputs_queue: asyncio.Queue[Union[EngineCoreOutputs, - Exception]] = asyncio.Queue() + self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, + Exception]]() try: # If we are running in an asyncio event loop, start the queue task. # Otherwise, it will be started lazily. If it is not started here, diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index dae6ca613080..3b9feb0d3298 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -15,6 +15,8 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput +FailureCallback = Callable[[], None] + class Executor(ExecutorBase): """ @@ -62,7 +64,7 @@ def initialize_from_config(self, args=(kv_cache_configs, )) self.collective_rpc("compile_or_warm_up_model") - def register_failure_callback(self, callback: Callable): + def register_failure_callback(self, callback: FailureCallback): """ Register a function to be called if the executor enters a permanent failed state. diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index b3dfd0755da8..cff6181fa3ad 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -29,7 +29,7 @@ from vllm.logger import init_logger from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port) -from vllm.v1.executor.abstract import Executor +from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase @@ -49,7 +49,7 @@ def _init_executor(self) -> None: self._finalizer = weakref.finalize(self, self.shutdown) self.is_failed = False self.shutdown_event = threading.Event() - self.failure_callback: Optional[Callable] = None + self.failure_callback: Optional[FailureCallback] = None self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -133,7 +133,7 @@ def monitor_workers(): daemon=True, name="MultiprocWorkerMonitor").start() - def register_failure_callback(self, callback: Callable): + def register_failure_callback(self, callback: FailureCallback): if self.is_failed: callback() else: @@ -276,8 +276,7 @@ def __init__( ): self.rank = rank wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) - # TODO: move `init_worker` to executor level as a collective rpc - # call + # TODO: move `init_worker` to executor level as a collective rpc call all_kwargs: list[dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) ]
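As a compressed view of the failure-callback contract the executor changes in this series settle on: registering after a failure fires the callback immediately, otherwise the monitor invokes it once a worker dies. Sketch only; the real MultiprocExecutor tracks more state:

from typing import Callable, Optional

FailureCallback = Callable[[], None]


class ExecutorSketch:

    def __init__(self) -> None:
        self.is_failed = False
        self._failure_callback: Optional[FailureCallback] = None

    def register_failure_callback(self, callback: FailureCallback) -> None:
        # Late registration still observes an earlier failure.
        if self.is_failed:
            callback()
        else:
            self._failure_callback = callback

    def _on_worker_death(self) -> None:
        # Called from the monitor thread when a worker proc exits badly.
        self.is_failed = True
        if self._failure_callback is not None:
            callback, self._failure_callback = self._failure_callback, None
            callback()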