diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py
index 2dcdda04ecb5..2d677a00b646 100644
--- a/tests/v1/entrypoints/openai/responses/conftest.py
+++ b/tests/v1/entrypoints/openai/responses/conftest.py
@@ -21,12 +21,16 @@ def default_server_args():
 
 
 @pytest.fixture(scope="module")
-def server(default_server_args):
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+def server_with_store(default_server_args):
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
 
 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(server_with_store):
+    async with server_with_store.get_async_client() as async_client:
         yield async_client
diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/responses/test_image.py
index f3bce91e97cd..c8d09fd39fb1 100644
--- a/tests/v1/entrypoints/openai/responses/test_image.py
+++ b/tests/v1/entrypoints/openai/responses/test_image.py
@@ -37,8 +37,11 @@ def default_image_server_args():
 
 
 @pytest.fixture(scope="module")
 def image_server(default_image_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_server_args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_image_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 64880a3a5377..5e9401cbd747 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -11,6 +11,7 @@
 from fastapi import Request
 from openai.types.responses import ResponseOutputMessage, ResponseOutputText
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
@@ -89,15 +90,17 @@ def __init__(
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
+        # False by default.
+        self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove responses
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove responses from the store.
         self.response_store: dict[str, ResponsesResponse] = {}
         self.response_store_lock = asyncio.Lock()
 
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove messages
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove messages from the store.
         self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
 
         self.background_tasks: dict[str, asyncio.Task] = {}
@@ -118,6 +121,10 @@ async def create_responses(
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
+        # If store is not enabled, return an error.
+        if request.store and not self.enable_store:
+            return self._make_store_not_supported_error()
+
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
         if prev_response_id is not None:
@@ -456,3 +463,13 @@ def _make_not_found_error(self, response_id: str) -> ErrorResponse:
             message=f"Response with id '{response_id}' not found.",
             status_code=HTTPStatus.NOT_FOUND,
         )
+
+    def _make_store_not_supported_error(self) -> ErrorResponse:
+        return self.create_error_response(
+            err_type="invalid_request_error",
+            message=("`store=True` (default) is not supported. Please set "
+                     "`store=False` in Responses API or set "
+                     "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
+                     "starting the vLLM server."),
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
diff --git a/vllm/envs.py b/vllm/envs.py
index 2d470c6dccbf..8d3c7eab471c 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -151,6 +151,7 @@
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False
 
 
 def get_default_cache_root():
@@ -1056,6 +1057,17 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
     lambda: bool(int(os.getenv(\
         "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server will retain the input and output
+    # messages for those requests in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }
 # --8<-- [end:env-vars-definition]
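For context, here is a minimal client-side sketch of how the gated `store` behavior plays out. The base URL, API key, and model name are placeholders (assumptions, not part of this diff), and the snippet only uses the standard `openai` Python SDK Responses API surface.

```python
# Minimal sketch: assumes a vLLM OpenAI-compatible server at localhost:8000
# and a placeholder model name; adjust both to your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Case 1: server started WITHOUT VLLM_ENABLE_RESPONSES_API_STORE=1.
# store=True is the Responses API default, so requests must opt out
# explicitly; otherwise the server returns a 400 invalid_request_error.
resp = client.responses.create(
    model="my-model",  # placeholder
    input="Say hello.",
    store=False,
)
print(resp.output_text)

# Case 2: server started WITH VLLM_ENABLE_RESPONSES_API_STORE=1.
# Responses are kept in server memory, so a later request can chain off a
# stored one via previous_response_id (at the cost of the memory growth
# noted in the env var docstring above).
stored = client.responses.create(
    model="my-model",
    input="Say hello.",
)
follow_up = client.responses.create(
    model="my-model",
    input="Now say it in French.",
    previous_response_id=stored.id,
)
```

Enabling the store on the server side mirrors what the test fixtures do via `env_dict`, e.g. launching with `VLLM_ENABLE_RESPONSES_API_STORE=1 vllm serve <model>`.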