12 changes: 8 additions & 4 deletions tests/v1/entrypoints/openai/responses/conftest.py
@@ -21,12 +21,16 @@ def default_server_args():


 @pytest.fixture(scope="module")
-def server(default_server_args):
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+def server_with_store(default_server_args):
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        default_server_args,
+        env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server


 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(server_with_store):
+    async with server_with_store.get_async_client() as async_client:
         yield async_client
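With `server_with_store` (and the matching `image_server` fixture below) exporting `VLLM_ENABLE_RESPONSES_API_STORE=1`, tests bound to the `client` fixture can exercise the store path end to end. A minimal sketch of such a test; the test name, prompt, placeholder model, and assertions are illustrative and not part of this PR:

import pytest

MODEL_NAME = "placeholder-model"  # placeholder; the real constant lives in the conftest


@pytest.mark.asyncio
async def test_store_roundtrip(client):
    # store=True is accepted because server_with_store starts the server
    # with VLLM_ENABLE_RESPONSES_API_STORE=1.
    response = await client.responses.create(
        model=MODEL_NAME,
        input="What is 13 * 24?",
        store=True,
    )
    # The stored response can be fetched back by its ID.
    retrieved = await client.responses.retrieve(response.id)
    assert retrieved.id == response.id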
7 changes: 5 additions & 2 deletions tests/v1/entrypoints/openai/responses/test_image.py
@@ -37,8 +37,11 @@ def default_image_server_args():

 @pytest.fixture(scope="module")
 def image_server(default_image_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_server_args) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME,
+        default_image_server_args,
+        env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server

25 changes: 21 additions & 4 deletions vllm/entrypoints/openai/serving_responses.py
@@ -11,6 +11,7 @@
 from fastapi import Request
 from openai.types.responses import ResponseOutputMessage, ResponseOutputText

+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
@@ -89,15 +90,17 @@ def __init__(
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)

+        # False by default.
+        self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove responses
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove responses from the store.
         self.response_store: dict[str, ResponsesResponse] = {}
         self.response_store_lock = asyncio.Lock()

         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove messages
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove messages from the store.
         self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}

         self.background_tasks: dict[str, asyncio.Task] = {}
@@ -118,6 +121,10 @@ async def create_responses(
         if self.engine_client.errored:
             raise self.engine_client.dead_error

+        # If store is not enabled, return an error.
+        if request.store and not self.enable_store:
+            return self._make_store_not_supported_error()
+
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
         if prev_response_id is not None:
@@ -456,3 +463,13 @@ def _make_not_found_error(self, response_id: str) -> ErrorResponse:
             message=f"Response with id '{response_id}' not found.",
             status_code=HTTPStatus.NOT_FOUND,
         )
+
+    def _make_store_not_supported_error(self) -> ErrorResponse:
+        return self.create_error_response(
+            err_type="invalid_request_error",
+            message=("`store=True` (default) is not supported. Please set "
+                     "`store=False` in Responses API or set "
+                     "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
+                     "starting the vLLM server."),
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
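On the client side, the default `store=True` is now rejected with HTTP 400 unless the server was started with `VLLM_ENABLE_RESPONSES_API_STORE=1`. A hedged sketch using the openai Python SDK against a locally running vLLM server; the base URL, API key, and model name are placeholders:

from openai import BadRequestError, OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

try:
    # Default store=True: rejected when the server does not enable the store.
    client.responses.create(model="placeholder-model", input="Hello!")
except BadRequestError as exc:
    print("store not enabled:", exc)

# Opting out of storage succeeds regardless of the env var.
resp = client.responses.create(
    model="placeholder-model",
    input="Hello!",
    store=False,
)
print(resp.output_text)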
12 changes: 12 additions & 0 deletions vllm/envs.py
@@ -151,6 +151,7 @@
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False


 def get_default_cache_root():
@@ -1056,6 +1057,17 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
     lambda: bool(int(os.getenv(\
         "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server will retain the input and output
+    # messages for those requests in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }

 # --8<-- [end:env-vars-definition]
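For reference, the registered lambda parses the variable as a boolean-encoded integer. A standalone sketch of the same rule, useful for checking what a given shell value will do; the helper name is illustrative and not part of vLLM:

import os


def responses_api_store_enabled() -> bool:
    # Mirrors the lambda above: unset or "0" -> False, "1" (or any non-zero
    # integer string) -> True; non-integer values such as "true" raise ValueError.
    return bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0")))


if __name__ == "__main__":
    print(responses_api_store_enabled())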