Implement OpenAI Responses API [1/N] #20504
Merged
23 commits:

- `1a9145d` wip (WoosukKwon)
- `2b0b955` implement store=True (WoosukKwon)
- `2948bc0` minor (WoosukKwon)
- `4056eaf` Improve ResponseStore (WoosukKwon)
- `281dc7c` minor (WoosukKwon)
- `ed1a89b` cover failure (WoosukKwon)
- `fd43843` Implement cancel (WoosukKwon)
- `995f355` minor (WoosukKwon)
- `6bfdfa1` minor (WoosukKwon)
- `2894389` fix cancel (WoosukKwon)
- `3c8128b` Support prev response id (WoosukKwon)
- `fb0d72e` yapf (WoosukKwon)
- `1796af6` tool choice (WoosukKwon)
- `b493611` mypy (WoosukKwon)
- `d27407d` mypy (WoosukKwon)
- `6c5ec07` Add test (WoosukKwon)
- `3253460` Merge branch 'main' into woosuk/responses-api (WoosukKwon)
- `fbaf8ea` name -> event_name (WoosukKwon)
- `f3cfab2` Fix tests (WoosukKwon)
- `ea7a357` fix test (WoosukKwon)
- `04824cf` fix (WoosukKwon)
- `722789d` Fix test (WoosukKwon)
- `cdcaac3` Merge branch 'main' into woosuk/responses-api (WoosukKwon)
Empty file.
New file (32 lines):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer

# Use a small reasoning model to test the responses API.
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def default_server_args():
    return [
        "--max-model-len",
        "8192",
        "--enforce-eager",  # For faster startup.
        "--reasoning-parser",
        "deepseek_r1",
    ]


@pytest.fixture(scope="module")
def server(default_server_args):
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client
```
New file (75 lines):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import openai  # use the official client for correctness check
import pytest


@pytest.mark.asyncio
async def test_simple_input(client: openai.AsyncOpenAI):
    response = await client.responses.create(input="What is 13 * 24?")
    print(response)

    outputs = response.output
    # Whether the output contains the answer.
    assert outputs[-1].type == "message"
    assert "312" in outputs[-1].content[0].text

    # Whether the output contains the reasoning.
    assert outputs[0].type == "reasoning"
    assert outputs[0].text != ""


@pytest.mark.asyncio
async def test_instructions(client: openai.AsyncOpenAI):
    response = await client.responses.create(
        instructions="Finish the answer with QED.",
        input="What is 13 * 24?",
    )
    print(response)

    output_text = response.output[-1].content[0].text
    assert "312" in output_text
    assert "QED" in output_text


@pytest.mark.asyncio
async def test_chat(client: openai.AsyncOpenAI):
    response = await client.responses.create(input=[
        {
            "role": "system",
            "content": "Finish the answer with QED."
        },
        {
            "role": "user",
            "content": "What is 5 * 3?"
        },
        {
            "role": "assistant",
            "content": "15. QED."
        },
        {
            "role": "user",
            "content": "Multiply the result by 2."
        },
    ])
    print(response)

    output_text = response.output[-1].content[0].text
    assert "30" in output_text
    assert "QED" in output_text


@pytest.mark.asyncio
async def test_chat_with_input_type(client: openai.AsyncOpenAI):
    response = await client.responses.create(input=[
        {
            "role": "user",
            "content": [{
                "type": "input_text",
                "text": "Hello!"
            }],
        },
    ])
    print(response)
    assert response.status == "completed"
```
New file (137 lines):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio

import openai
import pytest


@pytest.mark.asyncio
async def test_store(client: openai.AsyncOpenAI):
    # By default, store is True.
    response = await client.responses.create(input="Hello!")
    assert response.status == "completed"

    # Retrieve the response.
    response = await client.responses.retrieve(response.id)
    assert response.status == "completed"

    # Test store=False.
    response = await client.responses.create(
        input="Hello!",
        store=False,
    )
    assert response.status == "completed"

    # The response should not be found.
    with pytest.raises(openai.NotFoundError,
                       match="Response with id .* not found."):
        await client.responses.retrieve(response.id)


@pytest.mark.asyncio
async def test_background(client: openai.AsyncOpenAI):
    # NOTE: This query should be easy enough for the model to answer
    # within the 10 seconds.
    response = await client.responses.create(
        input="Hello!",
        background=True,
    )
    assert response.status == "queued"

    max_retries = 10
    for _ in range(max_retries):
        await asyncio.sleep(1)
        response = await client.responses.retrieve(response.id)
        if response.status != "queued":
            break
    print(response)

    assert response.status == "completed"


@pytest.mark.asyncio
async def test_background_error(client: openai.AsyncOpenAI):
    with pytest.raises(
            openai.BadRequestError,
            match="background can only be used when `store` is true"):
        _ = await client.responses.create(
            input="What is 13 * 24?",
            background=True,
            store=False,
        )


@pytest.mark.asyncio
async def test_background_cancel(client: openai.AsyncOpenAI):
    response = await client.responses.create(
        input="Write a long story about a cat.",
        background=True,
    )
    assert response.status == "queued"

    # Cancel the response before it is completed.
    # FIXME: This test can be flaky.
    await asyncio.sleep(0.5)
    response = await client.responses.cancel(response.id)
    assert response.status == "cancelled"

    # Make sure the response status remains unchanged.
    await asyncio.sleep(5)
    response = await client.responses.retrieve(response.id)
    assert response.status == "cancelled"


@pytest.mark.asyncio
async def test_cancel_completed(client: openai.AsyncOpenAI):
    response = await client.responses.create(input="Hello")
    assert response.status == "completed"

    with pytest.raises(openai.BadRequestError,
                       match="Cannot cancel a synchronous response."):
        await client.responses.cancel(response.id)


@pytest.mark.asyncio
async def test_previous_response_id(client: openai.AsyncOpenAI):
    response1 = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.")

    response2 = await client.responses.create(
        input="Actually, my name is not John. My real name is Mark.",
        previous_response_id=response1.id,
    )

    response3 = await client.responses.create(
        input="What is my real name again? Answer in one word.",
        previous_response_id=response2.id,
    )
    print(response3)
    assert "Mark" in response3.output[-1].content[0].text
    assert "John" not in response3.output[-1].content[0].text


@pytest.mark.asyncio
async def test_two_responses_with_same_prev_id(client: openai.AsyncOpenAI):
    response1 = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.")

    # Both response 2 and 3 use response 1 as the previous response.
    response2 = client.responses.create(
        input="Actually, my name is not John. My name is Mark.",
        previous_response_id=response1.id,
    )
    response3 = client.responses.create(
        input="What is my name again? Answer in one word.",
        previous_response_id=response1.id,
    )

    _ = await response2
    response3_result = await response3
    print(response3_result)
    assert "John" in response3_result.output[-1].content[0].text
    assert "Mark" not in response3_result.output[-1].content[0].text
```
tests/v1/entrypoints/openai/responses/test_structured_output.py (92 additions, 0 deletions)
New file (92 lines):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

import openai
import pytest
from pydantic import BaseModel


@pytest.mark.asyncio
async def test_structured_output(client: openai.AsyncOpenAI):
    response = await client.responses.create(
        input=[
            {
                "role": "system",
                "content": "Extract the event information."
            },
            {
                "role": "user",
                "content":
                "Alice and Bob are going to a science fair on Friday.",
            },
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "calendar_event",
                "schema": {
                    "type": "object",
                    "properties": {
                        "event_name": {
                            "type": "string"
                        },
                        "date": {
                            "type": "string"
                        },
                        "participants": {
                            "type": "array",
                            "items": {
                                "type": "string"
                            }
                        },
                    },
                    "required": ["event_name", "date", "participants"],
                    "additionalProperties": False,
                },
                "description": "A calendar event.",
                "strict": True,
            }
        },
    )
    print(response)

    # NOTE: The JSON schema is applied to the output text, not reasoning.
    output_text = response.output[-1].content[0].text
    event = json.loads(output_text)

    assert event["event_name"].lower() == "science fair"
    assert event["date"] == "Friday"
    participants = event["participants"]
    assert len(participants) == 2
    assert participants[0] == "Alice"
    assert participants[1] == "Bob"


@pytest.mark.asyncio
async def test_structured_output_with_parse(client: openai.AsyncOpenAI):

    class CalendarEvent(BaseModel):
        event_name: str
        date: str
        participants: list[str]

    response = await client.responses.parse(
        model=None,
        instructions="Extract the event information.",
        input="Alice and Bob are going to a science fair on Friday.",
        text_format=CalendarEvent,
    )
    print(response)

    # The output is successfully parsed.
    event = response.output_parsed
    assert event is not None

    # The output is correct.
    assert event.event_name.lower() == "science fair"
    assert event.date == "Friday"
    participants = event.participants
    assert len(participants) == 2
    assert participants[0] == "Alice"
    assert participants[1] == "Bob"
```
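Both structured-output tests constrain the model to the same calendar-event shape, once via a raw JSON schema and once via a Pydantic model. The correspondence between the two can be checked offline, with no server involved; a sketch assuming Pydantic v2 (`model_validate_json`), using a hand-written sample payload rather than real model output:

```python
import json
from pydantic import BaseModel


class CalendarEvent(BaseModel):
    event_name: str
    date: str
    participants: list[str]


# A sample payload matching the json_schema format used in the test.
output_text = json.dumps({
    "event_name": "Science Fair",
    "date": "Friday",
    "participants": ["Alice", "Bob"],
})

# Validate the text the same way the parse-based test consumes it.
event = CalendarEvent.model_validate_json(output_text)
print(event.event_name.lower(), event.date, len(event.participants))
# science fair Friday 2
```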
Why do we need `deepseek_r1` here?