NVIDIA · mayani-nv · Jul 18, 2025 · Jul 18, 2025 · Jul 19, 2025 · Jul 21, 2025
diff --git a/tensorrt_llm/serve/openai_protocol.py b/tensorrt_llm/serve/openai_protocol.py
@@ -52,10 +52,11 @@ class StructuralTag(OpenAIBaseModel):
 
 
 class ResponseFormat(OpenAIBaseModel):
-    # type must be "json_object" or "text" or "structural_tag"
-    type: Literal["text", "json_object", "structural_tag"]
+    # type must be one of "text", "json_object", "structural_tag" or "json_schema"
+    type: Literal["text", "json_object", "structural_tag", "json_schema"]
     structures: Optional[List[StructuralTag]] = None
     triggers: Optional[List[str]] = None
+    # Schema passed to guided decoding when type is "json_schema".
+    json_schema: Optional[Dict[str, Any]] = None
 
 
 class DisaggregatedParams(OpenAIBaseModel):
@@ -144,6 +145,8 @@ def _response_format_to_guided_decoding_params(
         return None
     elif response_format.type == "json_object":
         return GuidedDecodingParams(json_object=True)
+    elif response_format.type == "json_schema":
+        return GuidedDecodingParams(json=response_format.json_schema)
     elif response_format.type == "structural_tag":
         return GuidedDecodingParams(
             structural_tag=response_format.model_dump_json(by_alias=True,
@@ -205,7 +208,7 @@ class CompletionRequest(OpenAIBaseModel):
         default=None,
         description=
         ("Similar to chat completion, this parameter specifies the format of "
-         "output. {'type': 'json_object'}, {'type': 'text' }, {'type': 'structural_tag'} are "
+         "output. {'type': 'json_object'}, {'type': 'text' }, {'type': 'structural_tag'}, {'type': 'json_schema'} are "
          "supported."),
     )
 

diff --git a/tests/unittest/llmapi/apps/_test_openai_json_schema.py b/tests/unittest/llmapi/apps/_test_openai_json_schema.py
@@ -0,0 +1,103 @@
+import os
+import tempfile
+
+import openai
+import pytest
+import yaml
+from pydantic import BaseModel, Field
+
+from ..test_llm import get_model_path
+from .openai_server import RemoteOpenAIServer
+
+pytestmark = pytest.mark.threadleak(enabled=False)
+
+
+@pytest.fixture(scope="module")
+def model_name():
+    """Return the model identifier used throughout this module."""
+    return "llama-3.1-model/Llama-3.1-8B-Instruct"
+
+
+@pytest.fixture(scope="module")
+def temp_extra_llm_api_options_file(request):
+    """Yield the path of a temporary YAML file with extra LLM API options.
+
+    The file contains ``guided_decoding_backend: xgrammar`` and is removed
+    again on fixture teardown.
+    """
+    # mkstemp creates a uniquely named file; a fixed name in the shared temp
+    # directory could be overwritten or deleted by another run using it.
+    fd, temp_file_path = tempfile.mkstemp(prefix="extra_llm_api_options_",
+                                          suffix=".yaml")
+    try:
+        extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
+
+        # fdopen takes ownership of the descriptor and closes it on exit.
+        with os.fdopen(fd, 'w') as f:
+            yaml.dump(extra_llm_api_options_dict, f)
+
+        yield temp_file_path
+    finally:
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+
+
+@pytest.fixture(scope="module")
+def server(model_name: str, temp_extra_llm_api_options_file: str):
+    """Yield a RemoteOpenAIServer started for the module's model.
+
+    The server is given the pytorch backend and the extra LLM API options
+    file; its context manager is exited on fixture teardown.
+    """
+    launch_args = ["--backend", "pytorch"]
+    launch_args.extend(
+        ["--extra_llm_api_options", temp_extra_llm_api_options_file])
+    resolved_path = get_model_path(model_name)
+    with RemoteOpenAIServer(resolved_path, launch_args) as remote_server:
+        yield remote_server
+
+
+@pytest.fixture(scope="module")
+def client(server: RemoteOpenAIServer):
+    """Return the client that ``server.get_client()`` provides."""
+    api_client = server.get_client()
+    return api_client
+
+
+@pytest.fixture(scope="module")
+def async_client(server: RemoteOpenAIServer):
+    """Return the client that ``server.get_async_client()`` provides."""
+    api_client = server.get_async_client()
+    return api_client
+
+
+@pytest.fixture(scope="module")
+def capital_info_model():
+    """Return a pydantic model class describing a capital city.
+
+    The class has a required ``name`` string restricted to word characters and
+    a required integer ``population``.
+    """
+    name_field = Field(...,
+                       pattern=r"^\w+$",
+                       description="The name of the capital city")
+    population_field = Field(
+        ..., description="The population of the capital city")
+
+    # Deliberately no docstring on the model: pydantic would copy it into the
+    # generated JSON schema as a "description".
+    class CapitalInfo(BaseModel):
+        name: str = name_field
+        population: int = population_field
+
+    return CapitalInfo
+
+
+def test_chat_json_schema(client: openai.OpenAI, model_name: str,
+                          capital_info_model):
+    """Check that a ``json_schema`` response format constrains chat output.
+
+    Sends one chat request whose ``response_format`` carries the JSON schema
+    of the ``capital_info_model`` class, then validates the reply against that
+    same class.
+    """
+    CapitalInfo = capital_info_model
+    messages = [{
+        "role":
+        "user",
+        "content":
+        "Please generate the information of the capital of France in the JSON format. ",
+    }, ]
+
+    chat_completion = client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        response_format={
+            "type": "json_schema",
+            "json_schema": CapitalInfo.model_json_schema(),
+        },
+        # Temperature 0 keeps the exact-match assertion on the city name
+        # below from depending on random sampling.
+        temperature=0.0,
+        max_completion_tokens=100,
+    )
+
+    assert chat_completion.id is not None
+    assert len(chat_completion.choices) == 1
+    message = chat_completion.choices[0].message
+    assert message.content is not None
+    assert message.role == "assistant"
+
+    # Raises if the content is not valid JSON matching the schema.
+    capital_info = CapitalInfo.model_validate_json(message.content)
+
+    assert isinstance(capital_info, CapitalInfo)
+    assert capital_info.name == "Paris"
+    assert isinstance(capital_info.population, int)
+    assert capital_info.population > 0