Skip to content

Commit 69068cd

Browse files
committed
chore: finalize cleanup from v0
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
1 parent e188592 commit 69068cd

39 files changed

+357
-630
lines changed

benchmarks/benchmark_serving_structured_output.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -696,11 +696,11 @@ def _eval_correctness_regex(expected, actual):
696696
return re.match(args.regex, actual) is not None
697697

698698
def _eval_correctness(expected, actual):
699-
if args.structure_type == "guided_json":
699+
if args.structure_type == "json":
700700
return _eval_correctness_json(expected, actual)
701-
elif args.structure_type == "guided_regex":
701+
elif args.structure_type == "regex":
702702
return _eval_correctness_regex(expected, actual)
703-
elif args.structure_type == "guided_choice":
703+
elif args.structure_type == "choice":
704704
return _eval_correctness_choice(expected, actual)
705705
else:
706706
return None
@@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
780780
)
781781

782782
if args.dataset == "grammar":
783-
args.structure_type = "guided_grammar"
783+
args.structure_type = "grammar"
784784
elif args.dataset == "regex":
785-
args.structure_type = "guided_regex"
785+
args.structure_type = "regex"
786786
elif args.dataset == "choice":
787-
args.structure_type = "guided_choice"
787+
args.structure_type = "choice"
788788
else:
789-
args.structure_type = "guided_json"
789+
args.structure_type = "json"
790790

791791
if args.no_structured_output:
792792
args.structured_output_ratio = 0
793793
if args.save_results:
794-
result_file_name = f"{args.structured_output_ratio}guided"
794+
result_file_name = f"{args.structured_output_ratio}so"
795795
result_file_name += f"_{backend}"
796796
result_file_name += f"_{args.request_rate}qps"
797797
result_file_name += f"_{args.model.split('/')[-1]}"

docs/features/reasoning_outputs.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
---
2+
title: reasoning_outputs
3+
---
4+
15
# Reasoning Outputs
26

37
vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
@@ -10,11 +14,11 @@ vLLM currently supports the following reasoning models:
1014

1115
| Model Series | Parser Name | Structured Output Support | Tool Calling |
1216
|--------------|-------------|------------------|-------------|
13-
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` ||
14-
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` ||
17+
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` ||
18+
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` ||
1519
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` |||
16-
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` ||
17-
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` ||
20+
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` ||
21+
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` ||
1822

1923
!!! note
2024
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.

docs/features/structured_outputs.md

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
1212

1313
The following parameters are supported, which must be added as extra parameters:
1414

15-
- `guided_choice`: the output will be exactly one of the choices.
16-
- `guided_regex`: the output will follow the regex pattern.
17-
- `guided_json`: the output will follow the JSON schema.
18-
- `guided_grammar`: the output will follow the context free grammar.
15+
- `choice`: the output will be exactly one of the choices.
16+
- `regex`: the output will follow the regex pattern.
17+
- `json`: the output will follow the JSON schema.
18+
- `grammar`: the output will follow the context free grammar.
1919
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
2020

2121
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
2222

2323
Structured outputs are supported by default in the OpenAI-Compatible Server. You
2424
may choose to specify the backend to use by setting the
25-
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
25+
`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
2626
which will try to choose an appropriate backend based on the details of the
2727
request. You may also choose a specific backend, along with
2828
some options. A full set of options is available in the `vllm serve --help`
2929
text.
3030

31-
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
31+
Now let's see an example for each of the cases, starting with the `choice`, as it's the easiest one:
3232

3333
??? code
3434

@@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
4545
messages=[
4646
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
4747
],
48-
extra_body={"guided_choice": ["positive", "negative"]},
48+
extra_body={"structured_outputs": {"choices": ["positive", "negative"]}},
4949
)
5050
print(completion.choices[0].message.content)
5151
```
5252

53-
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
53+
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
5454

5555
??? code
5656

@@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
6363
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
6464
}
6565
],
66-
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
66+
extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
6767
)
6868
print(completion.choices[0].message.content)
6969
```
7070

7171
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
72-
For this we can use the `guided_json` parameter in two different ways:
72+
For this we can use the `json` parameter in two different ways:
7373

7474
- Using directly a [JSON Schema](https://json-schema.org/)
7575
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
7676

77-
The next example shows how to use the `guided_json` parameter with a Pydantic model:
77+
The next example shows how to use the `response_format` parameter with a Pydantic model:
7878

7979
??? code
8080

@@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
119119
JSON schema and how the fields should be populated. This can improve the
120120
results notably in most cases.
121121

122-
Finally we have the `guided_grammar` option, which is probably the most
122+
Finally we have the `grammar` option, which is probably the most
123123
difficult to use, but it's really powerful. It allows us to define complete
124124
languages like SQL queries. It works by using a context free EBNF grammar.
125125
As an example, we can use it to define a specific format of simplified SQL queries:
@@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
149149
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
150150
}
151151
],
152-
extra_body={"guided_grammar": simplified_sql_grammar},
152+
extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
153153
)
154154
print(completion.choices[0].message.content)
155155
```
@@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
292292
## Offline Inference
293293

294294
Offline inference allows for the same types of structured outputs.
295-
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
296-
The main available options inside `GuidedDecodingParams` are:
295+
To use it, we'll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
296+
The main available options inside `StructuredOutputsParams` are:
297297

298298
- `json`
299299
- `regex`
@@ -309,12 +309,12 @@ shown below:
309309

310310
```python
311311
from vllm import LLM, SamplingParams
312-
from vllm.sampling_params import GuidedDecodingParams
312+
from vllm.sampling_params import StructuredOutputsParams
313313

314314
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
315315

316-
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
317-
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
316+
structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
317+
sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
318318
outputs = llm.generate(
319319
prompts="Classify this sentiment: vLLM is wonderful!",
320320
sampling_params=sampling_params,

docs/features/tool_calling.md

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ This example demonstrates:
7171
* Making a request with `tool_choice="auto"`
7272
* Handling the structured response and executing the corresponding function
7373

74-
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
74+
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
7575

7676
Remember that it's the caller's responsibility to:
7777

@@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
8383

8484
## Named Function Calling
8585

86-
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
87-
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
86+
vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
8887
high-quality one.
8988

90-
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
91-
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
89+
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
90+
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
9291

9392
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
9493
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
9594

9695
## Required Function Calling
9796

98-
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
97+
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends is on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
9998

10099
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
101100

docs/serving/openai_compatible_server.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ completion = client.chat.completions.create(
133133
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
134134
],
135135
extra_body={
136-
"guided_choice": ["positive", "negative"]
136+
"structured_outputs": {"choices": ["positive", "negative"]}
137137
}
138138
)
139139
```
@@ -374,7 +374,7 @@ The following extra parameters are supported:
374374
```python
375375
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
376376
```
377-
377+
378378
[](){ #translations-api }
379379

380380
### Translations API

examples/offline_inference/structured_outputs.py

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,33 @@
1+
# ruff: noqa: E501
12
# SPDX-License-Identifier: Apache-2.0
23
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
34
"""
4-
This file demonstrates the example usage of guided decoding
5-
to generate structured outputs using vLLM. It shows how to apply
6-
different guided decoding techniques such as Choice, Regex, JSON schema,
7-
and Grammar to produce structured and formatted results
8-
based on specific prompts.
5+
This file demonstrates the example usage of structured outputs
6+
in vLLM. It shows how to apply different constraints such as choice,
7+
regex, json schema, and grammar to produce structured and formatted
8+
results based on specific prompts.
99
"""
1010

1111
from enum import Enum
1212

1313
from pydantic import BaseModel
1414

1515
from vllm import LLM, SamplingParams
16-
from vllm.sampling_params import GuidedDecodingParams
16+
from vllm.sampling_params import StructuredOutputsParams
1717

18-
# Guided decoding by Choice (list of possible options)
19-
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
20-
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
18+
# Structured outputs by Choice (list of possible options)
19+
structured_outputs_params_choice = StructuredOutputsParams(
20+
choice=["Positive", "Negative"]
21+
)
22+
sampling_params_choice = SamplingParams(
23+
structured_outputs=structured_outputs_params_choice
24+
)
2125
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
2226

23-
# Guided decoding by Regex
24-
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
27+
# Structured outputs by Regex
28+
structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
2529
sampling_params_regex = SamplingParams(
26-
guided_decoding=guided_decoding_params_regex, stop=["\n"]
30+
structured_outputs=structured_outputs_params_regex, stop=["\n"]
2731
)
2832
prompt_regex = (
2933
"Generate an email address for Alan Turing, who works in Enigma."
@@ -32,7 +36,7 @@
3236
)
3337

3438

35-
# Guided decoding by JSON using Pydantic schema
39+
# Structured outputs by JSON using Pydantic schema
3640
class CarType(str, Enum):
3741
sedan = "sedan"
3842
suv = "SUV"
@@ -47,14 +51,11 @@ class CarDescription(BaseModel):
4751

4852

4953
json_schema = CarDescription.model_json_schema()
50-
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
51-
sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json)
52-
prompt_json = (
53-
"Generate a JSON with the brand, model and car_type of"
54-
"the most iconic car from the 90's"
55-
)
54+
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
55+
sampling_params_json = SamplingParams(structured_outputs=structured_outputs_params_json)
56+
prompt_json = "Generate a JSON with the brand, model and car_type ofthe most iconic car from the 90's"
5657

57-
# Guided decoding by Grammar
58+
# Structured outputs by Grammar
5859
simplified_sql_grammar = """
5960
root ::= select_statement
6061
select_statement ::= "SELECT " column " from " table " where " condition
@@ -63,8 +64,12 @@ class CarDescription(BaseModel):
6364
condition ::= column "= " number
6465
number ::= "1 " | "2 "
6566
"""
66-
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
67-
sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar)
67+
structured_outputs_params_grammar = StructuredOutputsParams(
68+
grammar=simplified_sql_grammar
69+
)
70+
sampling_params_grammar = SamplingParams(
71+
structured_outputs=structured_outputs_params_grammar
72+
)
6873
prompt_grammar = (
6974
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
7075
)
@@ -83,16 +88,16 @@ def main():
8388
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
8489

8590
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
86-
format_output("Guided decoding by Choice", choice_output)
91+
format_output("Structured outputs by Choice", choice_output)
8792

8893
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
89-
format_output("Guided decoding by Regex", regex_output)
94+
format_output("Structured outputs by Regex", regex_output)
9095

9196
json_output = generate_output(prompt_json, sampling_params_json, llm)
92-
format_output("Guided decoding by JSON", json_output)
97+
format_output("Structured outputs by JSON", json_output)
9398

9499
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
95-
format_output("Guided decoding by Grammar", grammar_output)
100+
format_output("Structured outputs by Grammar", grammar_output)
96101

97102

98103
if __name__ == "__main__":

examples/online_serving/openai_chat_completion_client_with_tools_required.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
77
```bash
88
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
9-
--guided-decoding-backend outlines
9+
--structured-outputs-config.backend outlines
1010
```
1111
1212
This example demonstrates how to generate chat completions

examples/online_serving/structured_outputs/structured_outputs.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class CarDescription(pydantic.BaseModel):
8686
"content": "Classify this sentiment: vLLM is wonderful!",
8787
}
8888
],
89-
"extra_body": {"guided_choice": ["positive", "negative"]},
89+
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
9090
},
9191
"regex": {
9292
"messages": [
@@ -96,7 +96,7 @@ class CarDescription(pydantic.BaseModel):
9696
}
9797
],
9898
"extra_body": {
99-
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
99+
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
100100
},
101101
},
102102
"json": {
@@ -122,7 +122,8 @@ class CarDescription(pydantic.BaseModel):
122122
}
123123
],
124124
"extra_body": {
125-
"guided_grammar": """
125+
"structured_outputs": {
126+
"grammar": """
126127
root ::= select_statement
127128
128129
select_statement ::= "SELECT " column " from " table " where " condition
@@ -135,6 +136,7 @@ class CarDescription(pydantic.BaseModel):
135136
136137
number ::= "1 " | "2 "
137138
""",
139+
}
138140
},
139141
},
140142
"structural_tag": {

tests/async_engine/test_async_llm_engine.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ async def test_new_requests_event():
128128
engine = MockAsyncLLMEngine()
129129
assert engine.get_model_config() is not None
130130
assert engine.get_tokenizer() is not None
131-
assert engine.get_decoding_config() is not None
132131

133132

134133
def start_engine():

tests/entrypoints/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def sample_enum_json_schema():
184184

185185

186186
@pytest.fixture
187-
def sample_guided_choice():
187+
def sample_choices():
188188
return [
189189
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
190190
"Ruby", "Swift", "Kotlin"

0 commit comments

Comments
 (0)