Skip to content

Commit 69068cd

Browse files
committed
chore: finalize cleanup from v0
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
1 parent e188592 commit 69068cd

39 files changed

+357
-630
lines changed

benchmarks/benchmark_serving_structured_output.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -696,11 +696,11 @@ def _eval_correctness_regex(expected, actual):
696696
return re.match(args.regex, actual) is not None
697697

698698
def _eval_correctness(expected, actual):
699-
if args.structure_type == "guided_json":
699+
if args.structure_type == "json":
700700
return _eval_correctness_json(expected, actual)
701-
elif args.structure_type == "guided_regex":
701+
elif args.structure_type == "regex":
702702
return _eval_correctness_regex(expected, actual)
703-
elif args.structure_type == "guided_choice":
703+
elif args.structure_type == "choice":
704704
return _eval_correctness_choice(expected, actual)
705705
else:
706706
return None
@@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
780780
)
781781

782782
if args.dataset == "grammar":
783-
args.structure_type = "guided_grammar"
783+
args.structure_type = "grammar"
784784
elif args.dataset == "regex":
785-
args.structure_type = "guided_regex"
785+
args.structure_type = "regex"
786786
elif args.dataset == "choice":
787-
args.structure_type = "guided_choice"
787+
args.structure_type = "choice"
788788
else:
789-
args.structure_type = "guided_json"
789+
args.structure_type = "json"
790790

791791
if args.no_structured_output:
792792
args.structured_output_ratio = 0
793793
if args.save_results:
794-
result_file_name = f"{args.structured_output_ratio}guided"
794+
result_file_name = f"{args.structured_output_ratio}so"
795795
result_file_name += f"_{backend}"
796796
result_file_name += f"_{args.request_rate}qps"
797797
result_file_name += f"_{args.model.split('/')[-1]}"

docs/features/reasoning_outputs.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
---
2+
title: reasoning_outputs
3+
---
4+
15
# Reasoning Outputs
26

37
vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
@@ -10,11 +14,11 @@ vLLM currently supports the following reasoning models:
1014

1115
| Model Series | Parser Name | Structured Output Support | Tool Calling |
1216
|--------------|-------------|------------------|-------------|
13-
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` ||
14-
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` ||
17+
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` ||
18+
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` ||
1519
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` |||
16-
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` ||
17-
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` ||
20+
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` ||
21+
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` ||
1822

1923
!!! note
2024
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.

docs/features/structured_outputs.md

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
1212

1313
The following parameters are supported, which must be added as extra parameters:
1414

15-
- `guided_choice`: the output will be exactly one of the choices.
16-
- `guided_regex`: the output will follow the regex pattern.
17-
- `guided_json`: the output will follow the JSON schema.
18-
- `guided_grammar`: the output will follow the context free grammar.
15+
- `choice`: the output will be exactly one of the choices.
16+
- `regex`: the output will follow the regex pattern.
17+
- `json`: the output will follow the JSON schema.
18+
- `grammar`: the output will follow the context free grammar.
1919
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
2020

2121
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
2222

2323
Structured outputs are supported by default in the OpenAI-Compatible Server. You
2424
may choose to specify the backend to use by setting the
25-
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
25+
`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
2626
which will try to choose an appropriate backend based on the details of the
2727
request. You may also choose a specific backend, along with
2828
some options. A full set of options is available in the `vllm serve --help`
2929
text.
3030

31-
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
31+
Now let's see an example for each of the cases, starting with the `choice`, as it's the easiest one:
3232

3333
??? code
3434

@@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
4545
messages=[
4646
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
4747
],
48-
extra_body={"guided_choice": ["positive", "negative"]},
48+
extra_body={"structured_outputs": {"choices": ["positive", "negative"]}},
4949
)
5050
print(completion.choices[0].message.content)
5151
```
5252

53-
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
53+
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
5454

5555
??? code
5656

@@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
6363
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
6464
}
6565
],
66-
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
66+
extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
6767
)
6868
print(completion.choices[0].message.content)
6969
```
7070

7171
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
72-
For this we can use the `guided_json` parameter in two different ways:
72+
For this we can use the `json` parameter in two different ways:
7373

7474
- Using directly a [JSON Schema](https://json-schema.org/)
7575
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
7676

77-
The next example shows how to use the `guided_json` parameter with a Pydantic model:
77+
The next example shows how to use the `response_format` parameter with a Pydantic model:
7878

7979
??? code
8080

@@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
119119
JSON schema and how the fields should be populated. This can improve the
120120
results notably in most cases.
121121

122-
Finally we have the `guided_grammar` option, which is probably the most
122+
Finally we have the `grammar` option, which is probably the most
123123
difficult to use, but it's really powerful. It allows us to define complete
124124
languages like SQL queries. It works by using a context free EBNF grammar.
125125
As an example, we can use it to define a specific format of simplified SQL queries:
@@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
149149
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
150150
}
151151
],
152-
extra_body={"guided_grammar": simplified_sql_grammar},
152+
extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
153153
)
154154
print(completion.choices[0].message.content)
155155
```
@@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
292292
## Offline Inference
293293

294294
Offline inference allows for the same types of structured outputs.
295-
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
296-
The main available options inside `GuidedDecodingParams` are:
295+
To use it, we'll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
296+
The main available options inside `StructuredOutputsParams` are:
297297

298298
- `json`
299299
- `regex`
@@ -309,12 +309,12 @@ shown below:
309309

310310
```python
311311
from vllm import LLM, SamplingParams
312-
from vllm.sampling_params import GuidedDecodingParams
312+
from vllm.sampling_params import StructuredOutputsParams
313313

314314
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
315315

316-
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
317-
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
316+
structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
317+
sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
318318
outputs = llm.generate(
319319
prompts="Classify this sentiment: vLLM is wonderful!",
320320
sampling_params=sampling_params,

docs/features/tool_calling.md

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ This example demonstrates:
7171
* Making a request with `tool_choice="auto"`
7272
* Handling the structured response and executing the corresponding function
7373

74-
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
74+
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
7575

7676
Remember that it's the caller's responsibility to:
7777

@@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
8383

8484
## Named Function Calling
8585

86-
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
87-
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
86+
vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
8887
high-quality one.
8988

90-
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
91-
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
89+
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
90+
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
9291

9392
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
9493
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
9594

9695
## Required Function Calling
9796

98-
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
97+
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends is on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
9998

10099
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
101100

docs/serving/openai_compatible_server.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ completion = client.chat.completions.create(
133133
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
134134
],
135135
extra_body={
136-
"guided_choice": ["positive", "negative"]
136+
"structured_outputs": {"choices": ["positive", "negative"]}
137137
}
138138
)
139139
```
@@ -374,7 +374,7 @@ The following extra parameters are supported:
374374
```python
375375
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
376376
```
377-
377+
378378
[](){ #translations-api }
379379

380380
### Translations API

examples/offline_inference/structured_outputs.py

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,33 @@
1+
# ruff: noqa: E501
12
# SPDX-License-Identifier: Apache-2.0
23
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
34
"""
4-
This file demonstrates the example usage of guided decoding
5-
to generate structured outputs using vLLM. It shows how to apply
6-
different guided decoding techniques such as Choice, Regex, JSON schema,
7-
and Grammar to produce structured and formatted results
8-
based on specific prompts.
5+
This file demonstrates the example usage of structured outputs
6+
in vLLM. It shows how to apply different constraints such as choice,
7+
regex, json schema, and grammar to produce structured and formatted
8+
results based on specific prompts.
99
"""
1010

1111
from enum import Enum
1212

1313
from pydantic import BaseModel
1414

1515
from vllm import LLM, SamplingParams
16-
from vllm.sampling_params import GuidedDecodingParams
16+
from vllm.sampling_params import StructuredOutputsParams
1717

18-
# Guided decoding by Choice (list of possible options)
19-
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
20-
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
18+
# Structured outputs by Choice (list of possible options)
19+
structured_outputs_params_choice = StructuredOutputsParams(
20+
choice=["Positive", "Negative"]
21+
)
22+
sampling_params_choice = SamplingParams(
23+
structured_outputs=structured_outputs_params_choice
24+
)
2125
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
2226

23-
# Guided decoding by Regex
24-
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
27+
# Structured outputs by Regex
28+
structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
2529
sampling_params_regex = SamplingParams(
26-
guided_decoding=guided_decoding_params_regex, stop=["\n"]
30+
structured_outputs=structured_outputs_params_regex, stop=["\n"]
2731
)
2832
prompt_regex = (
2933
"Generate an email address for Alan Turing, who works in Enigma."
@@ -32,7 +36,7 @@
3236
)
3337

3438

35-
# Guided decoding by JSON using Pydantic schema
39+
# Structured outputs by JSON using Pydantic schema
3640
class CarType(str, Enum):
3741
sedan = "sedan"
3842
suv = "SUV"
@@ -47,14 +51,11 @@ class CarDescription(BaseModel):
4751

4852

4953
json_schema = CarDescription.model_json_schema()
50-
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
51-
sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json)
52-
prompt_json = (
53-
"Generate a JSON with the brand, model and car_type of"
54-
"the most iconic car from the 90's"
55-
)
54+
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
55+
sampling_params_json = SamplingParams(structured_outputs=structured_outputs_params_json)
56+
prompt_json = "Generate a JSON with the brand, model and car_type ofthe most iconic car from the 90's"
5657

57-
# Guided decoding by Grammar
58+
# Structured outputs by Grammar
5859
simplified_sql_grammar = """
5960
root ::= select_statement
6061
select_statement ::= "SELECT " column " from " table " where " condition
@@ -63,8 +64,12 @@ class CarDescription(BaseModel):
6364
condition ::= column "= " number
6465
number ::= "1 " | "2 "
6566
"""
66-
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
67-
sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar)
67+
structured_outputs_params_grammar = StructuredOutputsParams(
68+
grammar=simplified_sql_grammar
69+
)
70+
sampling_params_grammar = SamplingParams(
71+
structured_outputs=structured_outputs_params_grammar
72+
)
6873
prompt_grammar = (
6974
"Generate an SQL query to show the 'username' and 'email'from the 'users' table."
7075
)
@@ -83,16 +88,16 @@ def main():
8388
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
8489

8590
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
86-
format_output("Guided decoding by Choice", choice_output)
91+
format_output("Structured outputs by Choice", choice_output)
8792

8893
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
89-
format_output("Guided decoding by Regex", regex_output)
94+
format_output("Structured outputs by Regex", regex_output)
9095

9196
json_output = generate_output(prompt_json, sampling_params_json, llm)
92-
format_output("Guided decoding by JSON", json_output)
97+
format_output("Structured outputs by JSON", json_output)
9398

9499
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
95-
format_output("Guided decoding by Grammar", grammar_output)
100+
format_output("Structured outputs by Grammar", grammar_output)
96101

97102

98103
if __name__ == "__main__":

examples/online_serving/openai_chat_completion_client_with_tools_required.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
77
```bash
88
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
9-
--guided-decoding-backend outlines
9+
--structured-outputs-config.backend outlines
1010
```
1111
1212
This example demonstrates how to generate chat completions

examples/online_serving/structured_outputs/structured_outputs.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class CarDescription(pydantic.BaseModel):
8686
"content": "Classify this sentiment: vLLM is wonderful!",
8787
}
8888
],
89-
"extra_body": {"guided_choice": ["positive", "negative"]},
89+
"extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
9090
},
9191
"regex": {
9292
"messages": [
@@ -96,7 +96,7 @@ class CarDescription(pydantic.BaseModel):
9696
}
9797
],
9898
"extra_body": {
99-
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
99+
"structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
100100
},
101101
},
102102
"json": {
@@ -122,7 +122,8 @@ class CarDescription(pydantic.BaseModel):
122122
}
123123
],
124124
"extra_body": {
125-
"guided_grammar": """
125+
"structured_outputs": {
126+
"grammar": """
126127
root ::= select_statement
127128
128129
select_statement ::= "SELECT " column " from " table " where " condition
@@ -135,6 +136,7 @@ class CarDescription(pydantic.BaseModel):
135136
136137
number ::= "1 " | "2 "
137138
""",
139+
}
138140
},
139141
},
140142
"structural_tag": {

tests/async_engine/test_async_llm_engine.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ async def test_new_requests_event():
128128
engine = MockAsyncLLMEngine()
129129
assert engine.get_model_config() is not None
130130
assert engine.get_tokenizer() is not None
131-
assert engine.get_decoding_config() is not None
132131

133132

134133
def start_engine():

tests/entrypoints/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ def sample_enum_json_schema():
184184

185185

186186
@pytest.fixture
187-
def sample_guided_choice():
187+
def sample_choices():
188188
return [
189189
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
190190
"Ruby", "Swift", "Kotlin"

0 commit comments

Comments
 (0)