Commit 469d4ec

Add "auto" mode, enable test coverage

Signed-off-by: Russell Bryant <rbryant@redhat.com>
1 parent 122da1c commit 469d4ec

File tree

4 files changed: +127 -79 lines changed

tests/v1/entrypoints/llm/test_struct_output_generate.py

Lines changed: 92 additions & 63 deletions
@@ -13,7 +13,7 @@
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
-GUIDED_DECODING_BACKENDS_V1 = ["xgrammar"]
+GUIDED_DECODING_BACKENDS_V1 = ["xgrammar", "guidance"]
 MODELS_TO_TEST = [
     "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
 ]
@@ -30,12 +30,13 @@ def test_guided_json_completion(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=1.0,
-                                     max_tokens=1000,
-                                     guided_decoding=GuidedDecodingParams(
-                                         json=sample_json_schema,
-                                         backend=guided_decoding_backend))
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
     outputs = llm.generate(prompts=[
         f"Give an example JSON for an employee profile "
         f"that fits this schema: {sample_json_schema}"
@@ -67,13 +68,14 @@ def test_guided_json_object(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=1.0,
-                                     max_tokens=100,
-                                     n=2,
-                                     guided_decoding=GuidedDecodingParams(
-                                         json_object=True,
-                                         backend=guided_decoding_backend))
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=100,
+        n=2,
+        guided_decoding=GuidedDecodingParams(json_object=True))
 
     outputs = llm.generate(
         prompts=("Generate a JSON object with curly braces for a person with "
@@ -98,7 +100,7 @@ def test_guided_json_object(
 
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
+                         GUIDED_DECODING_BACKENDS_V1 + ["auto"])
 @pytest.mark.parametrize("model_name", MODELS_TO_TEST)
 def test_guided_json_unsupported_schema(
     monkeypatch: pytest.MonkeyPatch,
@@ -107,21 +109,43 @@ def test_guided_json_unsupported_schema(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=1.0,
-                                     max_tokens=1000,
-                                     guided_decoding=GuidedDecodingParams(
-                                         json=unsupported_json_schema,
-                                         backend=guided_decoding_backend))
-    with pytest.raises(ValueError,
-                       match="The provided JSON schema contains features "
-                       "not supported by xgrammar."):
-        llm.generate(prompts=[
-            f"Give an example JSON for an employee profile "
-            f"that fits this schema: {unsupported_json_schema}"
-        ] * 2,
-                     sampling_params=sampling_params,
-                     use_tqdm=True)
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
+    if guided_decoding_backend == "xgrammar":
+        with pytest.raises(ValueError,
+                           match="The provided JSON schema contains features "
+                           "not supported by xgrammar."):
+            llm.generate(prompts=[
+                f"Give an example JSON for an employee profile "
+                f"that fits this schema: {unsupported_json_schema}"
+            ] * 2,
+                         sampling_params=sampling_params,
+                         use_tqdm=True)
+    else:
+        # This should work for both "guidance" and "auto".
+
+        outputs = llm.generate(
+            prompts=("Give an example JSON object for a grade "
+                     "that fits this schema: "
+                     f"{unsupported_json_schema}"),
+            sampling_params=sampling_params,
+            use_tqdm=True)
+        assert outputs is not None
+        for output in outputs:
+            assert output is not None
+            assert isinstance(output, RequestOutput)
+            generated_text = output.outputs[0].text
+            assert generated_text is not None
+            print(generated_text)
+
+            # Parse to verify it is valid JSON
+            parsed_json = json.loads(generated_text)
+            assert isinstance(parsed_json, dict)
 
 
 @pytest.mark.skip_global_cleanup
@@ -135,13 +159,14 @@ def test_guided_grammar_ebnf(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     max_tokens=1000,
-                                     guided_decoding=GuidedDecodingParams(
-                                         grammar=sample_sql_ebnf,
-                                         backend=guided_decoding_backend))
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
     outputs = llm.generate(
         prompts=("Generate a sql statement that selects col_1 from "
                  "table_1 where it is equal to 1"),
@@ -178,13 +203,14 @@ def test_guided_grammar_lark(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     max_tokens=1000,
-                                     guided_decoding=GuidedDecodingParams(
-                                         grammar=sample_sql_lark,
-                                         backend=guided_decoding_backend))
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
     outputs = llm.generate(
         prompts=("Generate a sql statement that selects col_1 from "
                  "table_1 where it is equal to 1"),
@@ -225,13 +251,14 @@ def test_guided_grammar_ebnf_invalid(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     max_tokens=1000,
-                                     guided_decoding=GuidedDecodingParams(
-                                         grammar="not a grammar",
-                                         backend=guided_decoding_backend))
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
     with pytest.raises(ValueError,
                        match="Failed to convert the grammar "
                        "from Lark to EBNF."):
@@ -254,12 +281,13 @@ def test_guided_regex(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     guided_decoding=GuidedDecodingParams(
-                                         regex=sample_regex,
-                                         backend=guided_decoding_backend))
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
     outputs = llm.generate(
         prompts=[
             f"Give an example IPv4 address with this regex: {sample_regex}"
@@ -291,12 +319,13 @@ def test_guided_choice_completion(
     model_name: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name, max_model_len=1024)
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     guided_decoding=GuidedDecodingParams(
-                                         choice=sample_guided_choice,
-                                         backend=guided_decoding_backend))
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend=guided_decoding_backend)
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
     outputs = llm.generate(
         prompts="The best language for type-safe systems programming is ",
         sampling_params=sampling_params,
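For reference, a minimal usage sketch of the pattern these tests now exercise: the backend is chosen once on the LLM, and per-request GuidedDecodingParams no longer carry a backend. This is a hedged example, not code from the commit; the model name and schema are illustrative stand-ins for the test fixtures.

# Hedged usage sketch; model and schema are illustrative only.
import os

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

os.environ["VLLM_USE_V1"] = "1"

# The backend is now selected once, at engine construction time.
# "auto" lets the engine pick xgrammar when the request is supported
# and fall back to guidance otherwise.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
          max_model_len=1024,
          guided_decoding_backend="auto")

schema = {"type": "object", "properties": {"name": {"type": "string"}}}
sampling_params = SamplingParams(
    temperature=1.0,
    max_tokens=100,
    guided_decoding=GuidedDecodingParams(json=schema))

outputs = llm.generate(
    prompts=[f"Give an example JSON that fits this schema: {schema}"],
    sampling_params=sampling_params)
print(outputs[0].outputs[0].text)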

vllm/config.py

Lines changed: 1 addition & 1 deletion
@@ -2788,7 +2788,7 @@ def __post_init__(self):
         v0_valid_guided_backends = [
             'outlines', 'lm-format-enforcer', 'xgrammar'
         ]
-        v1_valid_guided_backends = ['xgrammar', 'guidance']
+        v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']
 
         backend = GuidedDecodingParams(
             backend=self.guided_decoding_backend).backend_name

vllm/engine/arg_utils.py

Lines changed: 9 additions & 12 deletions
@@ -382,16 +382,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default='xgrammar',
             help='Which engine will be used for guided decoding'
             ' (JSON schema / regex etc) by default. Currently support '
-            'https://github.com/outlines-dev/outlines, '
-            'https://github.com/mlc-ai/xgrammar, and '
-            'https://github.com/noamgat/lm-format-enforcer.'
-            ' Can be overridden per request via guided_decoding_backend'
-            ' parameter.\n'
-            'Backend-specific options can be supplied in a comma-separated '
-            'list following a colon after the backend name. Valid backends and '
-            'all available options are: [xgrammar:no-fallback, '
-            'xgrammar:disable-any-whitespace, '
-            'outlines:no-fallback, lm-format-enforcer:no-fallback]')
+            'https://github.com/mlc-ai/xgrammar and '
+            'https://github.com/guidance-ai/llguidance. '
+            'Valid backend values are "xgrammar", "guidance", and "auto". '
+            'With "auto", we will make opinionated choices based on request '
+            'contents and what the backend libraries currently support, so '
+            'the behavior is subject to change in each release. '
+            'The default is xgrammar.')
         parser.add_argument(
             '--logits-processor-pattern',
             type=nullable_str,
@@ -1461,8 +1458,8 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                                recommend_to_remove=False)
             return False
 
-        # Only support Xgrammar for guided decoding so far.
-        SUPPORTED_GUIDED_DECODING = ["xgrammar", "xgrammar:nofallback"]
+        # Support xgrammar, guidance, or an opinionated automatic mode.
+        SUPPORTED_GUIDED_DECODING = ["xgrammar", "guidance", "auto"]
         if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING:
             _raise_or_fallback(feature_name="--guided-decoding-backend",
                                recommend_to_remove=False)
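The same choice is available programmatically. Below is a sketch, under the assumption that EngineArgs exposes the field the flag above populates (the model name is an arbitrary example); on the command line the equivalent is "vllm serve <model> --guided-decoding-backend auto".

# Sketch only: EngineArgs mirrors the --guided-decoding-backend CLI flag.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # example model
    guided_decoding_backend="auto",  # "xgrammar", "guidance", or "auto"
)
# With VLLM_USE_V1=1, other values are rejected (or fall back to V0)
# by the support oracle shown above.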

vllm/v1/engine/processor.py

Lines changed: 25 additions & 3 deletions
@@ -121,7 +121,16 @@ def _validate_structured_output(self, params: SamplingParams) -> None:
         if not params.guided_decoding or not self.decoding_config:
             return
 
-        supported_backends = ["xgrammar", "guidance"]
+        # Platform validation
+        if vllm.platforms.current_platform.is_tpu():
+            raise ValueError("Structured output is not supported on TPU.")
+
+        # Backend validation
+        # - ensure the backend is supported in v1
+        # - if a backend was included in the request, ensure it matches
+        #   the backend configured for the engine. We don't support changing
+        #   the backend per request in V1.
+        supported_backends = ["auto", "xgrammar", "guidance"]
         engine_level_backend = self.decoding_config.guided_decoding_backend
         if engine_level_backend not in supported_backends:
             raise ValueError(f"Only {supported_backends} structured output is "
@@ -135,11 +144,24 @@ def _validate_structured_output(self, params: SamplingParams) -> None:
         else:
             params.guided_decoding.backend = engine_level_backend
 
-        if vllm.platforms.current_platform.is_tpu():
-            raise ValueError("Structured output is not supported on TPU.")
+        # Request content validation
 
         if engine_level_backend == "xgrammar":
+            # xgrammar with no fallback
             validate_structured_output_request_xgrammar(params)
+            params.guided_decoding.backend = "xgrammar"
+        elif engine_level_backend == "auto":
+            # "auto" is an opt-in to opinionated behavior where we try to
+            # choose a backend based on request contents. This is not the
+            # default as it is less predictable and subject to change
+            # between releases as feature support changes.
+            try:
+                validate_structured_output_request_xgrammar(params)
+                params.guided_decoding.backend = "xgrammar"
+            except ValueError:
+                # The request includes some jsonschema feature(s) that
+                # are not supported in xgrammar. Fall back to guidance.
+                params.guided_decoding.backend = "guidance"
 
     def process_inputs(
         self,
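In short, "auto" keys off the same validator that xgrammar's no-fallback path uses. An illustrative, self-contained restatement of the selection pattern follows; the names here are hypothetical, not the vLLM internals.

# Sketch of the "auto" selection pattern above. choose_backend and
# xgrammar_validator are hypothetical names used for illustration.
def choose_backend(request_params, xgrammar_validator) -> str:
    try:
        # The validator raises ValueError when the request uses
        # features xgrammar does not support.
        xgrammar_validator(request_params)
        return "xgrammar"
    except ValueError:
        # Fall back to the more permissive guidance backend.
        return "guidance"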
