diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index b179dc3b4747..c243d81e7f18 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -386,13 +386,21 @@ def test_structured_output_auto_mode(
         max_tokens=1000,
         guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
 
+    prompts = ("Give an example JSON object for a grade "
+               "that fits this schema: "
+               f"{unsupported_json_schema}")
     # This would fail with the default of "xgrammar", but in "auto"
     # we will handle fallback automatically.
-    outputs = llm.generate(prompts=("Give an example JSON object for a grade "
-                                    "that fits this schema: "
-                                    f"{unsupported_json_schema}"),
+    outputs = llm.generate(prompts=prompts,
                            sampling_params=sampling_params,
                            use_tqdm=True)
+    # Make sure `auto` backend handling doesn't mess up sampling_params
+    # and that we can reuse it without error.
+    outputs.extend(
+        llm.generate(prompts=prompts,
+                     sampling_params=sampling_params,
+                     use_tqdm=True))
+
     assert outputs is not None
     for output in outputs:
         assert output is not None
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 68ed99664947..707a757ca83a 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -79,6 +79,17 @@ def backend_options(self) -> list[str]:
             return []
         return self.backend.split(":")[1].split(",")
 
+    def add_option(self, opt_name: str) -> None:
+        """Adds an option to the backend options."""
+        if not self.backend:
+            self.backend = f":{opt_name}"
+        elif ":" not in self.backend:
+            self.backend += f":{opt_name}"
+        else:
+            options = set(self.backend_options())
+            options.add(opt_name)
+            self.backend = f"{self.backend_name}:{','.join(sorted(options))}"
+
     def no_fallback(self) -> bool:
         """Returns True if the "no-fallback" option is supplied for the
         guided decoding backend"""
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index afbbddb86d51..9dce37061a82 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -154,7 +154,14 @@ def _validate_structured_output(self, params: SamplingParams) -> None:
             raise ValueError(f"Only {supported_backends} structured output is "
                              "supported in V1.")
         if params.guided_decoding.backend:
-            if params.guided_decoding.backend != engine_level_backend:
+            # Request-level backend selection is not supported in V1.
+            # The values may differ if `params` is reused and was set
+            # to a specific backend based on `auto` behavior in a previous
+            # request. We remember that it was set as a result of `auto`
+            # using the `_auto` option set on the backend in the params.
+            if (params.guided_decoding.backend != engine_level_backend
+                    and not (engine_level_backend == "auto" and "_auto"
+                             in params.guided_decoding.backend_options())):
                 raise ValueError(
                     "Request-level structured output backend selection is no "
                     "longer supported. The request specified "
@@ -182,6 +189,8 @@ def _validate_structured_output(self, params: SamplingParams) -> None:
                 # The request includes some jsonschema feature(s) that
                 # are not supported in xgrammar. Fall back to guidance.
                 params.guided_decoding.backend = "guidance"
+                # Remember that this backend was set automatically
+                params.guided_decoding.add_option("_auto")
         if engine_level_backend.startswith("guidance"):
             # TODO ideally we would have the LLTokenizer here as Lark syntax
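
Not part of the patch: the sketch below is a standalone Python illustration of the `backend[:opt1,opt2]` string encoding that `backend_options()` and the new `add_option()` manipulate, and of how the `_auto` marker lets a reused params object pass the processor's engine-level `auto` check. The `FakeGuidedDecodingParams` class and the variables in the usage section are invented for the example; only the string handling and the check logic mirror the diff above.

# Standalone illustration; not vLLM's actual classes.
class FakeGuidedDecodingParams:
    def __init__(self, backend: str = "") -> None:
        self.backend = backend

    @property
    def backend_name(self) -> str:
        # Portion before ":", e.g. "guidance" in "guidance:_auto".
        return (self.backend or "").split(":")[0]

    def backend_options(self) -> list[str]:
        # Comma-separated options after ":", e.g. ["_auto"].
        if not self.backend or ":" not in self.backend:
            return []
        return self.backend.split(":")[1].split(",")

    def add_option(self, opt_name: str) -> None:
        # Same three cases as the patch: empty backend, backend without
        # options, backend that already carries an options list.
        if not self.backend:
            self.backend = f":{opt_name}"
        elif ":" not in self.backend:
            self.backend += f":{opt_name}"
        else:
            options = set(self.backend_options())
            options.add(opt_name)
            self.backend = f"{self.backend_name}:{','.join(sorted(options))}"


params = FakeGuidedDecodingParams()
# First request under engine-level "auto": the fallback picks guidance and
# tags it so a later request can tell the backend was chosen automatically.
params.backend = "guidance"
params.add_option("_auto")
assert params.backend == "guidance:_auto"

# Second request reusing the same params: the processor-side check from the
# patch no longer raises, because "_auto" is present in the options.
engine_level_backend = "auto"
request_ok = (params.backend == engine_level_backend
              or (engine_level_backend == "auto"
                  and "_auto" in params.backend_options()))
assert request_ok

Using `sorted(options)` in `add_option` keeps the encoded string deterministic if the method is called more than once, so repeated reuse of the same params cannot keep rewriting the backend string into different forms.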