@@ -21,10 +21,7 @@
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.mm_input_cache import MirroredProcessingCache
-from vllm.v1.structured_output.backend_guidance import (
-    validate_guidance_grammar)
-from vllm.v1.structured_output.backend_xgrammar import (
-    validate_xgrammar_grammar)
+from vllm.v1.structured_output import StructuredOutputManager
 
 
 class Processor:
@@ -81,7 +78,7 @@ def _validate_sampling_params(
         params: SamplingParams,
         lora_request: Optional[LoRARequest],
     ) -> None:
-        self._validate_structured_output(params)
+        StructuredOutputManager.validate_request(params, self.vllm_config)
         self._validate_logit_bias(params)
 
         if params.allowed_token_ids is None:
@@ -148,59 +145,6 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
 
-    def _validate_structured_output(self, params: SamplingParams) -> None:
-        if not params.guided_decoding or not self.decoding_config:
-            return
-
-        engine_level_backend = self.decoding_config.backend
-        if params.guided_decoding.backend:
-            # Request-level backend selection is not supported in V1.
-            # The values may differ if `params` is reused and was set
-            # to a specific backend based on `auto` behavior in a previous
-            # request. We remember that it was set as a result of `auto`
-            # using the `_auto` option set on the backend in the params.
-            if (params.guided_decoding.backend != engine_level_backend
-                    and not (engine_level_backend == "auto"
-                             and params.guided_decoding.backend_was_auto)):
-                raise ValueError(
-                    "Request-level structured output backend selection is no "
-                    "longer supported. The request specified "
-                    f"'{params.guided_decoding.backend}', but vLLM was "
-                    f"initialised with '{engine_level_backend}'. This error "
-                    "can be resolved by removing backend selection from the "
-                    "request.")
-        else:
-            params.guided_decoding.backend = engine_level_backend
-
-        # Request content validation
-        if engine_level_backend.startswith("xgrammar"):
-            # xgrammar with no fallback
-            validate_xgrammar_grammar(params)
-        elif engine_level_backend.startswith("guidance"):
-            # TODO: ideally we would have the LLTokenizer here as Lark syntax
-            # allows <|special_token|> and similar, see
-            # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
-            # Without tokenizer these are disallowed in grammars.
-            validate_guidance_grammar(params, tokenizer=None)
-        else:
-            # NOTE: engine_level_backend must be "auto" here, because we have
-            # checked supported_backends above.
-            # "auto" is an opt-in to opinionated behavior where we try to
-            # choose a backend based on request contents. This is not the
-            # default as it is less predictable and subject to change
-            # between releases as feature support changes.
-            try:
-                validate_xgrammar_grammar(params)
-                params.guided_decoding.backend = "xgrammar"
-            except ValueError:
-                # The request either failed validation
-                # or includes some jsonschema feature(s) that
-                # are not supported in xgrammar. Fall back to guidance.
-                validate_guidance_grammar(params, tokenizer=None)
-                params.guided_decoding.backend = "guidance"
-            # Remember that this backend was set automatically
-            params.guided_decoding.backend_was_auto = True
-
     def process_inputs(
         self,
         request_id: str,
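
For reference, the validation logic removed above is not dropped; the call site now delegates to `StructuredOutputManager.validate_request(params, self.vllm_config)`. Below is a minimal sketch of what that consolidated entry point could look like, assuming it is exposed as a `staticmethod` on `StructuredOutputManager` and reads the engine-level backend from `vllm_config.decoding_config`; the actual implementation in `vllm.v1.structured_output` may differ in its details.

```python
from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
# These validators exist today and were previously imported by Processor.
from vllm.v1.structured_output.backend_guidance import (
    validate_guidance_grammar)
from vllm.v1.structured_output.backend_xgrammar import (
    validate_xgrammar_grammar)


class StructuredOutputManager:
    """Sketch only: just the request-validation entry point is shown."""

    @staticmethod
    def validate_request(params: SamplingParams,
                         vllm_config: VllmConfig) -> None:
        decoding_config = vllm_config.decoding_config
        if not params.guided_decoding or not decoding_config:
            return

        engine_level_backend = decoding_config.backend
        if params.guided_decoding.backend:
            # Request-level backend selection is not supported in V1; the
            # only tolerated mismatch is one that "auto" set on a reused
            # params object in a previous request.
            if (params.guided_decoding.backend != engine_level_backend
                    and not (engine_level_backend == "auto"
                             and params.guided_decoding.backend_was_auto)):
                raise ValueError(
                    "Request-level structured output backend selection is "
                    "no longer supported. The request specified "
                    f"'{params.guided_decoding.backend}', but vLLM was "
                    f"initialised with '{engine_level_backend}'.")
        else:
            params.guided_decoding.backend = engine_level_backend

        # Content validation, mirroring the logic removed from Processor.
        if engine_level_backend.startswith("xgrammar"):
            # xgrammar with no fallback.
            validate_xgrammar_grammar(params)
        elif engine_level_backend.startswith("guidance"):
            validate_guidance_grammar(params, tokenizer=None)
        else:
            # "auto": try xgrammar first, fall back to guidance.
            try:
                validate_xgrammar_grammar(params)
                params.guided_decoding.backend = "xgrammar"
            except ValueError:
                validate_guidance_grammar(params, tokenizer=None)
                params.guided_decoding.backend = "guidance"
            # Remember that the backend was chosen automatically.
            params.guided_decoding.backend_was_auto = True
```

Centralising the check this way keeps `Processor` free of backend-specific imports and lets any caller that holds a `VllmConfig` reuse the same structured-output validation.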