+# ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
 
 import json
 import re
 from enum import Enum
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import jsonschema
 import pytest
 from pydantic import BaseModel
 
+from tests.reasoning.utils import run_reasoning_extraction
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
+from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 
+if TYPE_CHECKING:
+    from vllm.config import TokenizerMode
+
 NGRAM_SPEC_CONFIG = {
     "model": "[ngram]",
     "num_speculative_tokens": 5,
@@ -444,7 +450,7 @@ def test_structured_output(
 
     prompt = """
 You have access to the following function to retrieve the weather in a city:
-
+
     {
         "name": "get_weather",
         "parameters": {
@@ -455,7 +461,7 @@ def test_structured_output(
             }
         }
     }
-
+
 If you choose to call a function ONLY reply in the following format:
 <{start_tag}={function_name}>{parameters}{end_tag}
 where
@@ -476,7 +482,7 @@ def test_structured_output(
 - Always add your sources when using search results to answer the user query
 
 You are a helpful assistant.
-
+
 Given the previous instructions, what is the weather in New York City? \
 Make the response as short as possible.
 """
@@ -514,6 +520,88 @@ def test_structured_output(
                         f"{generated_text!r}\nError: {str(e)}")
 
 
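+# The test below exercises structured (grammar-constrained) output together
+# with reasoning models: the reasoning trace is generated freely, and only
+# the final answer must conform to the JSON schema.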
+@pytest.mark.skip_global_cleanup
+@pytest.mark.parametrize(
+    "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
+    [
+        ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
+         "deepseek_r1", NGRAM_SPEC_CONFIG),
+        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None),
+    ],
+)
+def test_structured_output_with_reasoning_matrices(
+    monkeypatch: pytest.MonkeyPatch,
+    guided_decoding_backend: str,
+    tokenizer_mode: TokenizerMode,
+    reasoning_parser: str,
+    model_name: str,
+    speculative_config: dict[str, Any] | None,
+):
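+    # Structured output combined with a reasoning parser is wired up in the
+    # V1 engine, so pin the engine explicitly.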
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+
+    if current_platform.is_tpu() and speculative_config:
+        pytest.skip("TPU does not support speculative decoding")
+
+    # Use a single LLM instance for several scenarios to
+    # speed up the test suite.
+    llm = LLM(
+        model=model_name,
+        # Don't use eager execution on TPUs because we want to test for no
+        # recompilation at runtime
+        enforce_eager=bool(not current_platform.is_tpu()),
+        max_model_len=1024,
+        max_num_seqs=16,
+        guided_decoding_backend=guided_decoding_backend,
+        guided_decoding_disable_any_whitespace=True,
+        tokenizer_mode=tokenizer_mode,
+        reasoning_parser=reasoning_parser,
+        speculative_config=speculative_config,
+    )
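+    # Instantiate the same reasoning parser the engine uses so the test can
+    # split the raw completion into reasoning and final content.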
+    tokenizer = llm.get_tokenizer(None)
+    reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(
+        tokenizer=tokenizer)
+
+    reasoning_prompt = "Solve the following math problem step-by-step, then provide the final answer as a JSON object with a single key 'result'. Make sure to correct your reasoning if any issues arise.\nProblem: What is 5 * 8 + 2?"  # noqa: E501
+    reasoning_schema = {
+        "type": "object",
+        "properties": {
+            "result": {
+                "type": "integer"
+            }
+        },
+        "required": ["result"],
+        "additionalProperties": False
+    }
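+    # Qwen3 emits its reasoning between <think> tags; seeding the raw prompt
+    # with an opening tag steers the completion into the format the
+    # deepseek_r1 parser expects.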
+    if "Qwen3" in model_name:
+        reasoning_prompt += "<think>\n"
+
+    sampling_params = SamplingParams(
+        temperature=0.1,
+        max_tokens=8192,
+        guided_decoding=GuidedDecodingParams(json=reasoning_schema),
+    )
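+    # Grammar enforcement does not begin until the reasoning section ends,
+    # so the schema above constrains only the final answer.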
+    outputs = llm.generate(
+        [reasoning_prompt],
+        sampling_params=sampling_params,
+        use_tqdm=True,
+    )
+
+    assert outputs is not None
+    output = outputs[0]
+    assert output is not None and isinstance(output, RequestOutput)
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
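+    # Split the raw completion into the reasoning trace and the final answer.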
+    reasoning_content, content = run_reasoning_extraction(
+        reasoner, [generated_text])
+    print(
+        f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}"
+    )
+
+    assert content is not None and reasoning_content is not None
+    output_json = json.loads(content)
+    jsonschema.validate(instance=output_json, schema=reasoning_schema)
+
+
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("model_name, tokenizer_mode",
                          PARAMS_MODELS_TOKENIZER_MODE)