2 changes: 1 addition & 1 deletion docs/api/README.md
@@ -18,7 +18,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.PromptAdapterConfig][]
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.DecodingConfig][]
- [vllm.config.StructuredOutputConfig][]
- [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][]
2 changes: 1 addition & 1 deletion tests/async_engine/test_async_llm_engine.py
@@ -129,7 +129,7 @@ async def test_new_requests_event():
engine = MockAsyncLLMEngine()
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
assert engine.get_structured_output_config() is not None


def start_engine():
20 changes: 12 additions & 8 deletions vllm/config.py
@@ -3539,8 +3539,11 @@ def get_served_model_name(model: str,

@config
@dataclass
class DecodingConfig:
"""Dataclass which contains the decoding strategy of the engine."""
class StructuredOutputConfig:
"""
Dataclass which contains parameters for
structured output / guided decoding.
"""

@property
@deprecated(
@@ -3610,7 +3613,7 @@ def __post_init__(self):
and self.backend not in ("xgrammar", "guidance")):
raise ValueError("disable_any_whitespace is only supported for "
"xgrammar and guidance backends.")
if (self.disable_additional_properties and self.backend != "guidance"):
if self.disable_additional_properties and self.backend != "guidance":
raise ValueError("disable_additional_properties is only supported "
"for the guidance backend.")

@@ -4298,8 +4301,9 @@ class VllmConfig:
"""LoRA configuration."""
speculative_config: Optional[SpeculativeConfig] = None
"""Speculative decoding configuration."""
decoding_config: DecodingConfig = field(default_factory=DecodingConfig)
"""Decoding configuration."""
structured_output_config: StructuredOutputConfig = field(
default_factory=StructuredOutputConfig)
"""Structured output configuration."""
observability_config: Optional[ObservabilityConfig] = None
"""Observability configuration."""
prompt_adapter_config: Optional[PromptAdapterConfig] = None
@@ -4392,8 +4396,8 @@ def compute_hash(self) -> str:
vllm_factors.append(self.speculative_config.compute_hash())
else:
vllm_factors.append("None")
if self.decoding_config:
vllm_factors.append(self.decoding_config.compute_hash())
if self.structured_output_config:
vllm_factors.append(self.structured_output_config.compute_hash())
else:
vllm_factors.append("None")
if self.observability_config:
@@ -4767,7 +4771,7 @@ def __str__(self):
f"enforce_eager={self.model_config.enforce_eager}, "
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
f" device_config={self.device_config.device}, "
f"decoding_config={self.decoding_config!r}, "
f"structured_output_config={self.structured_output_config!r}, "
f"observability_config={self.observability_config!r}, "
f"seed={self.model_config.seed}, "
f"served_model_name={self.model_config.served_model_name}, "
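The hunks above rename `DecodingConfig` to `StructuredOutputConfig` while keeping its validation logic. For reference, a minimal self-contained sketch of the renamed dataclass follows; the field names and the two `__post_init__` checks come from this diff, while the defaults are illustrative assumptions rather than vLLM's actual values.

```python
from dataclasses import dataclass


@dataclass
class StructuredOutputConfig:
    """Parameters for structured output / guided decoding (illustrative defaults)."""
    backend: str = "auto"
    disable_fallback: bool = False
    disable_any_whitespace: bool = False
    disable_additional_properties: bool = False
    reasoning_backend: str = ""

    def __post_init__(self) -> None:
        # Same consistency checks as the __post_init__ hunk above.
        if (self.disable_any_whitespace
                and self.backend not in ("xgrammar", "guidance")):
            raise ValueError("disable_any_whitespace is only supported for "
                             "xgrammar and guidance backends.")
        if self.disable_additional_properties and self.backend != "guidance":
            raise ValueError("disable_additional_properties is only supported "
                             "for the guidance backend.")
```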
39 changes: 21 additions & 18 deletions vllm/engine/arg_utils.py
@@ -22,17 +22,18 @@

import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ConfigFormat, ConfigType, DecodingConfig,
DetailedTraceModules, Device, DeviceConfig,
DistributedExecutorBackend, GuidedDecodingBackend,
GuidedDecodingBackendV1, HfOverrides, KVEventsConfig,
KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
ConfigFormat, ConfigType, DetailedTraceModules,
Device, DeviceConfig, DistributedExecutorBackend,
GuidedDecodingBackend, GuidedDecodingBackendV1,
HfOverrides, KVEventsConfig, KVTransferConfig,
LoadConfig, LoadFormat, LoRAConfig, ModelConfig,
ModelDType, ModelImpl, MultiModalConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, PromptAdapterConfig,
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
TaskOption, TokenizerMode, TokenizerPoolConfig,
VllmConfig, get_attr_docs, get_field)
StructuredOutputConfig, TaskOption, TokenizerMode,
TokenizerPoolConfig, VllmConfig, get_attr_docs,
get_field)
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -416,12 +417,14 @@ class EngineArgs:
disable_hybrid_kv_cache_manager: bool = (
SchedulerConfig.disable_hybrid_kv_cache_manager)

guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
guided_decoding_backend: GuidedDecodingBackend = \
StructuredOutputConfig.backend
guided_decoding_disable_fallback: bool = \
StructuredOutputConfig.disable_fallback
guided_decoding_disable_any_whitespace: bool = \
DecodingConfig.disable_any_whitespace
StructuredOutputConfig.disable_any_whitespace
guided_decoding_disable_additional_properties: bool = \
DecodingConfig.disable_additional_properties
StructuredOutputConfig.disable_additional_properties
logits_processor_pattern: Optional[
str] = ModelConfig.logits_processor_pattern

@@ -462,7 +465,7 @@ class EngineArgs:
additional_config: dict[str, Any] = \
get_field(VllmConfig, "additional_config")
enable_reasoning: Optional[bool] = None # DEPRECATED
reasoning_parser: str = DecodingConfig.reasoning_backend
reasoning_parser: str = StructuredOutputConfig.reasoning_backend

use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
pt_load_map_location: str = LoadConfig.pt_load_map_location
@@ -608,10 +611,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
**load_kwargs["pt_load_map_location"])

# Guided decoding arguments
guided_decoding_kwargs = get_kwargs(DecodingConfig)
guided_decoding_kwargs = get_kwargs(StructuredOutputConfig)
guided_decoding_group = parser.add_argument_group(
title="DecodingConfig",
description=DecodingConfig.__doc__,
title="StructuredOutputConfig",
description=StructuredOutputConfig.__doc__,
)
guided_decoding_group.add_argument("--guided-decoding-backend",
**guided_decoding_kwargs["backend"])
@@ -1259,7 +1262,7 @@ def create_engine_config(
max_prompt_adapter_token=self.max_prompt_adapter_token) \
if self.enable_prompt_adapter else None

decoding_config = DecodingConfig(
structured_output_config = StructuredOutputConfig(
backend=self.guided_decoding_backend,
disable_fallback=self.guided_decoding_disable_fallback,
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
@@ -1284,7 +1287,7 @@
lora_config=lora_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config,
structured_output_config=structured_output_config,
observability_config=observability_config,
prompt_adapter_config=prompt_adapter_config,
compilation_config=self.compilation_config,
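The `--guided-decoding-*` CLI flags themselves are unchanged; only the config class they populate is renamed. Below is a hedged sketch of that mapping, assuming a parsed-args namespace with the attribute names used by `EngineArgs` above; it is not the actual `create_engine_config` code path.

```python
from argparse import Namespace

from vllm.config import StructuredOutputConfig


def structured_output_config_from_args(args: Namespace) -> StructuredOutputConfig:
    # Each --guided-decoding-* flag still feeds the same field, just on the
    # renamed config class.
    return StructuredOutputConfig(
        backend=args.guided_decoding_backend,
        disable_fallback=args.guided_decoding_disable_fallback,
        disable_any_whitespace=args.guided_decoding_disable_any_whitespace,
        disable_additional_properties=(
            args.guided_decoding_disable_additional_properties),
    )
```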
15 changes: 8 additions & 7 deletions vllm/engine/async_llm_engine.py
@@ -11,8 +11,8 @@
from weakref import ReferenceType

import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
SchedulerConfig, StructuredOutputConfig, VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout
@@ -479,8 +479,9 @@ async def add_request_async(
params = await build_guided_decoding_logits_processor_async(
sampling_params=params,
tokenizer=await self.get_tokenizer_async(lora_request),
default_guided_backend=self.decoding_config.backend,
reasoning_backend=self.decoding_config.reasoning_backend,
default_guided_backend=self.structured_output_config.backend,
reasoning_backend=self.structured_output_config.
reasoning_backend,
model_config=self.model_config)

self._add_processed_request(
@@ -1119,9 +1120,9 @@ async def get_parallel_config(self) -> ParallelConfig:
"""Get the parallel configuration of the vLLM engine."""
return self.engine.get_parallel_config()

async def get_decoding_config(self) -> DecodingConfig:
"""Get the decoding configuration of the vLLM engine."""
return self.engine.get_decoding_config()
async def get_structured_output_config(self) -> StructuredOutputConfig:
"""Get the structured output configuration of the vLLM engine."""
return self.engine.get_structured_output_config()

async def get_scheduler_config(self) -> SchedulerConfig:
"""Get the scheduling configuration of the vLLM engine."""
23 changes: 12 additions & 11 deletions vllm/engine/llm_engine.py
@@ -17,9 +17,9 @@
from typing_extensions import TypeVar

import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ObservabilityConfig, ParallelConfig, SchedulerConfig,
VllmConfig)
from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig,
ParallelConfig, SchedulerConfig,
StructuredOutputConfig, VllmConfig)
from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics_types import StatLoggerBase, Stats
@@ -221,7 +221,7 @@ def __init__(
self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config # noqa
self.load_config = vllm_config.load_config
self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa
self.structured_output_config = vllm_config.structured_output_config or StructuredOutputConfig( # noqa
)
self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa
self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa
@@ -840,9 +840,9 @@ def get_parallel_config(self) -> ParallelConfig:
"""Gets the parallel configuration."""
return self.parallel_config

def get_decoding_config(self) -> DecodingConfig:
"""Gets the decoding configuration."""
return self.decoding_config
def get_structured_output_config(self) -> StructuredOutputConfig:
"""Gets the structured output configuration."""
return self.structured_output_config

def get_scheduler_config(self) -> SchedulerConfig:
"""Gets the scheduler configuration."""
@@ -2042,17 +2042,18 @@ def _build_logits_processors(

tokenizer = self.get_tokenizer(lora_request=lora_request)
guided_decoding.backend = guided_decoding.backend or \
self.decoding_config.backend
self.structured_output_config.backend

if self.decoding_config.reasoning_backend:
if self.structured_output_config.reasoning_backend:
logger.debug("Building with reasoning backend %s",
self.decoding_config.reasoning_backend)
self.structured_output_config.reasoning_backend)

processor = get_local_guided_decoding_logits_processor(
guided_params=guided_decoding,
tokenizer=tokenizer,
model_config=self.model_config,
reasoning_backend=self.decoding_config.reasoning_backend,
reasoning_backend=self.structured_output_config.
reasoning_backend,
)
if processor:
logits_processors.append(processor)
16 changes: 8 additions & 8 deletions vllm/engine/multiprocessing/client.py
@@ -16,7 +16,7 @@
from zmq.asyncio import Socket

from vllm import PoolingParams
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig
from vllm.core.scheduler import SchedulerOutputs
# yapf conflicts with isort for this block
# yapf: disable
@@ -96,7 +96,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
# Get the configs.
self.vllm_config = engine_config
self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config
self.structured_output_config = engine_config.structured_output_config

# Create the tokenizer group.
self.tokenizer = init_tokenizer_from_configs(
@@ -381,8 +381,8 @@ async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
async def get_vllm_config(self) -> VllmConfig:
return self.vllm_config

async def get_decoding_config(self) -> DecodingConfig:
return self.decoding_config
async def get_structured_output_config(self) -> StructuredOutputConfig:
return self.structured_output_config

async def get_model_config(self) -> ModelConfig:
return self.model_config
@@ -544,11 +544,11 @@ async def _process_request(
build_guided_decoding_logits_processor_async(
sampling_params=params,
tokenizer=await self.get_tokenizer(lora_request),
default_guided_backend=(self.decoding_config.backend
if self.decoding_config
else DecodingConfig.backend),
default_guided_backend=(self.structured_output_config.backend
if self.structured_output_config
else StructuredOutputConfig.backend),
model_config=self.model_config,
reasoning_backend=self.decoding_config.reasoning_backend,
reasoning_backend=self.structured_output_config.reasoning_backend,
)

# 1) Create output queue for this requests.
6 changes: 3 additions & 3 deletions vllm/engine/protocol.py
@@ -6,7 +6,7 @@
from typing import AsyncGenerator, Mapping, Optional

from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig
from vllm.core.scheduler import SchedulerOutputs
from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -250,8 +250,8 @@ async def get_model_config(self) -> ModelConfig:
...

@abstractmethod
async def get_decoding_config(self) -> DecodingConfig:
"""Get the decoding configuration of the vLLM engine."""
async def get_structured_output_config(self) -> StructuredOutputConfig:
"""Get the structured output configuration of the vLLM engine."""
...

@abstractmethod
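Callers of the `EngineClient` protocol switch to the renamed accessor. A minimal usage sketch, assuming any of the client implementations updated in this diff:

```python
from vllm.engine.protocol import EngineClient


async def log_structured_output_backend(engine_client: EngineClient) -> None:
    # Previously: config = await engine_client.get_decoding_config()
    config = await engine_client.get_structured_output_config()
    print("structured output backend:", config.backend)
```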
6 changes: 3 additions & 3 deletions vllm/v1/engine/async_llm.py
@@ -8,7 +8,7 @@
import numpy as np

import vllm.envs as envs
from vllm.config import ModelConfig, VllmConfig
from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
@@ -526,8 +526,8 @@ async def get_vllm_config(self) -> VllmConfig:
async def get_model_config(self) -> ModelConfig:
return self.model_config

async def get_decoding_config(self):
raise ValueError("Not Supported on V1 yet.")
async def get_structured_output_config(self) -> "StructuredOutputConfig":
return self.vllm_config.structured_output_config

async def get_input_preprocessor(self) -> InputPreprocessor:
return self.processor.input_preprocessor
6 changes: 3 additions & 3 deletions vllm/v1/engine/processor.py
@@ -40,7 +40,7 @@ def __init__(
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config
self.decoding_config = vllm_config.decoding_config
self.structured_output_config = vllm_config.structured_output_config
self.tokenizer = tokenizer

self.generation_config_fields = (
@@ -149,10 +149,10 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
"not enabled!")

def _validate_structured_output(self, params: SamplingParams) -> None:
if not params.guided_decoding or not self.decoding_config:
if not params.guided_decoding or not self.structured_output_config:
return

engine_level_backend = self.decoding_config.backend
engine_level_backend = self.structured_output_config.backend
if params.guided_decoding.backend:
# Request-level backend selection is not supported in V1.
# The values may differ if `params` is reused and was set
3 changes: 2 additions & 1 deletion vllm/v1/structured_output/__init__.py
@@ -51,7 +51,8 @@ def __init__(self, vllm_config: VllmConfig):
scheduler_config=self.vllm_config.scheduler_config,
lora_config=self.vllm_config.lora_config,
).get_lora_tokenizer(None)
reasoning_backend = vllm_config.decoding_config.reasoning_backend
reasoning_backend = (
vllm_config.structured_output_config.reasoning_backend)
if reasoning_backend:
reasoner_cls = ReasoningParserManager.get_reasoning_parser(
reasoning_backend)
4 changes: 2 additions & 2 deletions vllm/v1/structured_output/backend_guidance.py
@@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend):

def __post_init__(self):
self.disable_any_whitespace = \
self.vllm_config.decoding_config.disable_any_whitespace
self.vllm_config.structured_output_config.disable_any_whitespace
self.disable_additional_properties = \
self.vllm_config.decoding_config.disable_additional_properties
self.vllm_config.structured_output_config.disable_additional_properties

self.ll_tokenizer = llguidance_hf.from_tokenizer(
self.tokenizer, self.vocab_size)
2 changes: 1 addition & 1 deletion vllm/v1/structured_output/backend_xgrammar.py
@@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend):

def __post_init__(self):
self.disable_any_whitespace = \
self.vllm_config.decoding_config.disable_any_whitespace
self.vllm_config.structured_output_config.disable_any_whitespace

if isinstance(self.tokenizer, MistralTokenizer):
# NOTE: ideally, xgrammar should handle this accordingly.
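For downstream code the migration is a one-for-one rename on `VllmConfig` and the engine interfaces. A small sketch, assuming only the attribute and field names introduced in this diff:

```python
from vllm.config import VllmConfig


def read_structured_output_flags(vllm_config: VllmConfig) -> tuple[str, bool]:
    # Old attribute: vllm_config.decoding_config
    # New attribute: vllm_config.structured_output_config
    cfg = vllm_config.structured_output_config
    return cfg.backend, cfg.disable_any_whitespace
```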