diff --git a/docs/api/README.md b/docs/api/README.md index 5c7b2ca79ee2..410d7cb7ad7a 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -18,7 +18,7 @@ API documentation for vLLM's configuration classes. - [vllm.config.PromptAdapterConfig][] - [vllm.config.MultiModalConfig][] - [vllm.config.PoolerConfig][] -- [vllm.config.DecodingConfig][] +- [vllm.config.StructuredOutputConfig][] - [vllm.config.ObservabilityConfig][] - [vllm.config.KVTransferConfig][] - [vllm.config.CompilationConfig][] diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 3c030aea2066..dbb15ca099af 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -129,7 +129,7 @@ async def test_new_requests_event(): engine = MockAsyncLLMEngine() assert engine.get_model_config() is not None assert engine.get_tokenizer() is not None - assert engine.get_decoding_config() is not None + assert engine.get_structured_output_config() is not None def start_engine(): diff --git a/vllm/config.py b/vllm/config.py index 7863859a6ee6..0f72942170c0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3539,8 +3539,11 @@ def get_served_model_name(model: str, @config @dataclass -class DecodingConfig: - """Dataclass which contains the decoding strategy of the engine.""" +class StructuredOutputConfig: + """ + Dataclass which contains parameters for + structured output / guided decoding. + """ @property @deprecated( @@ -3610,7 +3613,7 @@ def __post_init__(self): and self.backend not in ("xgrammar", "guidance")): raise ValueError("disable_any_whitespace is only supported for " "xgrammar and guidance backends.") - if (self.disable_additional_properties and self.backend != "guidance"): + if self.disable_additional_properties and self.backend != "guidance": raise ValueError("disable_additional_properties is only supported " "for the guidance backend.") @@ -4298,8 +4301,9 @@ class VllmConfig: """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" - decoding_config: DecodingConfig = field(default_factory=DecodingConfig) - """Decoding configuration.""" + structured_output_config: StructuredOutputConfig = field( + default_factory=StructuredOutputConfig) + """Structured output configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" prompt_adapter_config: Optional[PromptAdapterConfig] = None @@ -4392,8 +4396,8 @@ def compute_hash(self) -> str: vllm_factors.append(self.speculative_config.compute_hash()) else: vllm_factors.append("None") - if self.decoding_config: - vllm_factors.append(self.decoding_config.compute_hash()) + if self.structured_output_config: + vllm_factors.append(self.structured_output_config.compute_hash()) else: vllm_factors.append("None") if self.observability_config: @@ -4767,7 +4771,7 @@ def __str__(self): f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f" device_config={self.device_config.device}, " - f"decoding_config={self.decoding_config!r}, " + f"structured_output_config={self.structured_output_config!r}, " f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 38f82e64de53..d28bd3c2327a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -22,17 +22,18 @@ 
import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, - ConfigFormat, ConfigType, DecodingConfig, - DetailedTraceModules, Device, DeviceConfig, - DistributedExecutorBackend, GuidedDecodingBackend, - GuidedDecodingBackendV1, HfOverrides, KVEventsConfig, - KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ModelDType, ModelImpl, MultiModalConfig, + ConfigFormat, ConfigType, DetailedTraceModules, + Device, DeviceConfig, DistributedExecutorBackend, + GuidedDecodingBackend, GuidedDecodingBackendV1, + HfOverrides, KVEventsConfig, KVTransferConfig, + LoadConfig, LoadFormat, LoRAConfig, ModelConfig, + ModelDType, ModelImpl, MultiModalConfig, ObservabilityConfig, ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, PromptAdapterConfig, SchedulerConfig, SchedulerPolicy, SpeculativeConfig, - TaskOption, TokenizerMode, TokenizerPoolConfig, - VllmConfig, get_attr_docs, get_field) + StructuredOutputConfig, TaskOption, TokenizerMode, + TokenizerPoolConfig, VllmConfig, get_attr_docs, + get_field) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -416,12 +417,14 @@ class EngineArgs: disable_hybrid_kv_cache_manager: bool = ( SchedulerConfig.disable_hybrid_kv_cache_manager) - guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend - guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback + guided_decoding_backend: GuidedDecodingBackend = \ + StructuredOutputConfig.backend + guided_decoding_disable_fallback: bool = \ + StructuredOutputConfig.disable_fallback guided_decoding_disable_any_whitespace: bool = \ - DecodingConfig.disable_any_whitespace + StructuredOutputConfig.disable_any_whitespace guided_decoding_disable_additional_properties: bool = \ - DecodingConfig.disable_additional_properties + StructuredOutputConfig.disable_additional_properties logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -462,7 +465,7 @@ class EngineArgs: additional_config: dict[str, Any] = \ get_field(VllmConfig, "additional_config") enable_reasoning: Optional[bool] = None # DEPRECATED - reasoning_parser: str = DecodingConfig.reasoning_backend + reasoning_parser: str = StructuredOutputConfig.reasoning_backend use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location @@ -608,10 +611,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **load_kwargs["pt_load_map_location"]) # Guided decoding arguments - guided_decoding_kwargs = get_kwargs(DecodingConfig) + guided_decoding_kwargs = get_kwargs(StructuredOutputConfig) guided_decoding_group = parser.add_argument_group( - title="DecodingConfig", - description=DecodingConfig.__doc__, + title="StructuredOutputConfig", + description=StructuredOutputConfig.__doc__, ) guided_decoding_group.add_argument("--guided-decoding-backend", **guided_decoding_kwargs["backend"]) @@ -1259,7 +1262,7 @@ def create_engine_config( max_prompt_adapter_token=self.max_prompt_adapter_token) \ if self.enable_prompt_adapter else None - decoding_config = DecodingConfig( + structured_output_config = StructuredOutputConfig( backend=self.guided_decoding_backend, disable_fallback=self.guided_decoding_disable_fallback, disable_any_whitespace=self.guided_decoding_disable_any_whitespace, @@ -1284,7 +1287,7 @@ def create_engine_config( lora_config=lora_config, speculative_config=speculative_config, 
load_config=load_config, - decoding_config=decoding_config, + structured_output_config=structured_output_config, observability_config=observability_config, prompt_adapter_config=prompt_adapter_config, compilation_config=self.compilation_config, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3d7d28055dd0..7bc790d95309 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -11,8 +11,8 @@ from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VllmConfig) +from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig, StructuredOutputConfig, VllmConfig) from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout @@ -479,8 +479,9 @@ async def add_request_async( params = await build_guided_decoding_logits_processor_async( sampling_params=params, tokenizer=await self.get_tokenizer_async(lora_request), - default_guided_backend=self.decoding_config.backend, - reasoning_backend=self.decoding_config.reasoning_backend, + default_guided_backend=self.structured_output_config.backend, + reasoning_backend=self.structured_output_config. + reasoning_backend, model_config=self.model_config) self._add_processed_request( @@ -1119,9 +1120,9 @@ async def get_parallel_config(self) -> ParallelConfig: """Get the parallel configuration of the vLLM engine.""" return self.engine.get_parallel_config() - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() + async def get_structured_output_config(self) -> StructuredOutputConfig: + """Get the structured output configuration of the vLLM engine.""" + return self.engine.get_structured_output_config() async def get_scheduler_config(self) -> SchedulerConfig: """Get the scheduling configuration of the vLLM engine.""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 25fa1c3058be..f295a2301dfc 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -17,9 +17,9 @@ from typing_extensions import TypeVar import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, SchedulerConfig, + StructuredOutputConfig, VllmConfig) from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase, Stats @@ -221,7 +221,7 @@ def __init__( self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config # noqa self.load_config = vllm_config.load_config - self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa + self.structured_output_config = vllm_config.structured_output_config or StructuredOutputConfig( # noqa ) self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa @@ -840,9 +840,9 @@ def get_parallel_config(self) -> ParallelConfig: """Gets the parallel configuration.""" return self.parallel_config - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return 
self.decoding_config + def get_structured_output_config(self) -> StructuredOutputConfig: + """Gets the structured output configuration.""" + return self.structured_output_config def get_scheduler_config(self) -> SchedulerConfig: """Gets the scheduler configuration.""" @@ -2042,17 +2042,18 @@ def _build_logits_processors( tokenizer = self.get_tokenizer(lora_request=lora_request) guided_decoding.backend = guided_decoding.backend or \ - self.decoding_config.backend + self.structured_output_config.backend - if self.decoding_config.reasoning_backend: + if self.structured_output_config.reasoning_backend: logger.debug("Building with reasoning backend %s", - self.decoding_config.reasoning_backend) + self.structured_output_config.reasoning_backend) processor = get_local_guided_decoding_logits_processor( guided_params=guided_decoding, tokenizer=tokenizer, model_config=self.model_config, - reasoning_backend=self.decoding_config.reasoning_backend, + reasoning_backend=self.structured_output_config. + reasoning_backend, ) if processor: logits_processors.append(processor) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 9e018ec7f344..0c21eb0fb125 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -16,7 +16,7 @@ from zmq.asyncio import Socket from vllm import PoolingParams -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs # yapf conflicts with isort for this block # yapf: disable @@ -96,7 +96,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, # Get the configs. self.vllm_config = engine_config self.model_config = engine_config.model_config - self.decoding_config = engine_config.decoding_config + self.structured_output_config = engine_config.structured_output_config # Create the tokenizer group. self.tokenizer = init_tokenizer_from_configs( @@ -381,8 +381,8 @@ async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): async def get_vllm_config(self) -> VllmConfig: return self.vllm_config - async def get_decoding_config(self) -> DecodingConfig: - return self.decoding_config + async def get_structured_output_config(self) -> StructuredOutputConfig: + return self.structured_output_config async def get_model_config(self) -> ModelConfig: return self.model_config @@ -544,11 +544,11 @@ async def _process_request( build_guided_decoding_logits_processor_async( sampling_params=params, tokenizer=await self.get_tokenizer(lora_request), - default_guided_backend=(self.decoding_config.backend - if self.decoding_config - else DecodingConfig.backend), + default_guided_backend=(self.structured_output_config.backend + if self.structured_output_config + else StructuredOutputConfig.backend), model_config=self.model_config, - reasoning_backend=self.decoding_config.reasoning_backend, + reasoning_backend=self.structured_output_config.reasoning_backend, ) # 1) Create output queue for this requests. 
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 8688fcc82cd9..9b41511b76f2 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -6,7 +6,7 @@ from typing import AsyncGenerator, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt @@ -250,8 +250,8 @@ async def get_model_config(self) -> ModelConfig: ... @abstractmethod -    async def get_decoding_config(self) -> DecodingConfig: -        """Get the decoding configuration of the vLLM engine.""" +    async def get_structured_output_config(self) -> StructuredOutputConfig: +        """Get the structured output configuration of the vLLM engine.""" ... @abstractmethod diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3754570dfaaa..d0e60fcbdce0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -8,7 +8,7 @@ import numpy as np import vllm.envs as envs -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE @@ -526,8 +526,8 @@ async def get_vllm_config(self) -> VllmConfig: async def get_model_config(self) -> ModelConfig: return self.model_config -    async def get_decoding_config(self): -        raise ValueError("Not Supported on V1 yet.") +    async def get_structured_output_config(self) -> "StructuredOutputConfig": +        return self.vllm_config.structured_output_config async def get_input_preprocessor(self) -> InputPreprocessor: return self.processor.input_preprocessor diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 7e7703df2cf1..8c634e922ce9 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -40,7 +40,7 @@ def __init__( self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config -        self.decoding_config = vllm_config.decoding_config +        self.structured_output_config = vllm_config.structured_output_config self.tokenizer = tokenizer self.generation_config_fields = ( @@ -149,10 +149,10 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: "not enabled!") def _validate_structured_output(self, params: SamplingParams) -> None: -        if not params.guided_decoding or not self.decoding_config: +        if not params.guided_decoding or not self.structured_output_config: return -        engine_level_backend = self.decoding_config.backend +        engine_level_backend = self.structured_output_config.backend if params.guided_decoding.backend: # Request-level backend selection is not supported in V1.
# The values may differ if `params` is reused and was set diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index c5500b9a384d..c1b97f8ef825 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -51,7 +51,8 @@ def __init__(self, vllm_config: VllmConfig): scheduler_config=self.vllm_config.scheduler_config, lora_config=self.vllm_config.lora_config, ).get_lora_tokenizer(None) - reasoning_backend = vllm_config.decoding_config.reasoning_backend + reasoning_backend = ( + vllm_config.structured_output_config.reasoning_backend) if reasoning_backend: reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoning_backend) diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 02e7fc33f517..d628d4e040d0 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_output_config.disable_any_whitespace self.disable_additional_properties = \ - self.vllm_config.decoding_config.disable_additional_properties + self.vllm_config.structured_output_config.disable_additional_properties self.ll_tokenizer = llguidance_hf.from_tokenizer( self.tokenizer, self.vocab_size) diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 88544565e544..3991ef42cf0b 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_output_config.disable_any_whitespace if isinstance(self.tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly.
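For reviewers, a minimal usage sketch of the renamed config (illustrative only, not part of the patch; the backend value and flag below are arbitrary examples, assuming the import path shown in the diff):

# Illustrative sketch, not part of the patch above: call sites that previously
# read vllm_config.decoding_config / engine.get_decoding_config() now use the
# renamed field and accessor introduced by this change.
from vllm.config import StructuredOutputConfig

# Per __post_init__, disable_any_whitespace requires the xgrammar or guidance backend.
cfg = StructuredOutputConfig(backend="xgrammar", disable_any_whitespace=True)

# vllm_config.structured_output_config and engine.get_structured_output_config()
# expose an object with these same fields.
print(cfg.backend, cfg.disable_any_whitespace)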