2 changes: 1 addition & 1 deletion docs/api/README.md
@@ -18,7 +18,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.PromptAdapterConfig][]
- [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][]
- [vllm.config.DecodingConfig][]
- [vllm.config.StructuredOutputConfig][]
- [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][]
2 changes: 1 addition & 1 deletion tests/async_engine/test_async_llm_engine.py
@@ -129,7 +129,7 @@ async def test_new_requests_event():
engine = MockAsyncLLMEngine()
assert engine.get_model_config() is not None
assert engine.get_tokenizer() is not None
assert engine.get_decoding_config() is not None
assert engine.get_structured_output_config() is not None


def start_engine():
20 changes: 12 additions & 8 deletions vllm/config.py
@@ -3539,8 +3539,11 @@ def get_served_model_name(model: str,

@config
@dataclass
class DecodingConfig:
"""Dataclass which contains the decoding strategy of the engine."""
class StructuredOutputConfig:
"""
Dataclass which contains parameters for
structured output / guided decoding.
"""

@property
@deprecated(
@@ -3610,7 +3613,7 @@ def __post_init__(self):
and self.backend not in ("xgrammar", "guidance")):
raise ValueError("disable_any_whitespace is only supported for "
"xgrammar and guidance backends.")
if (self.disable_additional_properties and self.backend != "guidance"):
if self.disable_additional_properties and self.backend != "guidance":
raise ValueError("disable_additional_properties is only supported "
"for the guidance backend.")

@@ -4298,8 +4301,9 @@ class VllmConfig:
"""LoRA configuration."""
speculative_config: Optional[SpeculativeConfig] = None
"""Speculative decoding configuration."""
decoding_config: DecodingConfig = field(default_factory=DecodingConfig)
"""Decoding configuration."""
structured_output_config: StructuredOutputConfig = field(
default_factory=StructuredOutputConfig)
"""Structured output configuration."""
observability_config: Optional[ObservabilityConfig] = None
"""Observability configuration."""
prompt_adapter_config: Optional[PromptAdapterConfig] = None
@@ -4392,8 +4396,8 @@ def compute_hash(self) -> str:
vllm_factors.append(self.speculative_config.compute_hash())
else:
vllm_factors.append("None")
if self.decoding_config:
vllm_factors.append(self.decoding_config.compute_hash())
if self.structured_output_config:
vllm_factors.append(self.structured_output_config.compute_hash())
else:
vllm_factors.append("None")
if self.observability_config:
@@ -4767,7 +4771,7 @@ def __str__(self):
f"enforce_eager={self.model_config.enforce_eager}, "
f"kv_cache_dtype={self.cache_config.cache_dtype}, "
f" device_config={self.device_config.device}, "
f"decoding_config={self.decoding_config!r}, "
f"structured_output_config={self.structured_output_config!r}, "
f"observability_config={self.observability_config!r}, "
f"seed={self.model_config.seed}, "
f"served_model_name={self.model_config.served_model_name}, "
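The hunks above rename `DecodingConfig` to `StructuredOutputConfig` while keeping its validation logic. For reference, a minimal self-contained sketch of the renamed dataclass follows; the field names and the two `__post_init__` checks come from this diff, while the defaults are illustrative assumptions rather than vLLM's actual values.

```python
from dataclasses import dataclass


@dataclass
class StructuredOutputConfig:
    """Parameters for structured output / guided decoding (illustrative defaults)."""
    backend: str = "auto"
    disable_fallback: bool = False
    disable_any_whitespace: bool = False
    disable_additional_properties: bool = False
    reasoning_backend: str = ""

    def __post_init__(self) -> None:
        # Same consistency checks as the __post_init__ hunk above.
        if (self.disable_any_whitespace
                and self.backend not in ("xgrammar", "guidance")):
            raise ValueError("disable_any_whitespace is only supported for "
                             "xgrammar and guidance backends.")
        if self.disable_additional_properties and self.backend != "guidance":
            raise ValueError("disable_additional_properties is only supported "
                             "for the guidance backend.")
```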
39 changes: 21 additions & 18 deletions vllm/engine/arg_utils.py
@@ -22,17 +22,18 @@

import vllm.envs as envs
from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
ConfigFormat, ConfigType, DecodingConfig,
DetailedTraceModules, Device, DeviceConfig,
DistributedExecutorBackend, GuidedDecodingBackend,
GuidedDecodingBackendV1, HfOverrides, KVEventsConfig,
KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
ModelConfig, ModelDType, ModelImpl, MultiModalConfig,
ConfigFormat, ConfigType, DetailedTraceModules,
Device, DeviceConfig, DistributedExecutorBackend,
GuidedDecodingBackend, GuidedDecodingBackendV1,
HfOverrides, KVEventsConfig, KVTransferConfig,
LoadConfig, LoadFormat, LoRAConfig, ModelConfig,
ModelDType, ModelImpl, MultiModalConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, PromptAdapterConfig,
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
TaskOption, TokenizerMode, TokenizerPoolConfig,
VllmConfig, get_attr_docs, get_field)
StructuredOutputConfig, TaskOption, TokenizerMode,
TokenizerPoolConfig, VllmConfig, get_attr_docs,
get_field)
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -416,12 +417,14 @@ class EngineArgs:
disable_hybrid_kv_cache_manager: bool = (
SchedulerConfig.disable_hybrid_kv_cache_manager)

guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
guided_decoding_backend: GuidedDecodingBackend = \
StructuredOutputConfig.backend
guided_decoding_disable_fallback: bool = \
StructuredOutputConfig.disable_fallback
guided_decoding_disable_any_whitespace: bool = \
DecodingConfig.disable_any_whitespace
StructuredOutputConfig.disable_any_whitespace
guided_decoding_disable_additional_properties: bool = \
DecodingConfig.disable_additional_properties
StructuredOutputConfig.disable_additional_properties
logits_processor_pattern: Optional[
str] = ModelConfig.logits_processor_pattern

@@ -462,7 +465,7 @@ class EngineArgs:
additional_config: dict[str, Any] = \
get_field(VllmConfig, "additional_config")
enable_reasoning: Optional[bool] = None # DEPRECATED
reasoning_parser: str = DecodingConfig.reasoning_backend
reasoning_parser: str = StructuredOutputConfig.reasoning_backend

use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
pt_load_map_location: str = LoadConfig.pt_load_map_location
@@ -608,10 +611,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
**load_kwargs["pt_load_map_location"])

# Guided decoding arguments
guided_decoding_kwargs = get_kwargs(DecodingConfig)
guided_decoding_kwargs = get_kwargs(StructuredOutputConfig)
guided_decoding_group = parser.add_argument_group(
title="DecodingConfig",
description=DecodingConfig.__doc__,
title="StructuredOutputConfig",
description=StructuredOutputConfig.__doc__,
)
guided_decoding_group.add_argument("--guided-decoding-backend",
**guided_decoding_kwargs["backend"])
@@ -1259,7 +1262,7 @@ def create_engine_config(
max_prompt_adapter_token=self.max_prompt_adapter_token) \
if self.enable_prompt_adapter else None

decoding_config = DecodingConfig(
structured_output_config = StructuredOutputConfig(
backend=self.guided_decoding_backend,
disable_fallback=self.guided_decoding_disable_fallback,
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
@@ -1284,7 +1287,7 @@
lora_config=lora_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config,
structured_output_config=structured_output_config,
observability_config=observability_config,
prompt_adapter_config=prompt_adapter_config,
compilation_config=self.compilation_config,
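The `--guided-decoding-*` CLI flags themselves are unchanged; only the config class they populate is renamed. Below is a hedged sketch of that mapping, assuming a parsed-args namespace with the attribute names used by `EngineArgs` above; it is not the actual `create_engine_config` code path.

```python
from argparse import Namespace

from vllm.config import StructuredOutputConfig


def structured_output_config_from_args(args: Namespace) -> StructuredOutputConfig:
    # Each --guided-decoding-* flag still feeds the same field, just on the
    # renamed config class.
    return StructuredOutputConfig(
        backend=args.guided_decoding_backend,
        disable_fallback=args.guided_decoding_disable_fallback,
        disable_any_whitespace=args.guided_decoding_disable_any_whitespace,
        disable_additional_properties=(
            args.guided_decoding_disable_additional_properties),
    )
```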
15 changes: 8 additions & 7 deletions vllm/engine/async_llm_engine.py
@@ -11,8 +11,8 @@
from weakref import ReferenceType

import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
SchedulerConfig, StructuredOutputConfig, VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout
@@ -479,8 +479,9 @@ async def add_request_async(
params = await build_guided_decoding_logits_processor_async(
sampling_params=params,
tokenizer=await self.get_tokenizer_async(lora_request),
default_guided_backend=self.decoding_config.backend,
reasoning_backend=self.decoding_config.reasoning_backend,
default_guided_backend=self.structured_output_config.backend,
reasoning_backend=self.structured_output_config.
reasoning_backend,
model_config=self.model_config)

self._add_processed_request(
@@ -1119,9 +1120,9 @@ async def get_parallel_config(self) -> ParallelConfig:
"""Get the parallel configuration of the vLLM engine."""
return self.engine.get_parallel_config()

async def get_decoding_config(self) -> DecodingConfig:
"""Get the decoding configuration of the vLLM engine."""
return self.engine.get_decoding_config()
async def get_structured_output_config(self) -> StructuredOutputConfig:
"""Get the structured output configuration of the vLLM engine."""
return self.engine.get_structured_output_config()

async def get_scheduler_config(self) -> SchedulerConfig:
"""Get the scheduling configuration of the vLLM engine."""
23 changes: 12 additions & 11 deletions vllm/engine/llm_engine.py
@@ -17,9 +17,9 @@
from typing_extensions import TypeVar

import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ObservabilityConfig, ParallelConfig, SchedulerConfig,
VllmConfig)
from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig,
ParallelConfig, SchedulerConfig,
StructuredOutputConfig, VllmConfig)
from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics_types import StatLoggerBase, Stats
@@ -221,7 +221,7 @@ def __init__(
self.device_config = vllm_config.device_config
self.speculative_config = vllm_config.speculative_config # noqa
self.load_config = vllm_config.load_config
self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa
self.structured_output_config = vllm_config.structured_output_config or StructuredOutputConfig( # noqa
)
self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa
self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa
@@ -840,9 +840,9 @@ def get_parallel_config(self) -> ParallelConfig:
"""Gets the parallel configuration."""
return self.parallel_config

def get_decoding_config(self) -> DecodingConfig:
"""Gets the decoding configuration."""
return self.decoding_config
def get_structured_output_config(self) -> StructuredOutputConfig:
"""Gets the structured output configuration."""
return self.structured_output_config

def get_scheduler_config(self) -> SchedulerConfig:
"""Gets the scheduler configuration."""
@@ -2042,17 +2042,18 @@ def _build_logits_processors(

tokenizer = self.get_tokenizer(lora_request=lora_request)
guided_decoding.backend = guided_decoding.backend or \
self.decoding_config.backend
self.structured_output_config.backend

if self.decoding_config.reasoning_backend:
if self.structured_output_config.reasoning_backend:
logger.debug("Building with reasoning backend %s",
self.decoding_config.reasoning_backend)
self.structured_output_config.reasoning_backend)

processor = get_local_guided_decoding_logits_processor(
guided_params=guided_decoding,
tokenizer=tokenizer,
model_config=self.model_config,
reasoning_backend=self.decoding_config.reasoning_backend,
reasoning_backend=self.structured_output_config.
reasoning_backend,
)
if processor:
logits_processors.append(processor)
16 changes: 8 additions & 8 deletions vllm/engine/multiprocessing/client.py
@@ -16,7 +16,7 @@
from zmq.asyncio import Socket

from vllm import PoolingParams
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig
from vllm.core.scheduler import SchedulerOutputs
# yapf conflicts with isort for this block
# yapf: disable
@@ -96,7 +96,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
# Get the configs.
self.vllm_config = engine_config
self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config
self.structured_output_config = engine_config.structured_output_config

# Create the tokenizer group.
self.tokenizer = init_tokenizer_from_configs(
@@ -381,8 +381,8 @@ async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
async def get_vllm_config(self) -> VllmConfig:
return self.vllm_config

async def get_decoding_config(self) -> DecodingConfig:
return self.decoding_config
async def get_structured_output_config(self) -> StructuredOutputConfig:
return self.structured_output_config

async def get_model_config(self) -> ModelConfig:
return self.model_config
@@ -544,11 +544,11 @@ async def _process_request(
build_guided_decoding_logits_processor_async(
sampling_params=params,
tokenizer=await self.get_tokenizer(lora_request),
default_guided_backend=(self.decoding_config.backend
if self.decoding_config
else DecodingConfig.backend),
default_guided_backend=(self.structured_output_config.backend
if self.structured_output_config
else StructuredOutputConfig.backend),
model_config=self.model_config,
reasoning_backend=self.decoding_config.reasoning_backend,
reasoning_backend=self.structured_output_config.reasoning_backend,
)

# 1) Create output queue for this requests.
6 changes: 3 additions & 3 deletions vllm/engine/protocol.py
@@ -6,7 +6,7 @@
from typing import AsyncGenerator, Mapping, Optional

from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
from vllm.config import DecodingConfig, ModelConfig, VllmConfig
from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig
from vllm.core.scheduler import SchedulerOutputs
from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -250,8 +250,8 @@ async def get_model_config(self) -> ModelConfig:
...

@abstractmethod
async def get_decoding_config(self) -> DecodingConfig:
"""Get the decoding configuration of the vLLM engine."""
async def get_structured_output_config(self) -> StructuredOutputConfig:
"""Get the structured output configuration of the vLLM engine."""
...

@abstractmethod
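Callers of the `EngineClient` protocol switch to the renamed accessor. A minimal usage sketch, assuming any of the client implementations updated in this diff:

```python
from vllm.engine.protocol import EngineClient


async def log_structured_output_backend(engine_client: EngineClient) -> None:
    # Previously: config = await engine_client.get_decoding_config()
    config = await engine_client.get_structured_output_config()
    print("structured output backend:", config.backend)
```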
6 changes: 3 additions & 3 deletions vllm/v1/engine/async_llm.py
@@ -8,7 +8,7 @@
import numpy as np

import vllm.envs as envs
from vllm.config import ModelConfig, VllmConfig
from vllm.config import ModelConfig, StructuredOutputConfig, VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
@@ -526,8 +526,8 @@ async def get_vllm_config(self) -> VllmConfig:
async def get_model_config(self) -> ModelConfig:
return self.model_config

async def get_decoding_config(self):
raise ValueError("Not Supported on V1 yet.")
async def get_structured_output_config(self) -> "StructuredOutputConfig":
return self.vllm_config.structured_output_config

async def get_input_preprocessor(self) -> InputPreprocessor:
return self.processor.input_preprocessor
6 changes: 3 additions & 3 deletions vllm/v1/engine/processor.py
@@ -40,7 +40,7 @@ def __init__(
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config
self.decoding_config = vllm_config.decoding_config
self.structured_output_config = vllm_config.structured_output_config
self.tokenizer = tokenizer

self.generation_config_fields = (
@@ -149,10 +149,10 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
"not enabled!")

def _validate_structured_output(self, params: SamplingParams) -> None:
if not params.guided_decoding or not self.decoding_config:
if not params.guided_decoding or not self.structured_output_config:
return

engine_level_backend = self.decoding_config.backend
engine_level_backend = self.structured_output_config.backend
if params.guided_decoding.backend:
# Request-level backend selection is not supported in V1.
# The values may differ if `params` is reused and was set
3 changes: 2 additions & 1 deletion vllm/v1/structured_output/__init__.py
@@ -51,7 +51,8 @@ def __init__(self, vllm_config: VllmConfig):
scheduler_config=self.vllm_config.scheduler_config,
lora_config=self.vllm_config.lora_config,
).get_lora_tokenizer(None)
reasoning_backend = vllm_config.decoding_config.reasoning_backend
reasoning_backend = (
vllm_config.structured_output_config.reasoning_backend)
if reasoning_backend:
reasoner_cls = ReasoningParserManager.get_reasoning_parser(
reasoning_backend)
4 changes: 2 additions & 2 deletions vllm/v1/structured_output/backend_guidance.py
@@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend):

def __post_init__(self):
self.disable_any_whitespace = \
self.vllm_config.decoding_config.disable_any_whitespace
self.vllm_config.structured_output_config.disable_any_whitespace
self.disable_additional_properties = \
self.vllm_config.decoding_config.disable_additional_properties
self.vllm_config.structured_output_config.disable_additional_properties

self.ll_tokenizer = llguidance_hf.from_tokenizer(
self.tokenizer, self.vocab_size)
2 changes: 1 addition & 1 deletion vllm/v1/structured_output/backend_xgrammar.py
@@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend):

def __post_init__(self):
self.disable_any_whitespace = \
self.vllm_config.decoding_config.disable_any_whitespace
self.vllm_config.structured_output_config.disable_any_whitespace

if isinstance(self.tokenizer, MistralTokenizer):
# NOTE: ideally, xgrammar should handle this accordingly.
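For downstream code the migration is a one-for-one rename on `VllmConfig` and the engine interfaces. A small sketch, assuming only the attribute and field names introduced in this diff:

```python
from vllm.config import VllmConfig


def read_structured_output_flags(vllm_config: VllmConfig) -> tuple[str, bool]:
    # Old attribute: vllm_config.decoding_config
    # New attribute: vllm_config.structured_output_config
    cfg = vllm_config.structured_output_config
    return cfg.backend, cfg.disable_any_whitespace
```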