18 changes: 15 additions & 3 deletions tests/entrypoints/llm/test_guided_generate.py
@@ -23,11 +23,23 @@
]


@pytest.fixture(scope="module")
def llm():
@pytest.fixture(scope="module", params=["autoregressive", "speculative"])
def llm(request):
Collaborator

I received a single test failure when running this test file on your branch. Below is the full log. It seems to indicate a guidance failure.

_________________________________________________________________________________________________________________________________________ test_json_with_any_whitespace_disabled[speculative] _________________________________________________________________________________________________________________________________________

llm = <weakproxy at 0x7f57833a1850 to LLM at 0x7f57919e88f0>

    @pytest.mark.skip_global_cleanup
    def test_json_with_any_whitespace_disabled(llm):
    
        class ResponseSchema(BaseModel):
            clarifying_question: str
            cost_per_serving: str
            calories: str
            type_dish_ids: str
            type_meal_ids: str
            product_ids: list[str]
            exclude_product_ids: list[str]
            allergen_ids: list[str]
            total_cooking_time: str
            kitchen_ids: str
            holiday_ids: str
    
        # Note: Without this setting, the response is sometimes full of `\n`
        # for some models. This option prevents that.
        guided_decoding_backend = 'xgrammar:disable-any-whitespace'
    
        schema = ResponseSchema.model_json_schema()
        guided_params = GuidedDecodingParams(json=schema,
                                             backend=\
                                               guided_decoding_backend)
        sampling_params = SamplingParams(max_tokens=2000,
                                         frequency_penalty=0,
                                         presence_penalty=-1.1,
                                         repetition_penalty=1.3,
                                         guided_decoding=guided_params)
    
        prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You"
                  "are a helpful assistant.<|im_end|>\n<|im_start|>user\nI want a "
                  "quick launch fast with $10.<|im_end|>\n<|im_start|>assistant\n")
        outputs = llm.generate(prompts=prompt,
                               sampling_params=sampling_params,
                               use_tqdm=True)
    
        assert outputs is not None
    
        for output in outputs:
            assert output is not None
            assert isinstance(output, RequestOutput)
    
            generated_text = output.outputs[0].text
            assert generated_text is not None
            assert "\n" not in generated_text
    
            # Parse to verify it is valid JSON
>           parsed_json = json.loads(generated_text)

tests/entrypoints/llm/test_guided_generate.py:387: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.12/json/__init__.py:346: in loads
    return _default_decoder.decode(s)
/usr/lib/python3.12/json/decoder.py:337: in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <json.decoder.JSONDecoder object at 0x7f5923552570>, s = '{"clarifying_question": "", "cost_per_serving": ">$5", "calories": "}} Can you provide more context or clarify your q...onTemArray<MultipartUploadRequest<TemData,Versions.MLVersionInfo,urlMap>", "kitchen_ids": "$nil", "holiday_ids": "{}"}', idx = 0

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` beginning with
        a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.
    
        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.
    
        """
        try:
>           obj, end = self.scan_once(s, idx)
E           json.decoder.JSONDecodeError: Invalid control character at: line 1 column 580 (char 579)

/usr/lib/python3.12/json/decoder.py:353: JSONDecodeError
-------------------------------------------------------------------------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------------------------------------------------------------------------
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.55s/it, est. speed input: 8.79 toks/s, output: 136.43 toks/s]
========================================================================================================================================================== warnings summary ===========================================================================================================================================================
tests/entrypoints/llm/test_guided_generate.py::test_guided_grammar[autoregressive-outlines]
tests/entrypoints/llm/test_guided_generate.py::test_guided_grammar[speculative-outlines]
  /home/benchislett/Repos/vllm/.venv/lib/python3.12/site-packages/outlines/fsm/guide.py:110: UserWarning: Outlines' public *community-contributed* CFG structured generation is experimental. Please review https://dottxt-ai.github.io/outlines/latest/reference/generation/cfg#disclaimer
    warnings.warn(

tests/entrypoints/llm/test_guided_generate.py::test_validation_against_both_guided_decoding_options[autoregressive]
tests/entrypoints/llm/test_guided_generate.py::test_validation_against_both_guided_decoding_options[speculative]
  /home/benchislett/Repos/vllm/vllm/entrypoints/llm.py:462: DeprecationWarning: guided_options_request is deprecated, use SamplingParams.guided_decoding instead
    self._validate_and_add_requests(

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
======================================================================================================================================================= short test summary info =======================================================================================================================================================
FAILED tests/entrypoints/llm/test_guided_generate.py::test_json_with_any_whitespace_disabled[speculative] - json.decoder.JSONDecodeError: Invalid control character at: line 1 column 580 (char 579)

Contributor Author

It seems stochastic, but it works for me with seed=0

Collaborator

This tends to be the case with guidance failures: because a well-informed prompt is the ideal setup for guided decoding to succeed, the model often follows the schema even when guidance is not actually being enforced. That is why thorough testing is needed to catch cases where guidance silently fails to take effect. Guidance should guarantee adherence, so even a single failure is alarming to me.

Contributor Author

Agreed.
The problem is that generation can fail to stop after the closing "}", regardless of whether speculative decoding is used. As I understand it, this is fixed on main by specifying the seed.

My point is that this does not seem to be a problem with speculative decoding itself. Still, it is unexpected behaviour to me.

Collaborator

It does not seem clear to me that this is what is happening. To my understanding, when the matcher reaches a terminal state it will restrict the next token to be the end-of-sequence token, terminating the completion. Why would it fail to stop generating output?

The output from my above error report does not look like this is what happened:

{"clarifying_question": "", "cost_per_serving": ">$5", "calories": "}} Can you provide more context or clarify your q...onTemArray<MultipartUploadRequest<TemData,Versions.MLVersionInfo,urlMap>", "kitchen_ids": "$nil", "holiday_ids": "{}"}

This looks like a guidance failure, where the matcher state became out of sync with the completion, resulting in incorrect output.
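
For reference, a minimal sketch of the termination behaviour described above (illustrative pseudocode, not vLLM's or xgrammar's actual API; the matcher object and its is_terminated() / allowed_token_ids() methods are assumed names):

# Illustrative only: once the grammar matcher reaches a terminal state, the
# logits processor leaves only EOS legal, which ends the completion.
import torch

def apply_grammar_mask(matcher, logits: torch.Tensor,
                       eos_token_id: int) -> torch.Tensor:
    mask = torch.full_like(logits, float("-inf"))
    if matcher.is_terminated():  # assumed method name
        # Terminal state: only end-of-sequence is legal, so generation should
        # stop right after the closing "}" instead of sampling more text.
        mask[eos_token_id] = 0.0
    else:
        mask[list(matcher.allowed_token_ids())] = 0.0  # assumed method name
    return logits + mask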

Contributor Author

I checked the tests again, and guided output does appear to work correctly. The problem lies in the control characters (e.g., \t in this case) that the model can place inside string fields; xgrammar does not seem to reject them.

I also found that a model without speculative decoding can generate them as well, so I suppose this is not an issue introduced by this PR.

Collaborator

I don't understand what you mean. Does this imply that there is a bug with xgrammar?

Contributor Author

If generating control characters inside a JSON string field (output that can still be loaded with strict=False) counts as a bug, then yes.
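
For reference, the strict=False behaviour mentioned here is plain standard-library behaviour and is easy to reproduce (a minimal sketch; the tab-bearing payload below is made up):

import json

# A raw (unescaped) tab inside a JSON string, similar to what the model emitted.
raw = '{"clarifying_question": "line one\tline two"}'

try:
    json.loads(raw)  # default strict=True rejects raw control characters
except json.JSONDecodeError as err:
    print(err)       # Invalid control character at: line 1 column ...

print(json.loads(raw, strict=False))  # parses: {'clarifying_question': 'line one\tline two'}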


def get_llm_kwargs(mode: str):
if mode == "autoregressive":
return {}
return {
"speculative_config": {
"model": "Qwen/Qwen2.5-0.5B-Instruct",
"num_speculative_tokens": 3,
},
}

test_llm_kwargs = get_llm_kwargs(request.param)
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(model=MODEL_NAME, max_model_len=1024, seed=0)
llm = LLM(model=MODEL_NAME, max_model_len=1024, seed=0, **test_llm_kwargs)

with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
19 changes: 13 additions & 6 deletions vllm/engine/async_llm_engine.py
@@ -13,7 +13,8 @@

import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig)
ParallelConfig, SchedulerConfig, SpeculativeConfig,
VllmConfig)
from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout
@@ -511,7 +512,8 @@ async def add_request_async(
default_guided_backend=self.decoding_config.
guided_decoding_backend,
reasoning_backend=self.decoding_config.reasoning_backend,
model_config=self.model_config)
model_config=self.model_config,
speculative_config=self.speculative_config)

self._add_processed_request(
request_id=request_id,
@@ -536,9 +538,13 @@ async def collective_rpc_async(self,


async def build_guided_decoding_logits_processor_async(
sampling_params: SamplingParams, tokenizer: AnyTokenizer,
default_guided_backend: str, reasoning_backend: Optional[str],
model_config: ModelConfig) -> SamplingParams:
sampling_params: SamplingParams,
tokenizer: AnyTokenizer,
default_guided_backend: str,
reasoning_backend: Optional[str],
model_config: ModelConfig,
speculative_config: Optional[SpeculativeConfig] = None
) -> SamplingParams:
"""Constructs logits processors based on the guided_decoding,
logits_bias, and allowed_token_ids fields in sampling_params. Deletes
those fields and adds the constructed logits processors to the
@@ -564,7 +570,8 @@ async def build_guided_decoding_logits_processor_async(
guided_params=guided_decoding,
tokenizer=tokenizer,
reasoning_backend=reasoning_backend,
model_config=model_config)
model_config=model_config,
speculative_config=speculative_config)

if processor:
if sampling_params.logits_processors is None:
2 changes: 1 addition & 1 deletion vllm/engine/llm_engine.py
@@ -2102,7 +2102,7 @@ def _build_logits_processors(
tokenizer=tokenizer,
model_config=self.model_config,
reasoning_backend=self.decoding_config.reasoning_backend,
)
speculative_config=self.speculative_config)
if processor:
logits_processors.append(processor)

2 changes: 2 additions & 0 deletions vllm/engine/multiprocessing/client.py
@@ -96,6 +96,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig,
self.vllm_config = engine_config
self.model_config = engine_config.model_config
self.decoding_config = engine_config.decoding_config
self.speculative_config = engine_config.speculative_config

# Create the tokenizer group.
self.tokenizer = init_tokenizer_from_configs(
@@ -620,6 +621,7 @@ async def _process_request(
else DecodingConfig.guided_decoding_backend),
model_config=self.model_config,
reasoning_backend=self.decoding_config.reasoning_backend,
speculative_config=self.speculative_config,
)

# 1) Create output queue for this requests.
22 changes: 14 additions & 8 deletions vllm/model_executor/guided_decoding/__init__.py
@@ -2,7 +2,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Optional

from vllm.logger import init_logger
from vllm.model_executor.guided_decoding.utils import (
@@ -13,7 +13,7 @@
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer

from vllm.config import ModelConfig
from vllm.config import ModelConfig, SpeculativeConfig
from vllm.logits_process import LogitsProcessor
from vllm.sampling_params import GuidedDecodingParams

@@ -100,6 +100,7 @@ async def get_guided_decoding_logits_processor(
guided_params: GuidedDecodingParams,
tokenizer: PreTrainedTokenizer,
model_config: ModelConfig,
speculative_config: Optional[SpeculativeConfig] = None,
reasoning_backend: str | None = None) -> LogitsProcessor | None:

reasoner = None
@@ -126,23 +127,27 @@
from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa
get_local_xgrammar_guided_decoding_logits_processor)
return get_local_xgrammar_guided_decoding_logits_processor(
guided_params, tokenizer, model_config, reasoner)
guided_params, tokenizer, model_config, reasoner,
speculative_config)
if guided_params.backend_name == 'guidance':
from vllm.model_executor.guided_decoding.guidance_decoding import (
get_local_guidance_guided_decoding_logits_processor)
return get_local_guidance_guided_decoding_logits_processor(
guided_params, tokenizer)

raise ValueError(
f"Unknown guided decoding backend '{guided_params.backend}'. "
"Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar', 'guidance'"
)


def get_local_guided_decoding_logits_processor(
guided_params: GuidedDecodingParams,
tokenizer: PreTrainedTokenizer,
model_config: ModelConfig,
reasoning_backend: str | None = None) -> LogitsProcessor | None:
guided_params: GuidedDecodingParams,
tokenizer: PreTrainedTokenizer,
model_config: ModelConfig,
reasoning_backend: str | None = None,
speculative_config: Optional[SpeculativeConfig] = None
) -> LogitsProcessor | None:
guided_params = maybe_backend_fallback(guided_params)

reasoner = None
@@ -167,7 +172,8 @@ def get_local_guided_decoding_logits_processor(
from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa
get_local_xgrammar_guided_decoding_logits_processor)
return get_local_xgrammar_guided_decoding_logits_processor(
guided_params, tokenizer, model_config, reasoner)
guided_params, tokenizer, model_config, reasoner,
speculative_config)
if guided_params.backend_name == 'guidance':
from vllm.model_executor.guided_decoding.guidance_decoding import (
get_local_guidance_guided_decoding_logits_processor)
11 changes: 11 additions & 0 deletions vllm/model_executor/guided_decoding/guidance_logits_processors.py
@@ -36,6 +36,7 @@ def __init__(
self.tokenizer_name = tokenizer.name_or_path
self.new_sampling = False
self.initialized = False
self.num_processed_tokens = 0

def _initialize(self):
if self.initialized:
@@ -69,7 +70,17 @@ def __call__(
# to avoid pickling ll_tokenizer and ll_interpreter
self._initialize()

if self.num_processed_tokens > 0 and self.num_processed_tokens >= len(
input_ids):
diff = self.num_processed_tokens - len(input_ids) + 1
self.ll_matcher.rollback(diff)
self.num_processed_tokens -= diff

if self.new_sampling and len(input_ids) > 0:
# The tokens are not truly consumed when the matcher is stopped,
# despite consume_token returning True. This is a workaround.
self.num_processed_tokens += 1 if not self.ll_matcher.is_stopped(
) else 0
self.ll_matcher.consume_token(input_ids[-1])
err = self.ll_matcher.get_error()
if err:
Expand Down
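
For readers following the speculative-decoding interaction, a toy sketch of the bookkeeping added in this hunk (ToyMatcher is hypothetical and only mimics the rollback/consume semantics of ll_matcher, not llguidance's real API; the is_stopped() special case from the diff is omitted for brevity):

class ToyMatcher:
    """Stand-in for ll_matcher: records consumed tokens, supports rollback."""

    def __init__(self):
        self.consumed: list[int] = []

    def consume_token(self, tok: int) -> None:
        self.consumed.append(tok)

    def rollback(self, n: int) -> None:
        # Undo the last n consumed tokens.
        del self.consumed[len(self.consumed) - n:]


def sync_and_consume(matcher: ToyMatcher, num_processed: int,
                     input_ids: list[int]) -> int:
    """Mirror the hunk above: when rejected draft tokens shrink input_ids,
    roll the matcher back so input_ids[-1] becomes the next token to consume."""
    if num_processed > 0 and num_processed >= len(input_ids):
        diff = num_processed - len(input_ids) + 1
        matcher.rollback(diff)
        num_processed -= diff
    matcher.consume_token(input_ids[-1])
    return num_processed + 1


# Example: the matcher has consumed 5 tokens, but 2 speculative tokens were
# rejected, so the engine now presents 4 accepted tokens (the last one is new).
m = ToyMatcher()
for tok in [1, 2, 3, 4, 5]:
    m.consume_token(tok)
processed = sync_and_consume(m, num_processed=5, input_ids=[1, 2, 3, 9])
assert m.consumed == [1, 2, 3, 9] and processed == 4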