2 files changed: +11 −1 lines changed

@@ -30,7 +30,6 @@ def get_llm_kwargs(mode: str):
     if mode == "autoregressive":
         return {}
     return {
-        # the model with fixed vocabulary size
         "speculative_model": "Qwen/Qwen2.5-0.5B-Instruct",
         "num_speculative_tokens": 3,
     }
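For context, here is a minimal sketch of how the returned kwargs might be wired up, assuming the test forwards them to vLLM's `LLM` entrypoint (where `speculative_model` and `num_speculative_tokens` map onto the speculative-decoding engine arguments). The target model name and prompt below are hypothetical; the fixture itself is not shown in this diff.

```python
# Hypothetical usage sketch (not part of the diff): assumes these kwargs
# are forwarded straight to vLLM's LLM entrypoint.
from vllm import LLM

llm_kwargs = {
    "speculative_model": "Qwen/Qwen2.5-0.5B-Instruct",  # draft model
    "num_speculative_tokens": 3,  # draft tokens proposed per verify step
}
# The target model name below is an assumption for illustration only.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", **llm_kwargs)
outputs = llm.generate(["Write a JSON object with a name field:"])
```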

vllm/model_executor/guided_decoding

@@ -36,6 +36,7 @@ def __init__(
         self.tokenizer_name = tokenizer.name_or_path
         self.new_sampling = False
         self.initialized = False
+        self.num_processed_tokens = 0

     def _initialize(self):
         if self.initialized:
@@ -69,7 +70,17 @@ def __call__(
         # to avoid pickling ll_tokenizer and ll_interpreter
         self._initialize()

+        if self.num_processed_tokens > 0 and self.num_processed_tokens >= len(
+                input_ids):
+            diff = self.num_processed_tokens - len(input_ids) + 1
+            self.ll_matcher.rollback(diff)
+            self.num_processed_tokens -= diff
+
         if self.new_sampling and len(input_ids) > 0:
+            # The tokens are not truly consumed when the matcher is stopped,
+            # despite consume_token returning True. This is a workaround.
+            self.num_processed_tokens += 1 if not self.ll_matcher.is_stopped(
+            ) else 0
             self.ll_matcher.consume_token(input_ids[-1])
             err = self.ll_matcher.get_error()
             if err:
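The rollback arithmetic is the heart of this change: with speculative decoding, rejected draft tokens can make `input_ids` shorter than the number of tokens the matcher has already consumed, so the matcher must be rolled back until exactly `len(input_ids) - 1` consumed tokens remain before the newest token is consumed. Below is a self-contained simulation of that bookkeeping; `MockMatcher` is a hypothetical stand-in for the llguidance matcher, and the `new_sampling` gate and stopped-matcher workaround are simplified away, so only the counter logic mirrors the diff.

```python
class MockMatcher:
    """Hypothetical stand-in for the llguidance matcher in the diff."""

    def __init__(self):
        self.consumed = []   # tokens the grammar matcher has accepted
        self.stopped = False

    def consume_token(self, tok):
        self.consumed.append(tok)
        return True

    def rollback(self, n):
        # Discard the last n consumed tokens.
        del self.consumed[len(self.consumed) - n:]

    def is_stopped(self):
        return self.stopped


class Processor:
    def __init__(self):
        self.ll_matcher = MockMatcher()
        self.num_processed_tokens = 0

    def __call__(self, input_ids):
        # If rejected draft tokens made input_ids shorter than what the
        # matcher consumed, roll back so exactly len(input_ids) - 1 tokens
        # remain consumed before the newest token is consumed below.
        if self.num_processed_tokens > 0 and self.num_processed_tokens >= len(
                input_ids):
            diff = self.num_processed_tokens - len(input_ids) + 1
            self.ll_matcher.rollback(diff)
            self.num_processed_tokens -= diff
        if len(input_ids) > 0:
            if not self.ll_matcher.is_stopped():
                self.num_processed_tokens += 1
            self.ll_matcher.consume_token(input_ids[-1])


p = Processor()
for step in ([1], [1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5]):
    p(step)                          # normal decoding: one new token per call
assert p.num_processed_tokens == 5

p([1, 2, 3, 9])                      # drafts rejected: rollback 5 - 4 + 1 = 2,
assert p.num_processed_tokens == 4   # then the replacement token 9 is consumed
assert p.ll_matcher.consumed == [1, 2, 3, 9]
```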