Skip to content

Commit 525f055

Browse files
southfreebird (Sergei Skvortsov)
authored and committed
Add support for guidance decoding backend
Signed-off-by: southfreebird <yvorott@gmail.com>
1 parent 14dcd48 commit 525f055

File tree

2 files changed

+11
-1
lines changed

2 files changed

+11
-1
lines changed

tests/entrypoints/llm/test_guided_generate.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ def get_llm_kwargs(mode: str):
3030
if mode == "autoregressive":
3131
return {}
3232
return {
33-
# the model with fixed vocabulary size
3433
"speculative_model": "Qwen/Qwen2.5-0.5B-Instruct",
3534
"num_speculative_tokens": 3,
3635
}

vllm/model_executor/guided_decoding/guidance_logits_processors.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def __init__(
3636
self.tokenizer_name = tokenizer.name_or_path
3737
self.new_sampling = False
3838
self.initialized = False
39+
self.num_processed_tokens = 0
3940

4041
def _initialize(self):
4142
if self.initialized:
@@ -69,7 +70,17 @@ def __call__(
6970
# to avoid pickling ll_tokenizer and ll_interpreter
7071
self._initialize()
7172

73+
if self.num_processed_tokens > 0 and self.num_processed_tokens >= len(
74+
input_ids):
75+
diff = self.num_processed_tokens - len(input_ids) + 1
76+
self.ll_matcher.rollback(diff)
77+
self.num_processed_tokens -= diff
78+
7279
if self.new_sampling and len(input_ids) > 0:
80+
# The tokens are not truly consumed when the matcher is stopped,
81+
# despite consume_token returning True. This is a workaround.
82+
self.num_processed_tokens += 1 if not self.ll_matcher.is_stopped(
83+
) else 0
7384
self.ll_matcher.consume_token(input_ids[-1])
7485
err = self.ll_matcher.get_error()
7586
if err:

0 commit comments

Comments (0)