2 files changed: +11 −1 lines changed

@@ -30,7 +30,6 @@ def get_llm_kwargs(mode: str):
     if mode == "autoregressive":
         return {}
     return {
-        # the model with fixed vocabulary size
         "speculative_model": "Qwen/Qwen2.5-0.5B-Instruct",
         "num_speculative_tokens": 3,
     }
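For context, here is a minimal sketch of how the returned kwargs might be wired up, assuming the test forwards them to vLLM's `LLM` entrypoint (where `speculative_model` and `num_speculative_tokens` map onto the speculative-decoding engine arguments). The target model name and prompt below are hypothetical; the fixture itself is not shown in this diff.

```python
# Hypothetical usage sketch (not part of the diff): assumes these kwargs
# are forwarded straight to vLLM's LLM entrypoint.
from vllm import LLM

llm_kwargs = {
    "speculative_model": "Qwen/Qwen2.5-0.5B-Instruct",  # draft model
    "num_speculative_tokens": 3,  # draft tokens proposed per verify step
}
# The target model name below is an assumption for illustration only.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", **llm_kwargs)
outputs = llm.generate(["Write a JSON object with a name field:"])
```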

vllm/model_executor/guided_decoding

@@ -36,6 +36,7 @@ def __init__(
         self.tokenizer_name = tokenizer.name_or_path
         self.new_sampling = False
         self.initialized = False
+        self.num_processed_tokens = 0

     def _initialize(self):
         if self.initialized:
@@ -69,7 +70,17 @@ def __call__(
         # to avoid pickling ll_tokenizer and ll_interpreter
         self._initialize()

+        if self.num_processed_tokens > 0 and self.num_processed_tokens >= len(
+                input_ids):
+            diff = self.num_processed_tokens - len(input_ids) + 1
+            self.ll_matcher.rollback(diff)
+            self.num_processed_tokens -= diff
+
         if self.new_sampling and len(input_ids) > 0:
+            # The tokens are not truly consumed when the matcher is stopped,
+            # despite consume_token returning True. This is a workaround.
+            self.num_processed_tokens += 1 if not self.ll_matcher.is_stopped(
+            ) else 0
             self.ll_matcher.consume_token(input_ids[-1])
             err = self.ll_matcher.get_error()
             if err:
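The rollback arithmetic is the heart of this change: with speculative decoding, rejected draft tokens can make `input_ids` shorter than the number of tokens the matcher has already consumed, so the matcher must be rolled back until exactly `len(input_ids) - 1` consumed tokens remain before the newest token is consumed. Below is a self-contained simulation of that bookkeeping; `MockMatcher` is a hypothetical stand-in for the llguidance matcher, and the `new_sampling` gate and stopped-matcher workaround are simplified away, so only the counter logic mirrors the diff.

```python
class MockMatcher:
    """Hypothetical stand-in for the llguidance matcher in the diff."""

    def __init__(self):
        self.consumed = []   # tokens the grammar matcher has accepted
        self.stopped = False

    def consume_token(self, tok):
        self.consumed.append(tok)
        return True

    def rollback(self, n):
        # Discard the last n consumed tokens.
        del self.consumed[len(self.consumed) - n:]

    def is_stopped(self):
        return self.stopped


class Processor:
    def __init__(self):
        self.ll_matcher = MockMatcher()
        self.num_processed_tokens = 0

    def __call__(self, input_ids):
        # If rejected draft tokens made input_ids shorter than what the
        # matcher consumed, roll back so exactly len(input_ids) - 1 tokens
        # remain consumed before the newest token is consumed below.
        if self.num_processed_tokens > 0 and self.num_processed_tokens >= len(
                input_ids):
            diff = self.num_processed_tokens - len(input_ids) + 1
            self.ll_matcher.rollback(diff)
            self.num_processed_tokens -= diff
        if len(input_ids) > 0:
            if not self.ll_matcher.is_stopped():
                self.num_processed_tokens += 1
            self.ll_matcher.consume_token(input_ids[-1])


p = Processor()
for step in ([1], [1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5]):
    p(step)                          # normal decoding: one new token per call
assert p.num_processed_tokens == 5

p([1, 2, 3, 9])                      # drafts rejected: rollback 5 - 4 + 1 = 2,
assert p.num_processed_tokens == 4   # then the replacement token 9 is consumed
assert p.ll_matcher.consumed == [1, 2, 3, 9]
```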