Draft
Changes from all commits
26 commits
81aadb6
chore: migrate tokenizer init to manager only
aarnphm Apr 14, 2025
a97b172
--wip--
aarnphm Apr 17, 2025
b15d00f
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm Apr 23, 2025
d612f85
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm Apr 26, 2025
26f8a25
chore: remove unused functions
aarnphm Apr 26, 2025
fb4ae73
--wip--
aarnphm Apr 26, 2025
4ddf58c
feat: working version
aarnphm Apr 26, 2025
c744d62
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm Apr 26, 2025
7c41ce0
chore: remove debug print
aarnphm Apr 26, 2025
19e2a5c
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm Apr 27, 2025
812d684
revert: use scheduler_output bitmask
aarnphm Apr 27, 2025
bf5c46c
chore: remove debug print
aarnphm Apr 27, 2025
a1ae3ac
chore: move tokenizer to __init__
aarnphm Apr 28, 2025
535e06e
--wip retokenize--
aarnphm Apr 28, 2025
dbc9455
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm Apr 29, 2025
511db48
feat: jump_and_retokenize
aarnphm Apr 29, 2025
ba6d499
fix: set default rollback to 0
aarnphm Apr 29, 2025
ffb0324
chore: implement static max_rollback_window
aarnphm Apr 29, 2025
a7c8070
chore: add a mock test case --wip--
aarnphm Apr 29, 2025
13b6c19
fix: align output_ids to correct retokenized windows
aarnphm Apr 29, 2025
7d26f48
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm Apr 29, 2025
372bcda
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm Apr 30, 2025
1262acc
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm May 14, 2025
d89a660
merge: branch 'main' of github.com:vllm-project/vllm into feat/jump-f…
aarnphm May 15, 2025
93cd93f
fix: revert bad merge
aarnphm May 15, 2025
f580263
revert: remove jump forward tests implementation for now
aarnphm May 15, 2025
30 changes: 22 additions & 8 deletions vllm/v1/core/sched/scheduler.py
@@ -738,6 +738,28 @@ def update_from_output(
new_token_ids = generated_token_ids
kv_transfer_params = None

# NOTE: We need to advance the FSM first, given that
# the bitmask is applied in the first pass and
# jump-forward is only performed afterwards.
first_pass = True
if new_token_ids and self.structured_output_manager.should_advance(
request):
# NOTE: structured_output_request
# should not be None if use_structured_output; we have
# checked this above, so it is safe to ignore the type warning
request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr]
req_id, new_token_ids)
first_pass = False

# NOTE: We perform retokenization to handle
# tokenizer boundaries, which adds some
# overhead here.
if first_pass and new_token_ids and request.use_structured_output and ( # noqa: E501
jump_tokens :=
self.structured_output_manager.jump_forward_tokens(request)
):
new_token_ids += jump_tokens

# Append generated tokens and check for stop. Note that if
# a request is still being prefilled, we expect the model runner
# to return empty token ids for the request.
@@ -758,14 +780,6 @@ def update_from_output(
# the outer lists can be of length > 1.
new_logprobs = logprobs.slice(req_index, req_index + 1)

if new_token_ids and self.structured_output_manager.should_advance(
request):
# NOTE: structured_output_request
# should not be None if use_structured_output, we have
# check above, so safe to ignore type warning
request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr]
req_id, new_token_ids)

# Add newly generated spec token ids to the request.
if spec_token_ids is not None:
if self.structured_output_manager.should_advance(request):
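In rough terms, the new scheduler code advances the grammar with the freshly sampled tokens first and, only when no grammar advance happened in this pass, appends jump-forward tokens so the stop/length handling below treats them as ordinary output. A compressed sketch of that control flow (stand-in names such as `manager`, not the actual scheduler attributes):

```python
# Sketch only: `manager` stands in for self.structured_output_manager and
# `request` for the vLLM Request; this mirrors the hunk above, nothing more.
def advance_grammar_and_jump(manager, request, req_id: str,
                             new_token_ids: list[int]) -> list[int]:
    advanced = False
    if new_token_ids and manager.should_advance(request):
        # First pass: advance the FSM with the tokens sampled under the bitmask.
        request.structured_output_request.grammar.accept_tokens(
            req_id, new_token_ids)
        advanced = True

    # Only when the grammar was not advanced in this pass, try to jump forward;
    # any returned tokens are appended to the regular output stream.
    if (not advanced and new_token_ids and request.use_structured_output
            and (jump := manager.jump_forward_tokens(request))):
        new_token_ids = new_token_ids + jump
    return new_token_ids
```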
71 changes: 71 additions & 0 deletions vllm/v1/structured_output/__init__.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import itertools
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING, Optional
@@ -183,6 +184,76 @@ def grammar_bitmask(
# and deserialization when sending this to the GPU workers.
return bitmask_tensor.numpy()

def jump_forward_tokens(self, request: Request) -> list[int] | None:
"""
For structured output requests, we will perform
jump_and_retokenize possible divergence based on grammar state
"""
so_request = request.structured_output_request
if TYPE_CHECKING:
assert so_request is not None
assert so_request.grammar is not None
assert self.backend is not None

jf_string = so_request.grammar.find_jump_string()
if not jf_string:
return None

# NOTE: max_rollback_window determines how many tokens
# from all_token_ids are used for retokenization.
# We don't need the whole token_ids,
# for performance reasons (the tokenizer call is blocking).
max_rollback_window = 10

rollback_text_str = self.tokenizer.decode(
request.all_token_ids[-max_rollback_window:])
retokenized_output_ids = self.tokenizer.encode(
rollback_text_str + jf_string,
add_special_tokens=False,
)
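# If the retokenized window reaches back into the prompt, drop everything
# up to and including the last prompt token so that only output tokens are
# compared against original_output_ids below.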
if request.prompt_token_ids[-1] in retokenized_output_ids:
prompt_boundary = retokenized_output_ids.index(
request.prompt_token_ids[-1]) + 1
retokenized_output_ids = retokenized_output_ids[prompt_boundary:]

original_output_ids = request.output_token_ids[
max(0,
len(request.output_token_ids) - len(retokenized_output_ids)):]

# Find the prefix match length
k = sum(1 for _ in itertools.takewhile(
lambda pair: pair[0] == pair[1],
zip(original_output_ids, retokenized_output_ids),
))
retokenized_suffix = retokenized_output_ids[k:]
if k < len(original_output_ids):
so_request.grammar.rollback(len(original_output_ids) - k)

# Validate tokens one by one
accepted_tokens: list[int] = []
num_validated_in_suffix = 0
validation_ok = True
for token in retokenized_suffix:
if so_request.grammar.accept_tokens(request.request_id, [token]):
accepted_tokens.append(token)
num_validated_in_suffix += 1
else:
if num_validated_in_suffix > 0:
so_request.grammar.rollback(num_validated_in_suffix)
validation_ok = False
break

if validation_ok:
return accepted_tokens

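# Validation failed partway: restore the grammar by re-accepting the
# original (non-jumped) suffix tokens; if that also fails, roll them
# back and give up on jump-forward for this step.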
original_suffix_tokens = original_output_ids[num_validated_in_suffix:]
if original_suffix_tokens and not so_request.grammar.accept_tokens(
request.request_id,
original_suffix_tokens,
):
so_request.grammar.rollback(len(original_suffix_tokens))
return None

def should_advance(self, request: Request) -> bool:
if not request.use_structured_output:
return False
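To see why the retokenization and prefix-match bookkeeping above is needed, here is a self-contained toy example (hypothetical greedy tokenizer and vocabulary, no real tokenizer or grammar involved): appending the forced jump string to the decoded text can merge across the old token boundary, so the retokenized ids diverge from the original ones and part of the grammar state has to be rolled back.

```python
import itertools

# Toy greedy longest-match tokenizer -- only to illustrate boundary merging.
VOCAB = {'{"': 0, 'answer': 1, '"': 2, '": "': 3, ': "': 4}

def encode(text: str) -> list[int]:
    ids, i = [], 0
    while i < len(text):
        match = max((t for t in VOCAB if text.startswith(t, i)), key=len)
        ids.append(VOCAB[match])
        i += len(match)
    return ids

generated_text = '{"answer"'   # text produced so far
jump_string = ': "'            # continuation forced by the grammar

original_output_ids = encode(generated_text)                   # [0, 1, 2]
retokenized_output_ids = encode(generated_text + jump_string)  # [0, 1, 3]

# Prefix-match step, as in jump_forward_tokens above.
k = sum(1 for _ in itertools.takewhile(
    lambda pair: pair[0] == pair[1],
    zip(original_output_ids, retokenized_output_ids)))
suffix = retokenized_output_ids[k:]          # tokens to validate: [3]
rollback = len(original_output_ids) - k      # grammar steps to undo: 1
assert (k, suffix, rollback) == (2, [3], 1)
```

Here the grammar is rolled back one step (the lone `'"'` token), and the single retokenized suffix token is then validated and accepted, which is exactly what the token-by-token loop above does.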
4 changes: 4 additions & 0 deletions vllm/v1/structured_output/backend_guidance.py
@@ -170,6 +170,10 @@ def reset(self):
# This method may be not needed anymore? TODO
self.ll_matcher.reset()

def find_jump_string(self) -> str | None:
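# llguidance reports the forced continuation as bytes; decode it to a
# string for the manager, returning None when nothing is forced.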
ff_string = self.ll_matcher.compute_ff_bytes()
return ff_string.decode() if ff_string else None


def serialize_guidance_grammar(
request_type: StructuredOutputOptions,
9 changes: 9 additions & 0 deletions vllm/v1/structured_output/backend_types.py
@@ -29,6 +29,15 @@ class StructuredOutputOptions(enum.Enum):
class StructuredOutputGrammar(ABC):
"""Request-level backend for structured output requests."""

@abstractmethod
def find_jump_string(self) -> str | None:
"""
Find the jump-forward string based on the current grammar state.

Returns:
Optional str: the jump-forward string, or None if there is none
"""

@abstractmethod
def accept_tokens(self, request_id: str, tokens: list[int]) -> bool:
"""
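As a purely illustrative reading of this contract (toy class, not a real vLLM backend): a grammar whose next piece of text is fully determined returns that text, and one whose continuation is unconstrained returns None.

```python
# Hypothetical stand-in, only to illustrate the find_jump_string contract.
class ToyForcedGrammar:
    def __init__(self, forced_text: str | None) -> None:
        self._forced_text = forced_text

    def find_jump_string(self) -> str | None:
        # Empty or missing forced text means there is nothing to jump over.
        return self._forced_text or None

assert ToyForcedGrammar('": "').find_jump_string() == '": "'
assert ToyForcedGrammar("").find_jump_string() is None
```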
4 changes: 4 additions & 0 deletions vllm/v1/structured_output/backend_xgrammar.py
@@ -194,6 +194,10 @@ def reset(self):
self.num_processed_tokens = 0
self.matcher.reset()

def find_jump_string(self) -> str | None:
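# xgrammar returns an empty string when no jump-forward text is available;
# normalize that to None for the manager.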
jf_string = self.matcher.find_jump_forward_string()
return jf_string if jf_string else None


def has_xgrammar_unsupported_json_features(schema: dict[str, Any]) -> bool:
"""Check if JSON schema contains features unsupported by xgrammar."""
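For reference, a rough standalone sketch of the xgrammar calls this backend builds on (assuming xgrammar's documented `GrammarCompiler`/`GrammarMatcher` API and an arbitrary public tokenizer; exact signatures and the forced string may vary between versions):

```python
import xgrammar as xgr
from transformers import AutoTokenizer

# Tokenizer choice is arbitrary; any HF tokenizer works for the sketch.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer_info = xgr.TokenizerInfo.from_huggingface(tok)
compiler = xgr.GrammarCompiler(tokenizer_info)
compiled = compiler.compile_json_schema(
    '{"type": "object", "properties": {"a": {"type": "integer"}}, '
    '"required": ["a"]}')
matcher = xgr.GrammarMatcher(compiled)

# Feed the tokens generated so far, then ask for any forced continuation.
for token_id in tok.encode('{"a', add_special_tokens=False):
    matcher.accept_token(token_id)
print(matcher.find_jump_forward_string())  # e.g. '": ' if the grammar forces it
```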