Commit a97b172

--wip--

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>

1 parent: 81aadb6

File tree: 5 files changed (+56 −11 lines)


vllm/v1/core/sched/scheduler.py (13 additions, 11 deletions)
@@ -652,15 +652,16 @@ def update_from_output(
             # the outer lists can be of length > 1.
             new_logprobs = logprobs.slice(req_index, req_index + 1)
 
-            if new_token_ids and request.use_structured_output:
-                # NOTE: structured_output_request
-                # should not be None if use_structured_output, we have
-                # check above, so safe to ignore type warning
-                request.structured_output_request.grammar.accept_tokens(  # type: ignore[union-attr]
-                    req_id, new_token_ids)
-
-            # Get prompt logprobs for this request.
-            prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id)
+            # --- Jump-forward decoding for structured output requests ---
+            if request.use_structured_output:
+                batch_index = scheduler_output.structured_output_request_ids.get(
+                    req_id, 0)
+                jump_tokens = self.structured_output_manager.jump_forward_tokens(
+                    request, batch_index)
+                if jump_tokens:
+                    new_token_ids.extend(jump_tokens)
+            # --- End jump-forward decoding ---
+
             if new_token_ids:
                 # Add EngineCoreOutput for this Request.
                 outputs.append(
@@ -669,12 +670,13 @@ def update_from_output(
                         new_token_ids=new_token_ids,
                         finish_reason=request.get_finished_reason(),
                         new_logprobs=new_logprobs,
-                        new_prompt_logprobs_tensors=prompt_logprobs_tensors,
+                        new_prompt_logprobs_tensors=prompt_logprobs_dict.get(
+                            req_id),
                         stop_reason=request.stop_reason,
                         events=request.take_events()))
             else:
                 # Invariant: EngineCore returns no partial prefill outputs.
-                assert not prompt_logprobs_tensors
+                assert not prompt_logprobs_dict.get(req_id)
 
             self.scheduled_req_ids.remove(req_id)
             if not stopped:
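The hunk above wires jump-forward decoding into the scheduler's per-request
output loop: after the sampled tokens for a request are known, the
structured-output manager is asked for any tokens the grammar now forces, and
those are appended to new_token_ids without spending a model forward pass on
them. A minimal sketch of that control flow, where FakeGrammar and
forced_next_token are hypothetical stand-ins for the real vllm grammar and
manager objects, not part of the vllm API:

# Toy FSM over token IDs: some states force exactly one next token.
class FakeGrammar:

    def __init__(self, forced: dict[int, int]):
        self.state = 0
        self.forced = forced  # state -> the single token ID it allows

    def forced_next_token(self) -> int | None:
        return self.forced.get(self.state)

    def accept(self, token_id: int) -> None:
        self.state += 1


def jump_forward(grammar: FakeGrammar) -> list[int]:
    # Collect every forced token, advancing the FSM as we go.
    jump_tokens: list[int] = []
    while (tok := grammar.forced_next_token()) is not None:
        jump_tokens.append(tok)
        grammar.accept(tok)
    return jump_tokens


# State 0 is unconstrained (the model samples freely); states 1 and 2 each
# force one token, so those tokens are emitted without another forward pass.
grammar = FakeGrammar(forced={1: 90, 2: 34})
new_token_ids = [7]                # token sampled by the model this step
grammar.accept(new_token_ids[0])
new_token_ids.extend(jump_forward(grammar))  # mirrors the hunk above
print(new_token_ids)               # [7, 90, 34]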

vllm/v1/structured_output/__init__.py (29 additions, 0 deletions)
@@ -8,6 +8,7 @@
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
+from vllm.utils import LazyLoader
 from vllm.v1.structured_output.backend_guidance import GuidanceBackend
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                      StructuredOutputGrammar)
@@ -17,8 +18,12 @@
     import numpy as np
     import numpy.typing as npt
     import torch
+    import xgrammar.testing as xgr_testing
 
     from vllm.v1.request import Request
+else:
+    torch = LazyLoader('torch', globals(), 'torch')
+    xgr_testing = LazyLoader('xgr_testing', globals(), 'xgrammar.testing')
 
 logger = init_logger(__name__)
 
@@ -122,3 +126,28 @@ def grammar_bitmask(
         # np.ndarray, because that is much more efficient for serialization
         # and deserialization when sending this to the GPU workers.
         return bitmask_tensor.numpy()
+
+    def jump_forward_tokens(self, request, batch_index) -> list[int]:
+        """
+        For xgrammar-based structured output requests, repeatedly check if the
+        grammar bitmask allows exactly one token and, if so, advance the FSM
+        and collect all jump-forward tokens. Returns the jump-forward token IDs.
+        """
+        so_request = request.structured_output_request
+        if so_request is None or so_request.grammar is None:
+            return []
+
+        jump_tokens: list[int] = []
+        # Packed bitmask: one allow/deny bit per vocab entry, 32 per int32.
+        bitmask = torch.zeros((1, (so_request.grammar.vocab_size + 31) // 32),
+                              dtype=torch.int32)
+        so_request.grammar.fill_bitmask(bitmask, 0)
+        is_single, unique_token_id = xgr_testing._is_single_token_bitmask(
+            bitmask, so_request.grammar.vocab_size, 0)
+        while is_single and unique_token_id != -1:
+            jump_tokens.append(unique_token_id)
+            so_request.grammar.accept_tokens(request.request_id,
+                                             [unique_token_id])
+            so_request.grammar.fill_bitmask(bitmask, 0)
+            is_single, unique_token_id = xgr_testing._is_single_token_bitmask(
+                bitmask, so_request.grammar.vocab_size, 0)
+        return jump_tokens
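The loop above hinges on xgrammar's packed token bitmask: one allow/deny bit
per vocabulary entry, 32 entries per int32 word, plus a helper that reports
whether exactly one bit is set. A pure-Python sketch of that check, mirroring
the semantics of xgr_testing._is_single_token_bitmask as used here (an
assumption about its behavior, not its actual implementation):

def is_single_token_bitmask(words: list[int],
                            vocab_size: int) -> tuple[bool, int]:
    # Scan the packed words and collect every token whose bit is set.
    allowed = [
        tok for tok in range(vocab_size)
        if (words[tok // 32] >> (tok % 32)) & 1
    ]
    # Exactly one allowed token means the grammar forces it.
    return (True, allowed[0]) if len(allowed) == 1 else (False, -1)


vocab_size = 64
words = [0, 0]                       # 64 tokens -> two int32 words
words[41 // 32] |= 1 << (41 % 32)    # allow only token 41
print(is_single_token_bitmask(words, vocab_size))  # (True, 41)
words[3 // 32] |= 1 << (3 % 32)      # allow token 3 as well
print(is_single_token_bitmask(words, vocab_size))  # (False, -1)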

vllm/v1/structured_output/backend_guidance.py (3 additions, 0 deletions)
@@ -119,6 +119,9 @@ def reset(self):
         # This method may be not needed anymore? TODO
         self.ll_matcher.reset()
 
+    def find_jump_forward_tokens(self) -> list[int]:
+        raise NotImplementedError
+
 
 def serialize_guidance_grammar(request_type: StructuredOutputOptions,
                                grammar_spec: str,

vllm/v1/structured_output/backend_types.py (6 additions, 0 deletions)
@@ -100,3 +100,9 @@ def allocate_token_bitmask(self, max_num_seqs: int):
             max_num_seqs (int): The maximum number of sequences for which
                 to allocate the bitmask.
         """
+
+    @abstractmethod
+    def find_jump_forward_tokens(self) -> list[int]:
+        """
+        Finds the tokens that can be used to jump forward in the grammar.
+        """

vllm/v1/structured_output/backend_xgrammar.py (5 additions, 0 deletions)
@@ -147,3 +147,8 @@ def is_terminated(self) -> bool:
     def reset(self):
         self.num_processed_tokens = 0
         self.matcher.reset()
+
+    def find_jump_forward_tokens(self) -> list[int]:
+        jf_string = self.matcher.find_jump_forward_string()
+        return self.tokenizer.encode(
+            jf_string, add_special_tokens=False) if jf_string else []
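Note that matcher.find_jump_forward_string() returns text rather than token
IDs, so the method above has to encode it before the scheduler can splice the
result into new_token_ids. A sketch of that conversion, using a HuggingFace
tokenizer as a stand-in for self.tokenizer; the model name and sample string
are illustrative only:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

jf_string = '{"name": "'   # e.g. a prefix the grammar forces next
jf_tokens = tokenizer.encode(jf_string, add_special_tokens=False)
print(jf_tokens)           # token IDs ready to append to the output stream

# Caveat: this assumes the forced text tokenizes the same way in isolation
# as it would mid-sequence; a complete implementation has to handle
# retokenization at the boundary.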
