Commit 528c60c

Author: Pradyun Ramadorai
Add Eagle FSM validation fix and clean up unused patches
PART 1: Eagle + Structured Output FSM Validation Fix
=====================================================

ISSUE: Eagle speculative decoding with structured output crashes with an
AssertionError when the FSM rejects tokens in the scheduled_spec_decode_tokens list.

SYMPTOMS:
- Error: "Failed to advance FSM for request ... for tokens XXX"
- Followed by: AssertionError at vllm/v1/structured_output/__init__.py:263
- Crashes the entire vLLM engine under load with Eagle + tool calling

ROOT CAUSE (partially identified):
- The FSM can terminate mid-validation when accepting a stop token
- The remaining spec tokens are still attempted for validation
- The original code asserts that all scheduled tokens must be valid
- The assertion fails when the FSM rejects tokens after termination

SOLUTION: Implemented a defensive fix in the grammar_bitmask() method:
- Replace the assertion with a conditional check
- If a token is rejected, log a debug message and continue
- Still fill bitmasks for all positions (required by downstream code)
- Makes the code resilient to FSM state mismatches

IMPLEMENTATION:
- New patch: mantle_extensions/patches/eagle_structured_output_fix.py
- Monkey-patches StructuredOutputManager.grammar_bitmask()
- Registered as the 12th patch in the plugin system
- Enabled by default in patch_config.json

TESTING:
✓ Plugin loads successfully with all 12 patches
✓ No more AssertionError crashes
✓ No more 500 Internal Server errors
✓ Eagle + structured output + penalties works correctly
⚠ Expected warnings from xgrammar about the terminated FSM (benign)

NOTES:
- This is a defensive fix without full root-cause understanding
- Possible causes: FSM state mismatch, xgrammar rollback bug, concurrency
- Upstreamable: Yes - should be contributed to vLLM upstream
- Bug exists since PR vllm-project#18879 (May 2025)

PART 2: Clean Up Unused Patch Files
====================================

Removed 3 unused patch files:
1. pr26291_streaming_method.py - Unused reference implementation
2. streaming_patches.py - Unused streaming patch loader
3. qwen3_tool_parser_fix_complete.py - Now implemented in-tree

Updated files:
- mantle_extensions/patches/__init__.py - Removed streaming_patches export
- mantle_extensions/plugin.py - Added a note about the qwen3 in-tree fix

Rationale:
- pr26291 and streaming_patches were never used in production
- The qwen3 fix moved in-tree (line 523) due to an APIServer plugin limitation
- Keeping unused files adds maintenance burden and confusion

SUMMARY:
- Added: 1 new critical fix (eagle_structured_output_fix)
- Removed: 3 unused patch files
- Total active patches: 12 (all enabled and working)

Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
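The core change described above, replacing a hard assertion with a conditional check so a terminated FSM cannot crash validation, can be sketched in isolation. This is a toy illustration of the pattern: ToyGrammar and validate_spec_tokens are illustrative stand-ins, not vLLM's actual API.

```python
# Toy sketch of the defensive-check pattern. ToyGrammar is a minimal FSM
# stand-in, not vLLM's grammar object.

class ToyGrammar:
    """Accepts tokens until a stop token terminates the FSM."""

    def __init__(self, stop_token: int):
        self.stop_token = stop_token
        self.terminated = False

    def is_terminated(self) -> bool:
        return self.terminated

    def accept_tokens(self, tokens: list[int]) -> bool:
        if self.terminated:
            return False  # any token after termination is rejected
        if tokens[-1] == self.stop_token:
            self.terminated = True
        return True


def validate_spec_tokens(grammar: ToyGrammar, spec_tokens: list[int]) -> int:
    """Defensive version: count state advancements instead of asserting."""
    state_advancements = 0
    for token in spec_tokens:
        if grammar.is_terminated():
            continue  # the old assert-based code would crash here
        if grammar.accept_tokens([token]):
            state_advancements += 1
        # rejected tokens are logged and skipped in the real patch
    return state_advancements


# Stop token (99) arrives mid-list; tokens after it must not crash validation.
grammar = ToyGrammar(stop_token=99)
advanced = validate_spec_tokens(grammar, [1, 2, 99, 3, 4])
print(advanced)  # 3: tokens 1, 2 and the stop token advanced the FSM
```

With an assertion in place of the conditional, the tokens after 99 would raise AssertionError; here they are simply skipped and the caller can later roll back the three advancements.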
1 parent 04fb105 commit 528c60c

File tree

7 files changed: +227 -1352 lines changed

mantle_extensions/mantle_extensions/config/patch_config.json

Lines changed: 12 additions & 0 deletions
@@ -148,6 +148,18 @@
       ],
       "upstreamable": true,
       "notes": "IN-TREE MODIFICATION: One-line fix at line 523. Must be in-tree because vLLM has a bug importing tool parser plugins in APIServer process. Fixed condition from 'if not self.json_started and self.parameter_prefix not in delta_text:' to 'if not self.json_started:'. Plugin patch kept for reference but cannot be used."
+    },
+    "eagle_structured_output_fix": {
+      "enabled": true,
+      "category": "REQUIRED",
+      "description": "Eagle + Structured Output FSM validation fix",
+      "class_name": "EagleStructuredOutputPatch",
+      "target": "vllm.v1.structured_output.StructuredOutputManager",
+      "methods": [
+        "grammar_bitmask"
+      ],
+      "upstreamable": true,
+      "notes": "Fixes AssertionError when using Eagle speculative decoding with structured output (tool calling). Replaces assertion with defensive conditional check to handle cases where FSM rejects tokens that are in scheduled list. Root cause unknown (FSM state mismatch, xgrammar rollback bug, or concurrency issue). Bug exists since PR #18879 (May 2025). Remove once upstream fixes this issue."
     }
   }
 },
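Entries like the one added above carry an "enabled" flag that the plugin system honors at load time. A minimal sketch of how such a loader might filter the config follows; load_enabled_patches and the inline CONFIG string are hypothetical, not the actual mantle_extensions loader.

```python
# Hypothetical loader sketch: select only patch entries whose "enabled"
# flag is true, mirroring the patch_config.json structure shown above.
import json

CONFIG = """
{
  "eagle_structured_output_fix": {
    "enabled": true,
    "category": "REQUIRED",
    "class_name": "EagleStructuredOutputPatch",
    "target": "vllm.v1.structured_output.StructuredOutputManager",
    "methods": ["grammar_bitmask"]
  },
  "some_disabled_patch": {"enabled": false, "category": "OPTIONAL"}
}
"""

def load_enabled_patches(config_text: str) -> dict:
    """Return only the patch entries whose 'enabled' flag is true."""
    patches = json.loads(config_text)
    return {name: spec for name, spec in patches.items() if spec.get("enabled")}

enabled = load_enabled_patches(CONFIG)
print(sorted(enabled))  # ['eagle_structured_output_fix']
```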

mantle_extensions/mantle_extensions/patches/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -6,7 +6,6 @@
 
 __all__ = [
     "serving_patches",
-    "streaming_patches",
     "eagle_patches",
     "config_patches",
     "quantization_patches",
mantle_extensions/mantle_extensions/patches/eagle_structured_output_fix.py

Lines changed: 198 additions & 0 deletions
@@ -0,0 +1,198 @@
"""
Eagle + Structured Output FSM Validation Fix

ISSUE: When using Eagle speculative decoding with structured output (tool calling),
       vLLM crashes with AssertionError when the FSM rejects a spec token that
       is present in the scheduled_spec_decode_tokens list.

ERROR: "Failed to advance FSM for request ... for tokens XXX. Please file an issue."
       Followed by: AssertionError at vllm/v1/structured_output/__init__.py line 263

OBSERVED BEHAVIOR:
- Eagle generates speculative tokens for next iterations
- Scheduler validates these tokens via grammar.validate_tokens() and stores valid prefix
- During model execution, grammar_bitmask() validates the same tokens again
- Sometimes accept_tokens() returns False even for tokens in the scheduled list
- The assertion crashes the entire engine instead of handling this gracefully

ROOT CAUSE UNKNOWN - Possible explanations:
1. Race condition or state mismatch between validation and bitmask generation
2. Bug in xgrammar rollback functionality
3. Interaction between Eagle + structured output + penalties causing state corruption
4. Concurrency issue with shared grammar state

SOLUTION:
Replace the assertion with a defensive conditional check:
- If token is valid according to FSM, accept it and advance state
- If token is invalid, log debug message but continue loop
- Still fill bitmasks for all tokens to maintain correct array size
- Makes the code resilient to FSM state mismatches

This is a defensive fix that prevents crashes without fully understanding the root cause.

UPSTREAM STATUS: Not fixed in vLLM upstream (bug exists since PR #18879, May 2025)
UPSTREAMABLE: Yes - this defensive approach should be contributed upstream

CLEANUP: When upstream fixes this bug:
1. Delete this file
2. Remove eagle_structured_output_fix from patch_config.json
3. Remove registration from plugin.py
4. Reinstall plugin
"""

from vllm.logger import init_logger

logger = init_logger(__name__)


def create_patched_grammar_bitmask():
    """
    Factory function that creates the patched grammar_bitmask method.

    This replaces the assertion with a conditional check to handle
    the case where the scheduler drops invalid spec tokens.
    """
    # Import here to avoid issues if vLLM isn't installed
    from typing import TYPE_CHECKING

    def grammar_bitmask(
        self,
        requests: dict,
        structured_output_request_ids: list[str],
        scheduled_spec_decode_tokens: dict[str, list[int]],
    ):
        """
        Patched version that handles FSM rejection of scheduled spec tokens.

        Changes from upstream:
        - Replace assertion with defensive conditional check
        - If spec token is rejected by FSM, log and continue (don't crash)
        - Continue filling bitmasks for all tokens to maintain array size
        - Makes code resilient to FSM state mismatches
        """
        max_num_spec_tokens = 0
        if self.vllm_config.speculative_config is not None:
            max_num_spec_tokens = (
                self.vllm_config.speculative_config.num_speculative_tokens
            )

        if self._grammar_bitmask is None:
            assert self.backend is not None
            max_batch_size = self.vllm_config.scheduler_config.max_num_seqs

            # Allocate a bitmask for each token needing to be checked:
            # one for each speculative position, and one more for the
            # bonus token / non-speculative token.
            self._grammar_bitmask = self.backend.allocate_token_bitmask(
                max_batch_size * (1 + max_num_spec_tokens)
            )

        # Generate a batched bitmask for all structured output requests.
        # When speculative decoding is enabled, we need to include multiple
        # masks for each request, one for each possible bonus token position.
        # These are stored inline in the tensor and unpacked by the gpu runner.
        cumulative_index = 0

        # Optimized parallel filling of bitmasks for
        # non-spec, large-batch-size cases
        if (
            len(structured_output_request_ids) > self.fill_bitmask_parallel_threshold
            and max_num_spec_tokens == 0
        ):
            promises = []
            batch = []
            for req_id in structured_output_request_ids:
                request = requests[req_id]
                structured_output_request = request.structured_output_request
                if TYPE_CHECKING:
                    assert structured_output_request is not None
                    assert structured_output_request.grammar is not None

                apply_bitmask = self.should_fill_bitmask(request)
                batch.append(
                    (structured_output_request.grammar, cumulative_index, apply_bitmask)
                )
                if len(batch) == self.fill_bitmask_parallel_batch_size:
                    promises.append(self._async_submit_fill_bitmask(batch))
                    batch = []

                cumulative_index += 1
            if batch:
                promises.append(self._async_submit_fill_bitmask(batch))

            # Wait for all bitmask filling tasks to complete.
            for promise in promises:
                promise.result()
        else:
            # Fallback to serial filling of bitmasks for small-batch-size cases
            for req_id in structured_output_request_ids:
                request = requests[req_id]
                structured_output_request = request.structured_output_request

                if TYPE_CHECKING:
                    assert structured_output_request is not None
                    assert structured_output_request.grammar is not None
                apply_bitmask = self.should_fill_bitmask(request)

                state_advancements = 0
                req_tokens = scheduled_spec_decode_tokens.get(req_id, [])
                for i, token in enumerate(req_tokens + [None]):
                    self._fill_bitmasks(
                        [
                            (
                                structured_output_request.grammar,
                                cumulative_index,
                                apply_bitmask,
                            )
                        ]
                    )

                    # ============================================================
                    # MANTLE FIX: Replace assertion with conditional check
                    # ============================================================
                    if (
                        apply_bitmask
                        and token is not None
                        and not structured_output_request.grammar.is_terminated()
                    ):
                        # ORIGINAL (causes crash):
                        # assert structured_output_request.grammar.accept_tokens(
                        #     req_id, [token]
                        # )
                        # state_advancements += 1

                        # FIXED (defensive approach - no crash):
                        # Only advance state if token is accepted by grammar.
                        # If rejected, continue loop to fill bitmasks for all tokens
                        # (downstream code in apply_grammar_bitmask expects exact
                        # array size).
                        if structured_output_request.grammar.accept_tokens(
                            req_id, [token]
                        ):
                            state_advancements += 1
                        else:
                            # Token rejected by FSM even though it's in scheduled list.
                            # Root cause unknown (FSM state mismatch, xgrammar bug, etc.)
                            # but we handle it gracefully instead of crashing.
                            logger.debug(
                                "Grammar rejected spec token %s for request %s. "
                                "This indicates an FSM state mismatch in Eagle + "
                                "structured output. Continuing without advancing "
                                "grammar state.",
                                token,
                                req_id,
                            )
                            # Continue to next token (don't break) to ensure
                            # bitmask array has correct size for apply_grammar_bitmask
                    # ============================================================
                    # END MANTLE FIX
                    # ============================================================

                cumulative_index += 1
                if state_advancements > 0:
                    structured_output_request.grammar.rollback(state_advancements)

        bitmask_tensor = self._grammar_bitmask
        if cumulative_index < bitmask_tensor.shape[0]:
            bitmask_tensor = bitmask_tensor[:cumulative_index]

        # After finishing with the xgrammar operations, we convert to
        # np.ndarray, because that is much more efficient for serialization
        # and deserialization when sending this to the GPU workers.
        return bitmask_tensor.numpy()

    return grammar_bitmask
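The commit message states that the plugin monkey-patches StructuredOutputManager.grammar_bitmask() with the factory's return value. The mechanism can be shown without importing vLLM; ToyManager and create_patched_method below are stand-ins for the real class and factory.

```python
# Sketch of the monkey-patching mechanism the plugin relies on. ToyManager
# stands in for StructuredOutputManager; in the real patch, the result of
# create_patched_grammar_bitmask() is assigned onto the vLLM class the
# same way.

class ToyManager:
    def grammar_bitmask(self):
        return "original"


def create_patched_method():
    # Factory returns a plain function; assigning it to the class makes it
    # a bound method for all existing and future instances.
    def grammar_bitmask(self):
        return "patched"
    return grammar_bitmask


# Replace the method on the class, exactly as a plugin patch would:
ToyManager.grammar_bitmask = create_patched_method()

manager = ToyManager()
print(manager.grammar_bitmask())  # patched
```

Because the assignment happens on the class rather than an instance, the patch takes effect for every StructuredOutputManager created after the plugin loads, which is why registration order in the plugin system matters.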
