
Commit e206b54

[v0][Core] Use xgrammar shared context to avoid copy overhead for offline engine (#13837)
Signed-off-by: Seth Kimmel <seth.kimmel3@gmail.com>
1 parent 1d35662 commit e206b54

File tree

1 file changed: +23 -3 lines


vllm/model_executor/guided_decoding/xgrammar_decoding.py

Lines changed: 23 additions & 3 deletions
@@ -3,7 +3,6 @@
 # noqa: UP007
 from __future__ import annotations
 
-import copy
 import json
 import re
 from dataclasses import dataclass, field
@@ -348,5 +347,26 @@ def __call__(self, input_ids: list[int],
         return scores
 
     def clone(self) -> XGrammarLogitsProcessor:
-        """Deepcopy due to per-sequence state in the matchers"""
-        return copy.deepcopy(self)
+        """Create a new instance with shared compiled grammar
+        but separate state"""
+        new_processor = XGrammarLogitsProcessor(self.config)
+
+        # Share the compiled grammar context (immutable after compilation)
+        new_processor.ctx = self.ctx
+
+        # Create fresh matchers for the new sequence
+        if self.ctx is not None:
+            new_processor.matchers = [
+                xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
+            ]
+
+        # Create a new token bitmask with the same size
+        if hasattr(self, 'token_bitmask') and self.token_bitmask is not None:
+            new_processor.token_bitmask = self.token_bitmask
+
+        # Copy simple attributes
+        new_processor.batch_size = self.batch_size
+        # Reset prefilled state for new sequence
+        new_processor.prefilled = False
+
+        return new_processor
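
The change rests on the observation that the compiled grammar context is immutable after compilation, so clones can safely share it and only the per-sequence matcher state needs to be rebuilt; the previous `copy.deepcopy(self)` duplicated the compiled grammar for every sequence, which is the copy overhead the commit title refers to. A minimal, self-contained sketch of that share-vs-copy pattern, using hypothetical stand-in classes rather than xgrammar's real API:

```python
from dataclasses import dataclass, field

# Hypothetical stand-ins for xgrammar's compiled-grammar context and
# GrammarMatcher, used only to illustrate the pattern in clone() above.

@dataclass(frozen=True)
class CompiledGrammar:
    """Immutable once compiled, so it is safe to share across clones."""
    rules: tuple

@dataclass
class Matcher:
    """Per-sequence, mutable matching state; recreated for every clone."""
    ctx: CompiledGrammar
    consumed_tokens: int = 0

@dataclass
class Processor:
    ctx: CompiledGrammar
    batch_size: int = 1
    matchers: list = field(default_factory=list)
    prefilled: bool = False

    def clone(self) -> "Processor":
        new = Processor(self.ctx, self.batch_size)   # share the compiled context
        new.matchers = [Matcher(self.ctx)            # fresh per-sequence matchers
                        for _ in range(self.batch_size)]
        new.prefilled = False                        # fresh prefill state
        return new

base = Processor(CompiledGrammar(rules=("root ::= ...",)), batch_size=2)
c = base.clone()
assert c.ctx is base.ctx                                 # no deep copy of the grammar
assert c.matchers and c.matchers is not base.matchers    # independent matcher state
```

The real `clone()` above follows the same shape: `self.ctx` is assigned by reference, fresh `xgr.GrammarMatcher` objects are built from it, and `prefilled` is reset for the new sequence.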
