
Commit 81aadb6

chore: migrate tokenizer init to manager only
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
1 parent: dc1b4a6

File tree

4 files changed (+54, −44 lines)

vllm/v1/structured_output/__init__.py

Lines changed: 20 additions & 5 deletions

```diff
@@ -7,9 +7,11 @@
 
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.v1.structured_output.backend_guidance import GuidanceBackend
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                      StructuredOutputGrammar)
+from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend
 
 if TYPE_CHECKING:
     import numpy as np
@@ -46,13 +48,26 @@ def grammar_init(self, request: Request) -> None:
         # backends on a per-request basis in V1 (for now, anyway...).
         if self.backend is None:
             backend_name = request.sampling_params.guided_decoding.backend_name
+            tokenizer_group = init_tokenizer_from_configs(
+                model_config=self.vllm_config.model_config,
+                scheduler_config=self.vllm_config.scheduler_config,
+                parallel_config=self.vllm_config.parallel_config,
+                lora_config=self.vllm_config.lora_config)
+            tokenizer_group.ping()
+            tokenizer = tokenizer_group.get_lora_tokenizer(None)
+            vocab_size = self.vllm_config.model_config.get_vocab_size()
             if backend_name == "xgrammar":
-                from vllm.v1.structured_output.backend_xgrammar import (
-                    XgrammarBackend)
-
-                self.backend = XgrammarBackend(self.vllm_config)
+                self.backend = XgrammarBackend(
+                    self.vllm_config,
+                    tokenizer=tokenizer,
+                    vocab_size=vocab_size,
+                )
             elif backend_name == "guidance":
-                self.backend = GuidanceBackend(self.vllm_config)
+                self.backend = GuidanceBackend(
+                    self.vllm_config,
+                    tokenizer=tokenizer,
+                    vocab_size=vocab_size,
+                )
             else:
                 raise ValueError(
                     f"Unsupported structured output backend: {backend_name}")
```

vllm/v1/structured_output/backend_guidance.py

Lines changed: 6 additions & 15 deletions

```diff
@@ -1,15 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
 import torch
 
-from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.utils import LazyLoader
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                      StructuredOutputGrammar,
@@ -29,25 +29,16 @@
 logger = init_logger(__name__)
 
 
+@dataclass
 class GuidanceBackend(StructuredOutputBackend):
 
-    def __init__(self, vllm_config: VllmConfig):
-        self.vllm_config = vllm_config
-        tokenizer_group = init_tokenizer_from_configs(
-            model_config=vllm_config.model_config,
-            scheduler_config=vllm_config.scheduler_config,
-            parallel_config=vllm_config.parallel_config,
-            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
-        tokenizer_group.ping()
-        self.vllm_config = vllm_config
-        self.vocab_size = vllm_config.model_config.get_vocab_size()
+    def __post_init__(self):
         self.disable_any_whitespace = (
             "disable-any-whitespace"
-            in vllm_config.decoding_config.guided_decoding_backend)
+            in self.vllm_config.decoding_config.guided_decoding_backend)
 
-        tokenizer = tokenizer_group.get_lora_tokenizer(None)
         self.ll_tokenizer = llguidance_hf.from_tokenizer(
-            tokenizer, self.vocab_size)
+            self.tokenizer, self.vocab_size)
 
     def compile_grammar(self, request_type: StructuredOutputOptions,
                         grammar_spec: str) -> StructuredOutputGrammar:
```
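With the class converted to a dataclass, the generated __init__ assigns the shared fields (vllm_config, tokenizer, vocab_size) and __post_init__ only derives backend-local state from them. A small runnable sketch of that pattern (the Options class and its backend-spec string are toys; only the substring check mirrors the diff):

```python
from dataclasses import dataclass


@dataclass
class Options:
    guided_decoding_backend: str

    def __post_init__(self):
        # Runs after the generated __init__ has assigned the fields,
        # so derived attributes can be computed from them here.
        self.disable_any_whitespace = (
            "disable-any-whitespace" in self.guided_decoding_backend)


opts = Options("guidance:disable-any-whitespace")
assert opts.disable_any_whitespace is True
```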

vllm/v1/structured_output/backend_types.py

Lines changed: 13 additions & 0 deletions

```diff
@@ -1,10 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import enum
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 
 import torch
 
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+
 
 class StructuredOutputOptions(enum.Enum):
     JSON = enum.auto()
@@ -60,9 +68,14 @@ def reset(self):
         """
 
 
+@dataclass
 class StructuredOutputBackend(ABC):
     """Engine-level backend for structured output requests."""
 
+    vllm_config: VllmConfig
+    tokenizer: AnyTokenizer
+    vocab_size: int
+
     @abstractmethod
     def compile_grammar(self, request_type: StructuredOutputOptions,
                         grammar_spec: str) -> StructuredOutputGrammar:
```
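The base class is now both an ABC and a dataclass, so every backend inherits the three shared fields and only supplies the abstract behavior. A toy reconstruction under simplified types (object stands in for VllmConfig and AnyTokenizer; EchoBackend is hypothetical):

```python
from abc import ABC, abstractmethod
from dataclasses import dataclass


@dataclass
class Backend(ABC):
    # Shared fields: every subclass gets these via the generated __init__.
    vllm_config: object
    tokenizer: object
    vocab_size: int

    @abstractmethod
    def compile_grammar(self, grammar_spec: str): ...


@dataclass
class EchoBackend(Backend):
    def compile_grammar(self, grammar_spec: str):
        return f"compiled[{grammar_spec}] over {self.vocab_size} tokens"


backend = EchoBackend(vllm_config=None, tokenizer=None, vocab_size=8)
print(backend.compile_grammar("json"))
```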

vllm/v1/structured_output/backend_xgrammar.py

Lines changed: 15 additions & 24 deletions

```diff
@@ -1,14 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
 
 import torch
 
 import vllm.envs
-from vllm.config import VllmConfig
 from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
 from vllm.utils import LazyLoader
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
@@ -23,58 +23,49 @@
 logger = init_logger(__name__)
 
 
+@dataclass
 class XgrammarBackend(StructuredOutputBackend):
 
-    def __init__(self, vllm_config: VllmConfig):
-        self.vllm_config = vllm_config
+    def __post_init__(self):
         self.disable_any_whitespace = (
             "disable-any-whitespace"
-            in vllm_config.decoding_config.guided_decoding_backend)
-        tokenizer_group = init_tokenizer_from_configs(
-            model_config=vllm_config.model_config,
-            scheduler_config=vllm_config.scheduler_config,
-            parallel_config=vllm_config.parallel_config,
-            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
-        tokenizer_group.ping()
-
-        tokenizer = tokenizer_group.get_lora_tokenizer(None)
-        self.vocab_size = vllm_config.model_config.get_vocab_size()
-        if isinstance(tokenizer, MistralTokenizer):
+            in self.vllm_config.decoding_config.guided_decoding_backend)
+        if isinstance(self.tokenizer, MistralTokenizer):
             # NOTE: ideally, xgrammar should handle this accordingly.
             # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98
             try:
-                if tokenizer.is_tekken:
-                    encoded_vocab = tokenizer._vocab
+                if self.tokenizer.is_tekken:
+                    encoded_vocab = self.tokenizer._vocab
                 else:
                     encoded_vocab = [
                         token for token, _ in sorted(
-                            tokenizer.get_vocab().items(),
+                            self.tokenizer.get_vocab().items(),
                             key=lambda x: x[1],
                         )
                     ]
                 stop_token_ids = None
                 if hasattr(
-                        tokenizer,
+                        self.tokenizer,
                         "eos_token_id",
-                ) and tokenizer.eos_token_id is not None:
-                    stop_token_ids = [tokenizer.eos_token_id]
+                ) and self.tokenizer.eos_token_id is not None:
+                    stop_token_ids = [self.tokenizer.eos_token_id]
             except AttributeError as e:
                 raise ValueError(
                     f"Cannot get the vocabulary of the tokenizer "
-                    f"{type(tokenizer)}. The tokenizer should have a "
+                    f"{type(self.tokenizer)}. The tokenizer should have a "
                     "get_vocab method.") from e
             tokenizer_info = xgr.TokenizerInfo(  # type: ignore
                 encoded_vocab=encoded_vocab,
                 # NOTE: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43  # noqa: E501
                 vocab_type=xgr.VocabType.RAW
-                if tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK,
+                if self.tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK,
                 vocab_size=self.vocab_size,
                 stop_token_ids=stop_token_ids,
                 add_prefix_space=True,
             )
         else:
             tokenizer_info = xgr.TokenizerInfo.from_huggingface(
-                tokenizer,
+                self.tokenizer,
                 vocab_size=self.vocab_size,
             )
         self.compiler = xgr.GrammarCompiler(
```
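For non-tekken Mistral tokenizers, the code above rebuilds the vocabulary as a list ordered by token id, which is the shape xgrammar's TokenizerInfo expects. A tiny self-contained illustration of that ordering (the three-token vocab is made up):

```python
# get_vocab() maps token -> id; xgrammar wants a list indexed by id.
vocab = {"<s>": 0, "world": 2, "hello": 1}
encoded_vocab = [
    token for token, _ in sorted(vocab.items(), key=lambda x: x[1])
]
assert encoded_vocab == ["<s>", "hello", "world"]
```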
