@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from dataclasses import field
+from dataclasses import InitVar, field
 from typing import Any, Literal, Union
 
 from pydantic import SkipValidation, model_validator
@@ -11,9 +11,11 @@
 
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
-                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS)
+from vllm.utils import (
+    DEFAULT_MAX_NUM_BATCHED_TOKENS,
+    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
+    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
+)
 
 logger = init_logger(__name__)
 
@@ -84,8 +86,12 @@ class SchedulerConfig:
     is_multimodal_model: bool = False
     """True if the model is multimodal."""
 
-    is_encoder_decoder: bool = False
-    """True if the model is an encoder-decoder model."""
+    is_encoder_decoder: InitVar[bool] = False
+    """True if the model is an encoder-decoder model.
+
+    Note: This is stored in the ModelConfig, and is used only here to
+    disable chunked prefill and prefix caching for encoder-decoder models.
+    """
 
     # TODO (ywang96): Make this configurable.
     max_num_encoder_input_tokens: int = field(init=False)
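The key change in this hunk is the switch from a stored field to a `dataclasses.InitVar`. An `InitVar` is accepted by the generated `__init__` and forwarded to `__post_init__`, but it is never kept as a field on the instance. A minimal stdlib-only sketch of the pattern (toy class, not vLLM's actual config machinery):

```python
from dataclasses import InitVar, dataclass, fields


@dataclass
class ToyConfig:
    # Pseudo-field: accepted by __init__ and passed to __post_init__,
    # but excluded from the dataclass fields.
    is_encoder_decoder: InitVar[bool] = False
    chunked_prefill_enabled: bool = True

    def __post_init__(self, is_encoder_decoder: bool) -> None:
        if is_encoder_decoder:
            self.chunked_prefill_enabled = False


cfg = ToyConfig(is_encoder_decoder=True)
print(cfg.chunked_prefill_enabled)    # False
print([f.name for f in fields(cfg)])  # ['chunked_prefill_enabled']
```

This is also why `__post_init__` gains an `is_encoder_decoder` parameter in the hunks below: the dataclass machinery hands the init-only value to it instead of storing it.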
@@ -160,26 +166,26 @@ def compute_hash(self) -> str:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self) -> None:
+    def __post_init__(self, is_encoder_decoder: bool) -> None:
         if self.max_model_len is None:
             self.max_model_len = 8192
 
         if self.max_num_seqs is None:
             self.max_num_seqs = 128
 
-        if self.is_encoder_decoder:
+        if is_encoder_decoder:
             # Chunked prefill should be disabled for encoder-decoder models.
             self.disable_chunked_mm_input = True
             self.chunked_prefill_enabled = False
             self.enable_chunked_prefill = False
             self.long_prefill_token_threshold = 0
             logger.info(
                 "Encoder-decoder models do not support chunked prefill nor"
-                " prefix caching; disabling both.")
+                " prefix caching; disabling both."
+            )
 
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
@@ -189,7 +195,8 @@ def __post_init__(self) -> None:
                 # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
                 # for higher throughput.
                 self.max_num_batched_tokens = max(
-                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS
+                )
 
             if self.runner_type == "pooling":
                 # Choose specific value for higher throughput
@@ -208,29 +215,31 @@ def __post_init__(self) -> None:
             # Ensure max_num_batched_tokens does not exceed model limit.
             # Some models (e.g., Whisper) have embeddings tied to max length.
             self.max_num_batched_tokens = min(
-                self.max_num_seqs * self.max_model_len,
-                self.max_num_batched_tokens)
+                self.max_num_seqs * self.max_model_len, self.max_num_batched_tokens
+            )
 
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
 
         if self.enable_chunked_prefill:
             logger.info(
                 "Chunked prefill is enabled with max_num_batched_tokens=%d.",
-                self.max_num_batched_tokens)
+                self.max_num_batched_tokens,
+            )
 
         self.chunked_prefill_enabled = self.enable_chunked_prefill
         if self.max_num_partial_prefills > 1:
             if self.long_prefill_token_threshold == 0:
-                self.long_prefill_token_threshold = int(self.max_model_len *
-                                                        0.04)
+                self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
 
             logger.info(
                 "Concurrent partial prefills enabled with "
                 "max_num_partial_prefills=%d, max_long_partial_prefills=%d, "
                 "long_prefill_token_threshold=%d",
-                self.max_num_partial_prefills, self.max_long_partial_prefills,
-                self.long_prefill_token_threshold)
+                self.max_num_partial_prefills,
+                self.max_long_partial_prefills,
+                self.long_prefill_token_threshold,
+            )
 
         # NOTE: Default set cuda_graph_sizes to [min(max_num_seqs * 2, 512)].
         # This avoids OOM in tight memory scenarios with small max_num_seqs,
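The clamp and the "long prefill" threshold above are easier to read with concrete numbers plugged in. An illustrative standalone rendering of the same arithmetic (values mirror the fallbacks earlier in `__post_init__`, but are not authoritative defaults):

```python
# Illustrative values only.
max_num_seqs = 128
max_model_len = 8192
max_num_batched_tokens = 8192

# Clamp: a batch can never need more tokens than every sequence slot
# filled to the full context length.
max_num_batched_tokens = min(max_num_seqs * max_model_len, max_num_batched_tokens)

# A prefill counts as "long" once it exceeds 4% of the context window.
long_prefill_token_threshold = int(max_model_len * 0.04)

print(max_num_batched_tokens, long_prefill_token_threshold)  # 8192 327
```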
@@ -240,61 +249,71 @@ def __post_init__(self) -> None:
             self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
 
         if self.async_scheduling:
-            self.scheduler_cls = (
-                "vllm.v1.core.sched.async_scheduler.AsyncScheduler")
+            self.scheduler_cls = "vllm.v1.core.sched.async_scheduler.AsyncScheduler"
 
-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def _verify_args(self) -> Self:
-        if (self.max_num_batched_tokens < self.max_model_len
-                and not self.chunked_prefill_enabled):
+        if (
+            self.max_num_batched_tokens < self.max_model_len
+            and not self.chunked_prefill_enabled
+        ):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                 f"smaller than max_model_len ({self.max_model_len}). "
                 "This effectively limits the maximum sequence length to "
                 "max_num_batched_tokens and makes vLLM reject longer "
                 "sequences. Please increase max_num_batched_tokens or "
-                "decrease max_model_len.")
+                "decrease max_model_len."
+            )
 
         if self.max_num_batched_tokens < self.max_num_seqs:
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                 "be greater than or equal to max_num_seqs "
-                f"({self.max_num_seqs}).")
+                f"({self.max_num_seqs})."
+            )
 
         if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
             logger.warning(
                 "max_num_batched_tokens (%d) exceeds max_num_seqs "
                 "* max_model_len (%d). This may lead to unexpected behavior.",
                 self.max_num_batched_tokens,
-                self.max_num_seqs * self.max_model_len)
+                self.max_num_seqs * self.max_model_len,
+            )
 
         if self.num_lookahead_slots < 0:
             raise ValueError(
                 "num_lookahead_slots "
                 f"({self.num_lookahead_slots}) must be greater than or "
-                "equal to 0.")
+                "equal to 0."
+            )
 
         if self.max_num_partial_prefills < 1:
             raise ValueError(
                 f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
-                "must be greater than or equal to 1.")
+                "must be greater than or equal to 1."
+            )
         elif self.max_num_partial_prefills > 1:
             if not self.chunked_prefill_enabled:
-                raise ValueError("Chunked prefill must be enabled to set "
-                                 "max_num_partial_prefills > 1.")
+                raise ValueError(
+                    "Chunked prefill must be enabled to set "
+                    "max_num_partial_prefills > 1."
+                )
 
             if self.long_prefill_token_threshold > self.max_model_len:
                 raise ValueError(
                     "long_prefill_token_threshold "
                     f"({self.long_prefill_token_threshold}) cannot be greater "
-                    f"than the max_model_len ({self.max_model_len}).")
+                    f"than the max_model_len ({self.max_model_len})."
+                )
 
-        if (self.max_long_partial_prefills
-                < 1) or (self.max_long_partial_prefills
-                         > self.max_num_partial_prefills):
+        if (self.max_long_partial_prefills < 1) or (
+            self.max_long_partial_prefills > self.max_num_partial_prefills
+        ):
             raise ValueError(
                 f"max_long_partial_prefills ({self.max_long_partial_prefills}) "
                 "must be greater than or equal to 1 and less than or equal to "
-                f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
+                f"max_num_partial_prefills ({self.max_num_partial_prefills})."
+            )
 
         return self
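End to end, the observable effect of the commit is that `is_encoder_decoder` becomes a constructor-only argument of `SchedulerConfig`. A hedged usage sketch (it assumes `SchedulerConfig` is constructible with defaults in your vLLM version; required arguments may differ):

```python
from dataclasses import fields

from vllm.config import SchedulerConfig

# Passing the flag still works, and __post_init__ reacts to it...
cfg = SchedulerConfig(is_encoder_decoder=True)
assert cfg.enable_chunked_prefill is False
assert cfg.long_prefill_token_threshold == 0

# ...but the flag itself is no longer stored: it is an InitVar, so it
# does not appear among the dataclass fields (it lives on ModelConfig).
assert "is_encoder_decoder" not in {f.name for f in fields(SchedulerConfig)}
```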