 
 from vllm.config.utils import config
 from vllm.logger import init_logger
-from vllm.utils import (
-    DEFAULT_MAX_NUM_BATCHED_TOKENS,
-    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
-    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
-)
+from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
+                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
+                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS)
 
 logger = init_logger(__name__)
 
@@ -166,7 +164,8 @@ def compute_hash(self) -> str:
         # no factors to consider.
         # this config will not affect the computation graph.
         factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
         return hash_str
 
     def __post_init__(self, is_encoder_decoder: bool) -> None:
@@ -184,8 +183,7 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
             self.long_prefill_token_threshold = 0
             logger.info(
                 "Encoder-decoder models do not support chunked prefill nor"
-                " prefix caching; disabling both."
-            )
+                " prefix caching; disabling both.")
 
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
@@ -195,8 +193,7 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                 # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
                 # for higher throughput.
                 self.max_num_batched_tokens = max(
-                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS
-                )
+                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
 
             if self.runner_type == "pooling":
                 # Choose specific value for higher throughput
@@ -215,8 +212,8 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
             # Ensure max_num_batched_tokens does not exceed model limit.
             # Some models (e.g., Whisper) have embeddings tied to max length.
             self.max_num_batched_tokens = min(
-                self.max_num_seqs * self.max_model_len, self.max_num_batched_tokens
-            )
+                self.max_num_seqs * self.max_model_len,
+                self.max_num_batched_tokens)
 
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
@@ -230,7 +227,8 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
         self.chunked_prefill_enabled = self.enable_chunked_prefill
         if self.max_num_partial_prefills > 1:
             if self.long_prefill_token_threshold == 0:
-                self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
+                self.long_prefill_token_threshold = int(self.max_model_len *
+                                                        0.04)
 
             logger.info(
                 "Concurrent partial prefills enabled with "
@@ -249,29 +247,26 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
             self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
 
         if self.async_scheduling:
-            self.scheduler_cls = "vllm.v1.core.sched.async_scheduler.AsyncScheduler"
+            self.scheduler_cls = (
+                "vllm.v1.core.sched.async_scheduler.AsyncScheduler")
 
     @model_validator(mode="after")
     def _verify_args(self) -> Self:
-        if (
-            self.max_num_batched_tokens < self.max_model_len
-            and not self.chunked_prefill_enabled
-        ):
+        if (self.max_num_batched_tokens < self.max_model_len
+                and not self.chunked_prefill_enabled):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                 f"smaller than max_model_len ({self.max_model_len}). "
                 "This effectively limits the maximum sequence length to "
                 "max_num_batched_tokens and makes vLLM reject longer "
                 "sequences. Please increase max_num_batched_tokens or "
-                "decrease max_model_len."
-            )
+                "decrease max_model_len.")
 
         if self.max_num_batched_tokens < self.max_num_seqs:
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                 "be greater than or equal to max_num_seqs "
-                f"({self.max_num_seqs})."
-            )
+                f"({self.max_num_seqs}).")
 
         if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
             logger.warning(
@@ -285,35 +280,29 @@ def _verify_args(self) -> Self:
             raise ValueError(
                 "num_lookahead_slots "
                 f"({self.num_lookahead_slots}) must be greater than or "
-                "equal to 0."
-            )
+                "equal to 0.")
 
         if self.max_num_partial_prefills < 1:
             raise ValueError(
                 f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
-                "must be greater than or equal to 1."
-            )
+                "must be greater than or equal to 1.")
         elif self.max_num_partial_prefills > 1:
             if not self.chunked_prefill_enabled:
-                raise ValueError(
-                    "Chunked prefill must be enabled to set "
-                    "max_num_partial_prefills > 1."
-                )
+                raise ValueError("Chunked prefill must be enabled to set "
+                                 "max_num_partial_prefills > 1.")
 
             if self.long_prefill_token_threshold > self.max_model_len:
                 raise ValueError(
                     "long_prefill_token_threshold "
                     f"({self.long_prefill_token_threshold}) cannot be greater "
-                    f"than the max_model_len ({self.max_model_len})."
-                )
+                    f"than the max_model_len ({self.max_model_len}).")
 
-        if (self.max_long_partial_prefills < 1) or (
-            self.max_long_partial_prefills > self.max_num_partial_prefills
-        ):
+        if (self.max_long_partial_prefills
+                < 1) or (self.max_long_partial_prefills
+                         > self.max_num_partial_prefills):
             raise ValueError(
                 f"max_long_partial_prefills ({self.max_long_partial_prefills}) "
                 "must be greater than or equal to 1 and less than or equal to "
-                f"max_num_partial_prefills ({self.max_num_partial_prefills})."
-            )
+                f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
 
         return self
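
For orientation, the hunks above only rewrap code, and the logic they touch reduces to a few rules: max_num_batched_tokens defaults to a constant when chunked prefill is enabled, otherwise to at least max_model_len; it is then capped at max_num_seqs * max_model_len; and the validator rejects values below max_num_seqs, or below max_model_len when chunked prefill is off. The sketch below restates those rules standalone; the helper names resolve_max_num_batched_tokens / validate_batch_limits and the constant value 2048 are illustrative assumptions, not taken from this diff, and the pooling/multimodal bumps are omitted.

from typing import Optional

# Illustrative placeholder; the real constant lives in vllm.utils.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048


def resolve_max_num_batched_tokens(max_model_len: int,
                                   max_num_seqs: int,
                                   enable_chunked_prefill: bool,
                                   max_num_batched_tokens: Optional[int] = None
                                   ) -> int:
    """Simplified restatement of the defaulting in __post_init__."""
    if max_num_batched_tokens is None:
        if enable_chunked_prefill:
            max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
        else:
            # Without chunked prefill, a full prompt must fit in one batch.
            max_num_batched_tokens = max(max_model_len,
                                         DEFAULT_MAX_NUM_BATCHED_TOKENS)
        # Never exceed what max_num_seqs full-length sequences could use.
        max_num_batched_tokens = min(max_num_seqs * max_model_len,
                                     max_num_batched_tokens)
    return max_num_batched_tokens


def validate_batch_limits(max_num_batched_tokens: int, max_model_len: int,
                          max_num_seqs: int,
                          chunked_prefill_enabled: bool) -> None:
    """Simplified restatement of the checks in _verify_args."""
    if (max_num_batched_tokens < max_model_len
            and not chunked_prefill_enabled):
        raise ValueError("max_num_batched_tokens would cap sequence length")
    if max_num_batched_tokens < max_num_seqs:
        raise ValueError("max_num_batched_tokens must be >= max_num_seqs")


# Example: chunked prefill off, so the budget grows to cover max_model_len.
assert resolve_max_num_batched_tokens(8192, 4, False) == 8192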