diff --git a/vllm/config.py b/vllm/config.py
index 40beace3040c..4020f8b3ebdb 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2050,6 +2050,13 @@ def __post_init__(self) -> None:
                 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
             )
 
+        # When using default settings,
+        # ensure max_num_batched_tokens does not exceed the model limit.
+        # Some models (e.g., Whisper) have embeddings tied to max length.
+        self.max_num_batched_tokens = min(
+            self.max_num_seqs * self.max_model_len,
+            self.max_num_batched_tokens)
+
         self.max_num_encoder_input_tokens = self.max_num_batched_tokens
         self.encoder_cache_size = self.max_num_batched_tokens
 
@@ -2090,6 +2097,13 @@ def _verify_args(self) -> None:
                 "be greater than or equal to max_num_seqs "
                 f"({self.max_num_seqs}).")
 
+        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
+            logger.warning(
+                "max_num_batched_tokens (%d) exceeds max_num_seqs "
+                "* max_model_len (%d). This may lead to unexpected behavior.",
+                self.max_num_batched_tokens,
+                self.max_num_seqs * self.max_model_len)
+
         if self.num_lookahead_slots < 0:
             raise ValueError(
                 "num_lookahead_slots "
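
For reference, the clamp introduced in the first hunk can be read in isolation as the sketch below. The helper name and the sample values (a Whisper-like max_model_len of 448, max_num_seqs of 2, and a 5120-token multimodal default budget) are assumptions chosen only to show when the min() actually takes effect; the patch itself performs this computation inline in __post_init__.

```python
# Standalone sketch (not part of the patch) of the clamping rule added to
# __post_init__. The concrete numbers are illustrative assumptions: a
# Whisper-like model with max_model_len=448 and a hypothetical multimodal
# default of 5120 batched tokens.
def clamp_max_num_batched_tokens(max_num_seqs: int,
                                 max_model_len: int,
                                 max_num_batched_tokens: int) -> int:
    # A single scheduling step can never need more than
    # max_num_seqs * max_model_len tokens, so cap the budget there; for
    # models whose embeddings are tied to the max length (e.g. Whisper),
    # a larger budget can also misbehave.
    return min(max_num_seqs * max_model_len, max_num_batched_tokens)

print(clamp_max_num_batched_tokens(2, 448, 5120))    # 896: clamped to 2 * 448
print(clamp_max_num_batched_tokens(256, 448, 5120))  # 5120: already below the limit
```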