From 3fe18934d7f33b98812d7402463fe171c24efc77 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 17 Jun 2025 16:54:31 +0000
Subject: [PATCH 1/9] [Multimodal] Optimize Qwen2/2.5-VL startup time

Signed-off-by: Woosuk Kwon
---
 vllm/model_executor/models/qwen2_vl.py |  3 +++
 vllm/multimodal/processing.py          | 12 ++++++++++++
 vllm/multimodal/profiling.py           |  5 ++++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 49b709069cd2..8e534f78da19 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -819,6 +819,9 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": 16384, "video": 98304}
+
     def _get_vision_info(
         self,
         *,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 5cfca57bffee..2521e088c7f3 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,6 +1100,18 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
+    def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens for each modality.
+        By default, returns `None`. When `None` is returned, vLLM will generate
+        dummy inputs (images/videos) at maximum possible sizes and process them
+        to determine the maximum token count per modality.
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to a very long startup time. For better performance,
+        each model can override this method to return pre-computed maximum token
+        counts, avoiding the need for dummy input generation and processing.
+        """
+        return None
+
 
 _I = TypeVar("_I", bound=BaseProcessingInfo)
 
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 1faecb7bd24a..6de768feccad 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -253,6 +253,9 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item()
+        if max_tokens_per_item is not None:
+            return max_tokens_per_item
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         return self._get_mm_num_tokens(mm_inputs)

From 883bf7e5d3e3c7e9e8c3e1560045ea75563c19a4 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 18 Jun 2025 22:41:14 -0700
Subject: [PATCH 2/9] clarify and remove hardcoded values

Signed-off-by: Roger Wang
Signed-off-by: Roger Wang
---
 vllm/model_executor/models/qwen2_vl.py | 7 +++++--
 vllm/multimodal/processing.py          | 9 +++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8e534f78da19..ea9d56323d09 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -819,8 +819,11 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(self) -> Mapping[str, int]:
-        return {"image": 16384, "video": 98304}
+    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
 
     def _get_vision_info(
         self,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 2521e088c7f3..b91c9821ef88 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,8 +1100,8 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
-    def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
-        """Return the maximum number of tokens for each modality.
+    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
         By default, returns `None`. When `None` is returned, vLLM will generate
         dummy inputs (images/videos) at maximum possible sizes and process them
         to determine the maximum token count per modality.
@@ -1109,6 +1109,11 @@ def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
+
+        NOTE: The resulting maximum number of tokens per item of each modality
+        respects the model's maximum sequence length and the maximum number of
+        items of each modality allowed.
+
         """
         return None

From f6250e7f06723282f1b22f0f7a7dd3d9a666a4d5 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 18 Jun 2025 22:46:27 -0700
Subject: [PATCH 3/9] comment

Signed-off-by: Roger Wang
---
 vllm/multimodal/processing.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index b91c9821ef88..008cd1e53e83 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1110,9 +1110,10 @@ def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) ->
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The resulting maximum number of tokens per item of each modality
-        respects the model's maximum sequence length and the maximum number of
-        items of each modality allowed.
+        NOTE: The maximum number of tokens per item of each modality returned from
+        this function should respect the model's maximum sequence length and the
+        maximum number of items of each modality allowed, and agree with dummy
+        inputs (images/videos) at maximum possible sizes.

From 9ec904b055b343481350d5ee0749ed930e720db1 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 18 Jun 2025 23:20:56 -0700
Subject: [PATCH 4/9] add missing kwarg

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/qwen2_vl.py | 4 +++-
 vllm/multimodal/processing.py          | 4 +++-
 vllm/multimodal/profiling.py           | 3 ++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index b2d75648d52d..6f78de1e6bb0 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -820,7 +820,9 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+    def get_max_tokens_per_item(
+        self, seq_len: int,
+        mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
 
         max_image_tokens = self.get_max_image_tokens()
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 008cd1e53e83..e4a557c8e50a 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,7 +1100,9 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
-    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+    def get_max_tokens_per_item(
+        self, seq_len: int,
+        mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
         """Return the maximum number of tokens per item for each modality.
         By default, returns `None`. When `None` is returned, vLLM will generate
         dummy inputs (images/videos) at maximum possible sizes and process them
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 6de768feccad..4c8c11416020 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -253,7 +253,8 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        max_tokens_per_item = self.processing_info.get_max_tokens_per_item()
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
         if max_tokens_per_item is not None:
             return max_tokens_per_item

From 85de93e98b3ecd85fc4f45a391e7d2c72082a64a Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 19 Jun 2025 02:38:53 -0700
Subject: [PATCH 5/9] typing

Signed-off-by: Roger Wang
---
 vllm/multimodal/processing.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index e4a557c8e50a..38f3a7cb932f 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1102,7 +1102,8 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
     def get_max_tokens_per_item(
         self, seq_len: int,
-        mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+        mm_counts: Optional[Mapping[str,
+                                    int]]) -> Optional[Mapping[str, int]]:
         """Return the maximum number of tokens per item for each modality.
         By default, returns `None`. When `None` is returned, vLLM will generate
         dummy inputs (images/videos) at maximum possible sizes and process them
@@ -1112,10 +1113,10 @@ def get_max_tokens_per_item(
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The maximum number of tokens per item of each modality returned from
-        this function should respect the model's maximum sequence length and the
-        maximum number of items of each modality allowed, and agree with dummy
-        inputs (images/videos) at maximum possible sizes.
+        NOTE: The maximum number of tokens per item of each modality returned
+        from this function should respect the model's maximum sequence length
+        and the maximum number of items of each modality allowed, and agree
+        with dummy inputs (images/videos) at maximum possible sizes.

From fc97bb8dc31b1f7e93ce02c402118208b7583b84 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 11:06:23 -0700
Subject: [PATCH 6/9] update

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 4c8c11416020..12521a319cfe 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -256,6 +256,15 @@ def get_mm_max_tokens(
         max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
             seq_len=seq_len, mm_counts=mm_counts)
         if max_tokens_per_item is not None:
+            total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                  for k in max_tokens_per_item.keys()
+                                  & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                raise ValueError(
+                    "Pre-computed total number of multimodal tokens cannot be "
+                    "greater than the sequence length. This is likely due to "
+                    "incorrect implementation of `get_max_tokens_per_item` in "
+                    "the model definition.")
             return max_tokens_per_item
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

From a1fde53ea45b1096baf8f4d096db38b68ed82373 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 11:24:30 -0700
Subject: [PATCH 7/9] update

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 12521a319cfe..2bf5bbb73138 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -256,9 +256,12 @@ def get_mm_max_tokens(
         max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
             seq_len=seq_len, mm_counts=mm_counts)
         if max_tokens_per_item is not None:
-            total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
-                                  for k in max_tokens_per_item.keys()
-                                  & mm_counts.keys())
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
             if total_mm_tokens > seq_len:
                 raise ValueError(
                     "Pre-computed total number of multimodal tokens cannot be "

From 75767f09bdbeff494c0a391c4d305b70ea808696 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 15:53:28 -0700
Subject: [PATCH 8/9] debug

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 2bf5bbb73138..490a7350798e 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -264,10 +264,11 @@ def get_mm_max_tokens(
                                       & mm_counts.keys())
             if total_mm_tokens > seq_len:
                 raise ValueError(
-                    "Pre-computed total number of multimodal tokens cannot be "
-                    "greater than the sequence length. This is likely due to "
-                    "incorrect implementation of `get_max_tokens_per_item` in "
-                    "the model definition.")
+                    "Pre-computed total number of multimodal tokens "
+                    f"({total_mm_tokens}) cannot be greater than the sequence "
+                    f"length ({seq_len}). This is likely due to incorrect "
+                    "implementation of `get_max_tokens_per_item` in the model "
+                    "definition.")
             return max_tokens_per_item
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

From bf105d09c17fe359d73ec9b1dcd37bc254ce6014 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 16:13:56 -0700
Subject: [PATCH 9/9] warning

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 490a7350798e..67bcb31f23f7 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -263,12 +263,15 @@ def get_mm_max_tokens(
                                       for k in max_tokens_per_item.keys()
                                       & mm_counts.keys())
             if total_mm_tokens > seq_len:
-                raise ValueError(
-                    "Pre-computed total number of multimodal tokens "
-                    f"({total_mm_tokens}) cannot be greater than the sequence "
-                    f"length ({seq_len}). This is likely due to incorrect "
-                    "implementation of `get_max_tokens_per_item` in the model "
-                    "definition.")
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
             return max_tokens_per_item
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
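
Illustrative sketch (not taken from the patches above; the class name and token counts are hypothetical placeholders): a model's processing-info class opts in to the fast path introduced by this series by overriding `get_max_tokens_per_item` with pre-computed worst-case counts, so vLLM can skip generating and processing maximum-size dummy inputs at startup.

    from collections.abc import Mapping
    from typing import Optional

    from vllm.multimodal.processing import BaseProcessingInfo


    class MyModelProcessingInfo(BaseProcessingInfo):  # hypothetical model
        def get_max_tokens_per_item(
                self, seq_len: int,
                mm_counts: Optional[Mapping[str, int]]
        ) -> Optional[Mapping[str, int]]:
            # Returning pre-computed worst-case counts lets vLLM skip the slow
            # dummy-input profiling pass. The numbers below are placeholders;
            # a real model derives them from its vision configuration, as
            # Qwen2-VL does via get_max_image_tokens() and
            # get_max_video_tokens(seq_len, mm_counts) in this series.
            return {"image": 1024, "video": 8192}

As patches 6-9 note, the per-item values multiplied by the allowed item counts should stay within the model's maximum sequence length; otherwise vLLM logs a warning (earlier revisions in this series raised a ValueError).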