From 3fe18934d7f33b98812d7402463fe171c24efc77 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 17 Jun 2025 16:54:31 +0000
Subject: [PATCH 1/9] [Multimodal] Optimize Qwen2/2.5-VL startup time

Signed-off-by: Woosuk Kwon
---
 vllm/model_executor/models/qwen2_vl.py |  3 +++
 vllm/multimodal/processing.py          | 12 ++++++++++++
 vllm/multimodal/profiling.py           |  5 ++++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 49b709069cd2..8e534f78da19 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -819,6 +819,9 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": 16384, "video": 98304}
+
     def _get_vision_info(
         self,
         *,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 5cfca57bffee..2521e088c7f3 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,6 +1100,18 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
+    def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens for each modality.
+        By default, returns `None`. When `None` is returned, vLLM will generate
+        dummy inputs (images/videos) at maximum possible sizes and process them
+        to determine the maximum token count per modality.
+        This approach works but can be very slow for certain models (e.g.,
+        Qwen2.5-VL), leading to a very long startup time. For better performance,
+        each model can override this method to return pre-computed maximum token
+        counts, avoiding the need for dummy input generation and processing.
+        """
+        return None
+
 
 _I = TypeVar("_I", bound=BaseProcessingInfo)
 
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 1faecb7bd24a..6de768feccad 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -253,6 +253,9 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item()
+        if max_tokens_per_item is not None:
+            return max_tokens_per_item
+        mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
         return self._get_mm_num_tokens(mm_inputs)

From 883bf7e5d3e3c7e9e8c3e1560045ea75563c19a4 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 18 Jun 2025 22:41:14 -0700
Subject: [PATCH 2/9] clarify and remove hardcoded values

Signed-off-by: Roger Wang
Signed-off-by: Roger Wang
---
 vllm/model_executor/models/qwen2_vl.py | 7 +++++--
 vllm/multimodal/processing.py          | 9 +++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 8e534f78da19..ea9d56323d09 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -819,8 +819,11 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(self) -> Mapping[str, int]:
-        return {"image": 16384, "video": 98304}
+    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+
+        max_image_tokens = self.get_max_image_tokens()
+        max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
+        return {"image": max_image_tokens, "video": max_video_tokens}
 
     def _get_vision_info(
         self,
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 2521e088c7f3..b91c9821ef88 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,8 +1100,8 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
-    def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
-        """Return the maximum number of tokens for each modality.
+    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+        """Return the maximum number of tokens per item for each modality.
         By default, returns `None`. When `None` is returned, vLLM will generate
         dummy inputs (images/videos) at maximum possible sizes and process them
         to determine the maximum token count per modality.
@@ -1109,6 +1109,11 @@ def get_max_tokens_per_item(self) -> Optional[Mapping[str, int]]:
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
+
+        NOTE: The resulting maximum number of tokens per item of each modality
+        respects the model's maximum sequence length and the maximum number of
+        items of each modality allowed.
+
         """
         return None

From f6250e7f06723282f1b22f0f7a7dd3d9a666a4d5 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 18 Jun 2025 22:46:27 -0700
Subject: [PATCH 3/9] comment

Signed-off-by: Roger Wang
---
 vllm/multimodal/processing.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index b91c9821ef88..008cd1e53e83 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1110,9 +1110,10 @@ def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) ->
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The resulting maximum number of tokens per item of each modality
-        respects the model's maximum sequence length and the maximum number of
-        items of each modality allowed.
+        NOTE: The maximum number of tokens per item of each modality returned from
+        this function should respect the model's maximum sequence length and the
+        maximum number of items of each modality allowed, and agree with dummy
+        inputs (images/videos) at maximum possible sizes.

From 9ec904b055b343481350d5ee0749ed930e720db1 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 18 Jun 2025 23:20:56 -0700
Subject: [PATCH 4/9] add missing kwarg

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/qwen2_vl.py | 4 +++-
 vllm/multimodal/processing.py          | 4 +++-
 vllm/multimodal/profiling.py           | 3 ++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index b2d75648d52d..6f78de1e6bb0 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -820,7 +820,9 @@ def get_image_processor(
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+    def get_max_tokens_per_item(
+        self, seq_len: int,
+        mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
 
         max_image_tokens = self.get_max_image_tokens()
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 008cd1e53e83..e4a557c8e50a 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1100,7 +1100,9 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
         return allowed_limits
 
-    def get_max_tokens_per_item(self, seq_len: int, mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+    def get_max_tokens_per_item(
+        self, seq_len: int,
+        mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
         """Return the maximum number of tokens per item for each modality.
         By default, returns `None`. When `None` is returned, vLLM will generate
         dummy inputs (images/videos) at maximum possible sizes and process them
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 6de768feccad..4c8c11416020 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -253,7 +253,8 @@ def get_mm_max_tokens(
         seq_len: int,
         mm_counts: Optional[Mapping[str, int]] = None,
     ) -> Mapping[str, int]:
-        max_tokens_per_item = self.processing_info.get_max_tokens_per_item()
+        max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
+            seq_len=seq_len, mm_counts=mm_counts)
         if max_tokens_per_item is not None:
             return max_tokens_per_item

From 85de93e98b3ecd85fc4f45a391e7d2c72082a64a Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 19 Jun 2025 02:38:53 -0700
Subject: [PATCH 5/9] typing

Signed-off-by: Roger Wang
---
 vllm/multimodal/processing.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index e4a557c8e50a..38f3a7cb932f 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1102,7 +1102,8 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
     def get_max_tokens_per_item(
         self, seq_len: int,
-        mm_counts: Mapping[str, int]) -> Optional[Mapping[str, int]]:
+        mm_counts: Optional[Mapping[str,
+                                    int]]) -> Optional[Mapping[str, int]]:
         """Return the maximum number of tokens per item for each modality.
         By default, returns `None`. When `None` is returned, vLLM will generate
         dummy inputs (images/videos) at maximum possible sizes and process them
@@ -1112,10 +1113,10 @@ def get_max_tokens_per_item(
         each model can override this method to return pre-computed maximum token
         counts, avoiding the need for dummy input generation and processing.
 
-        NOTE: The maximum number of tokens per item of each modality returned from
-        this function should respect the model's maximum sequence length and the
-        maximum number of items of each modality allowed, and agree with dummy
-        inputs (images/videos) at maximum possible sizes.
+        NOTE: The maximum number of tokens per item of each modality returned
+        from this function should respect the model's maximum sequence length
+        and the maximum number of items of each modality allowed, and agree
+        with dummy inputs (images/videos) at maximum possible sizes.

From fc97bb8dc31b1f7e93ce02c402118208b7583b84 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 11:06:23 -0700
Subject: [PATCH 6/9] update

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 4c8c11416020..12521a319cfe 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -256,6 +256,15 @@ def get_mm_max_tokens(
         max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
             seq_len=seq_len, mm_counts=mm_counts)
         if max_tokens_per_item is not None:
+            total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                  for k in max_tokens_per_item.keys()
+                                  & mm_counts.keys())
+            if total_mm_tokens > seq_len:
+                raise ValueError(
+                    "Pre-computed total number of multimodal tokens cannot be "
+                    "greater than the sequence length. This is likely due to "
+                    "incorrect implementation of `get_max_tokens_per_item` in "
+                    "the model definition.")
             return max_tokens_per_item
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

From a1fde53ea45b1096baf8f4d096db38b68ed82373 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 11:24:30 -0700
Subject: [PATCH 7/9] update

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 12521a319cfe..2bf5bbb73138 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -256,9 +256,12 @@ def get_mm_max_tokens(
         max_tokens_per_item = self.processing_info.get_max_tokens_per_item(
             seq_len=seq_len, mm_counts=mm_counts)
         if max_tokens_per_item is not None:
-            total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
-                                  for k in max_tokens_per_item.keys()
-                                  & mm_counts.keys())
+            if mm_counts is None:
+                total_mm_tokens = sum(max_tokens_per_item.values())
+            else:
+                total_mm_tokens = sum(max_tokens_per_item[k] * mm_counts[k]
+                                      for k in max_tokens_per_item.keys()
+                                      & mm_counts.keys())
             if total_mm_tokens > seq_len:
                 raise ValueError(
                     "Pre-computed total number of multimodal tokens cannot be "

From 75767f09bdbeff494c0a391c4d305b70ea808696 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 15:53:28 -0700
Subject: [PATCH 8/9] debug

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 2bf5bbb73138..490a7350798e 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -264,10 +264,11 @@ def get_mm_max_tokens(
                                       & mm_counts.keys())
             if total_mm_tokens > seq_len:
                 raise ValueError(
-                    "Pre-computed total number of multimodal tokens cannot be "
-                    "greater than the sequence length. This is likely due to "
-                    "incorrect implementation of `get_max_tokens_per_item` in "
-                    "the model definition.")
+                    "Pre-computed total number of multimodal tokens "
+                    f"({total_mm_tokens}) cannot be greater than the sequence "
+                    f"length ({seq_len}). This is likely due to incorrect "
+                    "implementation of `get_max_tokens_per_item` in the model "
+                    "definition.")
             return max_tokens_per_item
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)

From bf105d09c17fe359d73ec9b1dcd37bc254ce6014 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Fri, 20 Jun 2025 16:13:56 -0700
Subject: [PATCH 9/9] warning

Signed-off-by: Roger Wang
---
 vllm/multimodal/profiling.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 490a7350798e..67bcb31f23f7 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -263,12 +263,15 @@ def get_mm_max_tokens(
                                       for k in max_tokens_per_item.keys()
                                       & mm_counts.keys())
             if total_mm_tokens > seq_len:
-                raise ValueError(
-                    "Pre-computed total number of multimodal tokens "
-                    f"({total_mm_tokens}) cannot be greater than the sequence "
-                    f"length ({seq_len}). This is likely due to incorrect "
-                    "implementation of `get_max_tokens_per_item` in the model "
-                    "definition.")
+                logger.warning_once(
+                    "The sequence length (%d) is smaller than the pre-defined"
+                    " worst-case total number of multimodal tokens (%d). "
+                    "This may cause certain multi-modal inputs to fail during "
+                    "inference. To avoid this, you should increase "
+                    "`max_model_len` or reduce `mm_counts`.",
+                    seq_len,
+                    total_mm_tokens,
+                )
             return max_tokens_per_item
 
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts)
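
Illustrative sketch (not taken from the patches above; the class name and token counts are hypothetical placeholders): a model's processing-info class opts in to the fast path introduced by this series by overriding `get_max_tokens_per_item` with pre-computed worst-case counts, so vLLM can skip generating and processing maximum-size dummy inputs at startup.

    from collections.abc import Mapping
    from typing import Optional

    from vllm.multimodal.processing import BaseProcessingInfo


    class MyModelProcessingInfo(BaseProcessingInfo):  # hypothetical model
        def get_max_tokens_per_item(
                self, seq_len: int,
                mm_counts: Optional[Mapping[str, int]]
        ) -> Optional[Mapping[str, int]]:
            # Returning pre-computed worst-case counts lets vLLM skip the slow
            # dummy-input profiling pass. The numbers below are placeholders;
            # a real model derives them from its vision configuration, as
            # Qwen2-VL does via get_max_image_tokens() and
            # get_max_video_tokens(seq_len, mm_counts) in this series.
            return {"image": 1024, "video": 8192}

As patches 6-9 note, the per-item values multiplied by the allowed item counts should stay within the model's maximum sequence length; otherwise vLLM logs a warning (earlier revisions in this series raised a ValueError).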