Skip to content

Commit 803d5c3

Browse files
[V1] Override mm_counts for dummy data creation (#15703)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent: 7fd8c0f | commit: 803d5c3

File tree

9 files changed: +114 additions, -93 deletions (lines changed)

9 files changed

+114
-93
lines changed

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 2 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -385,49 +385,25 @@
385385
),
386386
"minicpmo_26": VLMTestInfo(
387387
models=["openbmb/MiniCPM-o-2_6"],
388-
test_type=(VLMTestType.IMAGE),
389-
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
390-
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
391-
max_model_len=4096,
392-
max_num_seqs=2,
393-
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
394-
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
395-
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
396-
),
397-
"minicpmo_26_multi_image": VLMTestInfo(
398-
models=["openbmb/MiniCPM-o-2_6"],
399-
test_type=(VLMTestType.MULTI_IMAGE),
388+
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
400389
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
401390
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
402391
max_model_len=4096,
403392
max_num_seqs=2,
404393
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
405394
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
406395
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
407-
marks=[large_gpu_mark(min_gb=32)],
408396
),
409397
"minicpmv_26": VLMTestInfo(
410398
models=["openbmb/MiniCPM-V-2_6"],
411-
test_type=(VLMTestType.IMAGE),
412-
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
413-
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
414-
max_model_len=4096,
415-
max_num_seqs=2,
416-
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
417-
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
418-
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
419-
),
420-
"minicpmv_26_multi_image": VLMTestInfo(
421-
models=["openbmb/MiniCPM-V-2_6"],
422-
test_type=(VLMTestType.MULTI_IMAGE),
399+
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
423400
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
424401
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
425402
max_model_len=4096,
426403
max_num_seqs=2,
427404
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
428405
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
429406
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
430-
marks=[large_gpu_mark(min_gb=32)],
431407
),
432408
"molmo": VLMTestInfo(
433409
models=["allenai/Molmo-7B-D-0924"],

vllm/model_executor/models/llava_next_video.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,8 @@ def get_mm_max_tokens_per_item(
7171
max_video_tokens = self.get_num_video_tokens(
7272
image_width=target_width,
7373
image_height=target_height,
74-
num_frames=self.get_num_frames_with_most_features(seq_len),
74+
num_frames=self.get_num_frames_with_most_features(
75+
seq_len, mm_counts),
7576
)
7677

7778
return {"video": max_video_tokens}
@@ -130,9 +131,12 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
130131

131132
return num_frames
132133

133-
def get_num_frames_with_most_features(self, seq_len: int) -> int:
134-
mm_config = self.ctx.get_mm_config()
135-
max_videos = mm_config.get_limit_per_prompt("video")
134+
def get_num_frames_with_most_features(
135+
self,
136+
seq_len: int,
137+
mm_counts: Mapping[str, int],
138+
) -> int:
139+
max_videos = mm_counts.get("video", 0)
136140

137141
max_total_frames = self._get_max_video_frames(seq_len)
138142

@@ -155,7 +159,7 @@ def get_dummy_processor_inputs(
155159
target_width, target_height = \
156160
self.info.get_image_size_with_most_features()
157161
target_num_frames = \
158-
self.info.get_num_frames_with_most_features(seq_len)
162+
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
159163

160164
mm_data = {
161165
"video":

vllm/model_executor/models/llava_onevision.py

Lines changed: 17 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -108,7 +108,7 @@ def get_mm_max_tokens_per_item(
108108
) -> Mapping[str, int]:
109109
return {
110110
"image": self.get_max_image_tokens(),
111-
"video": self.get_max_video_tokens(seq_len),
111+
"video": self.get_max_video_tokens(seq_len, mm_counts),
112112
}
113113

114114
# Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
@@ -202,10 +202,13 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
202202

203203
return num_frames
204204

205-
def get_num_frames_with_most_features(self, seq_len: int) -> int:
206-
mm_config = self.ctx.get_mm_config()
207-
max_images = mm_config.get_limit_per_prompt("image")
208-
max_videos = mm_config.get_limit_per_prompt("video")
205+
def get_num_frames_with_most_features(
206+
self,
207+
seq_len: int,
208+
mm_counts: Mapping[str, int],
209+
) -> int:
210+
max_images = mm_counts.get("image", 0)
211+
max_videos = mm_counts.get("video", 0)
209212

210213
max_image_tokens = self.get_max_image_tokens() * max_images
211214
max_total_frames = self._get_max_video_frames(seq_len -
@@ -215,13 +218,18 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int:
215218

216219
return max(max_frames_per_video, 1)
217220

218-
def get_max_video_tokens(self, seq_len: int) -> int:
221+
def get_max_video_tokens(
222+
self,
223+
seq_len: int,
224+
mm_counts: Mapping[str, int],
225+
) -> int:
219226
target_width, target_height = self.get_image_size_with_most_features()
220227

221228
return self.get_num_video_tokens(
222229
image_width=target_width,
223230
image_height=target_height,
224-
num_frames=self.get_num_frames_with_most_features(seq_len),
231+
num_frames=self.get_num_frames_with_most_features(
232+
seq_len, mm_counts),
225233
)
226234

227235

@@ -243,7 +251,8 @@ def get_dummy_processor_inputs(
243251
target_width, target_height = \
244252
self.info.get_image_size_with_most_features()
245253
target_num_frames = \
246-
self.info.get_num_frames_with_most_features(seq_len)
254+
self.info.get_num_frames_with_most_features(seq_len,
255+
mm_counts)
247256

248257
mm_data = {
249258
"image":

vllm/model_executor/models/minicpmo.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,8 @@
4343
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
4444
from vllm.multimodal.profiling import ProcessorInputs
4545

46-
from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder,
46+
from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
47+
MiniCPMVDummyInputsBuilder,
4748
MiniCPMVMultiModalDataParser,
4849
MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo,
4950
_minicpmv_field_config)
@@ -203,30 +204,33 @@ def get_max_audio_chunks_with_most_features(self) -> int:
203204
return 30
204205

205206
def get_max_audio_tokens(self) -> int:
206-
return self.get_max_audio_tokens_per_chunk(
207-
) * self.get_max_audio_chunks_with_most_features()
207+
num_chunks = self.get_max_audio_chunks_with_most_features()
208+
return self.get_max_audio_tokens_per_chunk() * num_chunks
208209

209210
def get_audio_len_by_num_chunks(self, num_chunks: int) -> int:
210211
sampling_rate = self.get_default_audio_sampling_rate()
211212
# exclude <audio> </audio>
212213
num_tokens_per_chunk = self.get_max_audio_tokens_per_chunk() - 2
213214
return int(num_chunks * sampling_rate / num_tokens_per_chunk) + 1
214215

215-
def get_num_frames_with_most_features(self, seq_len: int) -> int:
216-
mm_config = self.ctx.get_mm_config()
217-
max_images = mm_config.get_limit_per_prompt("image")
218-
max_videos = mm_config.get_limit_per_prompt("video")
219-
max_audios = mm_config.get_limit_per_prompt("audio")
216+
def get_num_frames_with_most_features(
217+
self,
218+
seq_len: int,
219+
mm_counts: Mapping[str, int],
220+
) -> int:
221+
max_images = mm_counts.get("image", 0)
222+
max_videos = mm_counts.get("video", 0)
223+
max_audios = mm_counts.get("audio", 0)
220224

221225
max_image_tokens = self.get_max_image_tokens() * max_images
222226
max_audio_tokens = self.get_max_audio_tokens() * max_audios
223227
max_total_frames = self.get_max_video_frames(seq_len -
224228
max_image_tokens -
225229
max_audio_tokens)
230+
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
231+
_MAX_FRAMES_PER_VIDEO)
226232

227-
num_frames = max(max_total_frames // max(max_videos, 1), 1)
228-
229-
return num_frames
233+
return max(max_frames_per_video, 1)
230234

231235

232236
class MiniCPMODummyInputsBuilder(

vllm/model_executor/models/minicpmv.py

Lines changed: 24 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,9 @@
6969
merge_multimodal_embeddings)
7070
from .vision import scatter_patch_features, select_patch_features
7171

72+
# For profile run
73+
_MAX_FRAMES_PER_VIDEO = 16
74+
7275

7376
class MiniCPMVImagePixelInputs(TypedDict):
7477
type: Literal["pixel_values"]
@@ -369,7 +372,8 @@ def get_mm_max_tokens_per_item(
369372
) -> Mapping[str, int]:
370373
mm_max_tokens = {"image": self.get_max_image_tokens()}
371374
if self.get_model_version() == (2, 6):
372-
mm_max_tokens["video"] = self.get_max_video_tokens(seq_len)
375+
mm_max_tokens["video"] = self.get_max_video_tokens(
376+
seq_len, mm_counts)
373377

374378
return mm_max_tokens
375379

@@ -432,9 +436,14 @@ def get_max_video_frame_tokens(self) -> int:
432436
use_image_id=False,
433437
)
434438

435-
def get_max_video_tokens(self, seq_len: int) -> int:
436-
return self.get_max_video_frame_tokens(
437-
) * self.get_num_frames_with_most_features(seq_len)
439+
def get_max_video_tokens(
440+
self,
441+
seq_len: int,
442+
mm_counts: Mapping[str, int],
443+
) -> int:
444+
num_frames = self.get_num_frames_with_most_features(seq_len, mm_counts)
445+
num_video_tokens_total = self.get_max_video_frame_tokens() * num_frames
446+
return num_video_tokens_total
438447

439448
def get_video_max_slice_num(self) -> int:
440449
return 1
@@ -449,18 +458,21 @@ def get_max_video_frames(self, max_tokens: int) -> int:
449458
num_frames = max_tokens // num_frame_tokens
450459
return num_frames
451460

452-
def get_num_frames_with_most_features(self, seq_len: int) -> int:
453-
mm_config = self.ctx.get_mm_config()
454-
max_images = mm_config.get_limit_per_prompt("image")
455-
max_videos = mm_config.get_limit_per_prompt("video")
461+
def get_num_frames_with_most_features(
462+
self,
463+
seq_len: int,
464+
mm_counts: Mapping[str, int],
465+
) -> int:
466+
max_images = mm_counts.get("image", 0)
467+
max_videos = mm_counts.get("video", 0)
456468

457469
max_image_tokens = self.get_max_image_tokens() * max_images
458470
max_total_frames = self.get_max_video_frames(seq_len -
459471
max_image_tokens)
472+
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
473+
_MAX_FRAMES_PER_VIDEO)
460474

461-
num_frames = max(max_total_frames // max(max_videos, 1), 1)
462-
463-
return num_frames
475+
return max(max_frames_per_video, 1)
464476

465477

466478
_I = TypeVar("_I",
@@ -483,7 +495,7 @@ def get_dummy_processor_inputs(
483495
video_width, video_height = \
484496
self.info.get_video_frame_size_with_most_features()
485497
num_video_frames = \
486-
self.info.get_num_frames_with_most_features(seq_len)
498+
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
487499

488500
mm_data = {
489501
"image":

vllm/model_executor/models/qwen2_vl.py

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -806,7 +806,7 @@ def get_image_processor(
806806
max_pixels: Optional[int] = None,
807807
size: Optional[dict[str, int]] = None,
808808
**kwargs: object,
809-
):
809+
) -> Qwen2VLImageProcessor:
810810
return cached_image_processor_from_config(
811811
self.ctx.model_config,
812812
**self._get_image_processor_kwargs(min_pixels=min_pixels,
@@ -825,7 +825,7 @@ def get_mm_max_tokens_per_item(
825825
) -> Mapping[str, int]:
826826
return {
827827
"image": self.get_max_image_tokens(),
828-
"video": self.get_max_video_tokens(seq_len),
828+
"video": self.get_max_video_tokens(seq_len, mm_counts),
829829
}
830830

831831
def _get_vision_info(
@@ -941,10 +941,13 @@ def _get_max_video_frames(self, max_tokens: int) -> int:
941941

942942
return num_frames
943943

944-
def get_num_frames_with_most_features(self, seq_len: int) -> int:
945-
mm_config = self.ctx.get_mm_config()
946-
max_images = mm_config.get_limit_per_prompt("image")
947-
max_videos = mm_config.get_limit_per_prompt("video")
944+
def get_num_frames_with_most_features(
945+
self,
946+
seq_len: int,
947+
mm_counts: Mapping[str, int],
948+
) -> int:
949+
max_images = mm_counts.get("image", 0)
950+
max_videos = mm_counts.get("video", 0)
948951

949952
max_image_tokens = self.get_max_image_tokens() * max_images
950953
max_total_frames = self._get_max_video_frames(seq_len -
@@ -954,13 +957,18 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int:
954957

955958
return max(max_frames_per_video, 1)
956959

957-
def get_max_video_tokens(self, seq_len: int) -> int:
960+
def get_max_video_tokens(
961+
self,
962+
seq_len: int,
963+
mm_counts: Mapping[str, int],
964+
) -> int:
958965
target_width, target_height = self.get_image_size_with_most_features()
959966

960967
return self.get_num_video_tokens(
961968
image_width=target_width,
962969
image_height=target_height,
963-
num_frames=self.get_num_frames_with_most_features(seq_len),
970+
num_frames=self.get_num_frames_with_most_features(
971+
seq_len, mm_counts),
964972
image_processor=None,
965973
)
966974

@@ -982,7 +990,7 @@ def get_dummy_processor_inputs(
982990
target_width, target_height = \
983991
self.info.get_image_size_with_most_features()
984992
target_num_frames = \
985-
self.info.get_num_frames_with_most_features(seq_len)
993+
self.info.get_num_frames_with_most_features(seq_len, mm_counts)
986994

987995
mm_data = {
988996
"image":

0 commit comments

Comments
 (0)