Commit 5b8c390
[Bugfix] Fix modality limits in vision language example (#17721)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent 7525d5f commit 5b8c390
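
Every runner in this example hard-coded limit_mm_per_prompt={"image": 1}, so when a video-capable runner was invoked with modality == "video", the per-prompt limit was declared for the wrong media type. The fix keys the dict on the modality argument that each runner already receives. Below is a minimal sketch of the corrected pattern; the helper name and engine settings are illustrative assumptions, not any single runner's exact configuration:

from vllm import EngineArgs

def make_engine_args(modality: str) -> EngineArgs:
    # Hypothetical helper showing the pattern this commit applies everywhere.
    # Keying the limit on `modality` declares it for the media type that is
    # actually sent, instead of always declaring it for "image".
    return EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",  # assumed example model
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},  # modality is "image" or "video"
    )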

File tree

1 file changed: +36 -36 lines changed

examples/offline_inference/vision_language.py

Lines changed: 36 additions & 36 deletions
@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"crop_to_patches": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     prompts = [
         f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
         model="Salesforce/blip2-opt-6.7b",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
         model="facebook/chameleon-7b",
         max_model_len=4096,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         trust_remote_code=True,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
         model="adept/fuyu-8b",
         max_model_len=2048,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"do_pan_and_scan": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         enforce_eager=True,
         hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 3 * 364
             },
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     prompts = [(
         f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 384
             },
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     prompts = [
         (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
         model="moonshotai/Kimi-VL-A3B-Instruct",
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-1.5-7b-hf",
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str],
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str],
     engine_args = EngineArgs(
         model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
         max_model_len=16384,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
         max_model_len=4096,
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     stop_token_ids = [128009]

@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=8192,
         max_num_seqs=2,
         tensor_parallel_size=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=8192,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=4,
         tensor_parallel_size=8,
         gpu_memory_utilization=0.4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [
@@ -706,7 +706,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         tensor_parallel_size=4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -738,7 +738,7 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         dtype="half",
         hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     placeholder = "<image>\n"
@@ -761,7 +761,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma-3b-mix-224",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -778,7 +778,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma2-3b-ft-docci-448",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -815,7 +815,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"num_crops": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -849,7 +849,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
         max_lora_rank=320,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"dynamic_hd": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -870,7 +870,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=6144,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -891,7 +891,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=1024,
         max_num_seqs=2,
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -916,7 +916,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
             "min_pixels": 28 * 28,
             "max_pixels": 1280 * 28 * 28,
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     if modality == "image":
@@ -951,7 +951,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
             "max_pixels": 1280 * 28 * 28,
             "fps": 1,
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     if modality == "image":
@@ -985,7 +985,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
             "max_pixels": 1280 * 28 * 28,
             "fps": [1],
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     if modality == "image":
@@ -1018,7 +1018,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
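
With the limit keyed on the active modality, the video paths through this example now declare the correct per-prompt limit. A hypothetical invocation that exercises a video runner, assuming the script's --model-type and --modality flags as defined in this version of the example:

python examples/offline_inference/vision_language.py \
    --model-type llava-next-video --modality video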
