@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"crop_to_patches": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     prompts = [
         f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
         model="Salesforce/blip2-opt-6.7b",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
         model="facebook/chameleon-7b",
         max_model_len=4096,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         trust_remote_code=True,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
         model="adept/fuyu-8b",
         max_model_len=2048,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"do_pan_and_scan": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         enforce_eager=True,
         hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
287287 "longest_edge" : 3 * 364
288288 },
289289 },
290- limit_mm_per_prompt = {"image" : 1 },
290+ limit_mm_per_prompt = {modality : 1 },
291291 )
292292 prompts = [(
293293 f"<|begin_of_text|>User:<image>{ question } <end_of_utterance>\n Assistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
314314 "longest_edge" : 384
315315 },
316316 },
317- limit_mm_per_prompt = {"image" : 1 },
317+ limit_mm_per_prompt = {modality : 1 },
318318 )
319319 prompts = [
320320 (f"<|im_start|>User:<image>{ question } <end_of_utterance>\n Assistant:" )
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
         model="moonshotai/Kimi-VL-A3B-Instruct",
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-1.5-7b-hf",
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str],
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str],
     engine_args = EngineArgs(
         model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
         max_model_len=16384,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
         max_model_len=4096,
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     stop_token_ids = [128009]
 
@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=8192,
         max_num_seqs=2,
         tensor_parallel_size=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=8192,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=4,
         tensor_parallel_size=8,
         gpu_memory_utilization=0.4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [
@@ -706,7 +706,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         tensor_parallel_size=4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -738,7 +738,7 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         dtype="half",
         hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     placeholder = "<image>\n"
@@ -761,7 +761,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma-3b-mix-224",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -778,7 +778,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma2-3b-ft-docci-448",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -815,7 +815,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"num_crops": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -849,7 +849,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
         max_lora_rank=320,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"dynamic_hd": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     return ModelRequestData(
@@ -870,7 +870,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=6144,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -891,7 +891,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=1024,
         max_num_seqs=2,
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -916,7 +916,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
916916 "min_pixels" : 28 * 28 ,
917917 "max_pixels" : 1280 * 28 * 28 ,
918918 },
919- limit_mm_per_prompt = {"image" : 1 },
919+ limit_mm_per_prompt = {modality : 1 },
920920 )
921921
922922 if modality == "image" :
@@ -951,7 +951,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
951951 "max_pixels" : 1280 * 28 * 28 ,
952952 "fps" : 1 ,
953953 },
954- limit_mm_per_prompt = {"image" : 1 },
954+ limit_mm_per_prompt = {modality : 1 },
955955 )
956956
957957 if modality == "image" :
@@ -985,7 +985,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
985985 "max_pixels" : 1280 * 28 * 28 ,
986986 "fps" : [1 ],
987987 },
988- limit_mm_per_prompt = {"image" : 1 },
988+ limit_mm_per_prompt = {modality : 1 },
989989 )
990990
991991 if modality == "image" :
@@ -1018,7 +1018,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name,
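
Note on the pattern this commit applies throughout: limit_mm_per_prompt is keyed by modality name, and each value is the maximum number of items of that modality allowed per prompt. Hardcoding the "image" key therefore breaks any runner invoked with modality="video". Below is a minimal sketch of the parameterized form, assuming vLLM's EngineArgs API; the make_engine_args helper is hypothetical and not part of this commit.

# Minimal sketch: thread the requested modality into the per-prompt limit,
# instead of hardcoding {"image": 1} as the old code did.
from vllm import EngineArgs


def make_engine_args(model: str, modality: str) -> EngineArgs:
    # The example runners in this file handle "image" and "video".
    assert modality in ("image", "video")
    return EngineArgs(
        model=model,
        max_model_len=8192,
        max_num_seqs=2,
        # Key follows the active modality, e.g. {"video": 1} for video runs.
        limit_mm_per_prompt={modality: 1},
    )

For example, make_engine_args("llava-hf/LLaVA-NeXT-Video-7B-hf", "video") yields limit_mm_per_prompt={"video": 1}, which is what the video path of that runner needs.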