@@ -8,9 +8,7 @@
 from pathlib import PosixPath
 
 import pytest
-from packaging.version import Version
 from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
-from transformers import __version__ as TRANSFORMERS_VERSION
 
 from vllm.platforms import current_platform
 from vllm.utils import identity
@@ -126,25 +124,6 @@
         dtype="bfloat16",
         marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")],  # noqa: E501
     ),
-    # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
-    # once we upgraded to transformers>=4.49.0.
-    "qwen2_vl": VLMTestInfo(
-        models=["Qwen/Qwen2-VL-2B-Instruct"],
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
-        ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
-        max_model_len=4096,
-        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
-        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
-    ),
     "qwen2_5_vl": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
         test_type=(
@@ -218,12 +197,6 @@
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
         stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],  # noqa: E501
         image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
-        marks=[
-            pytest.mark.skipif(
-                Version(TRANSFORMERS_VERSION) >= Version("4.48"),
-                reason="HF model is not compatible with transformers>=4.48",
-            )
-        ],
     ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
@@ -336,6 +309,7 @@
         prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
         num_video_frames=16,
         max_model_len=16384,
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
         auto_cls=AutoModelForVision2Seq,
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
@@ -365,12 +339,6 @@
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
         patch_hf_runner=model_utils.mantis_patch_hf_runner,
-        marks=[
-            pytest.mark.skipif(
-                Version(TRANSFORMERS_VERSION) >= Version("4.48"),
-                reason="HF model is not compatible with transformers>=4.48",
-            )
-        ],
     ),
     "minicpmv_25": VLMTestInfo(
         models=["openbmb/MiniCPM-Llama3-V-2_5"],
@@ -450,6 +418,23 @@
         vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
     ),
+    "qwen2_vl": VLMTestInfo(
+        models=["Qwen/Qwen2-VL-2B-Instruct"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.cpu_model],
+    ),
     "skywork_r1v": VLMTestInfo(
         models=["Skywork/Skywork-R1V-38B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
@@ -515,6 +500,7 @@
         max_model_len=16384,
         max_num_seqs=2,
         auto_cls=AutoModelForVision2Seq,
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(