|
8 | 8 | from pathlib import PosixPath |
9 | 9 |
|
10 | 10 | import pytest |
11 | | -from packaging.version import Version |
12 | 11 | from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq |
13 | | -from transformers import __version__ as TRANSFORMERS_VERSION |
14 | 12 |
|
15 | 13 | from vllm.platforms import current_platform |
16 | 14 | from vllm.utils import identity |
|
126 | 124 | dtype="bfloat16", |
127 | 125 | marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501 |
128 | 126 | ), |
129 | | - # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL |
130 | | - # once we upgraded to transformers>=4.49.0. |
131 | | - "qwen2_vl": VLMTestInfo( |
132 | | - models=["Qwen/Qwen2-VL-2B-Instruct"], |
133 | | - test_type=( |
134 | | - VLMTestType.IMAGE, |
135 | | - VLMTestType.MULTI_IMAGE, |
136 | | - VLMTestType.VIDEO |
137 | | - ), |
138 | | - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 |
139 | | - img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 |
140 | | - video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 |
141 | | - max_model_len=4096, |
142 | | - max_num_seqs=2, |
143 | | - auto_cls=AutoModelForVision2Seq, |
144 | | - vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, |
145 | | - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], |
146 | | - marks=[pytest.mark.core_model, pytest.mark.cpu_model], |
147 | | - ), |
148 | 127 | "qwen2_5_vl": VLMTestInfo( |
149 | 128 | models=["Qwen/Qwen2.5-VL-3B-Instruct"], |
150 | 129 | test_type=( |
|
218 | 197 | hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, |
219 | 198 | stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 |
220 | 199 | image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], |
221 | | - marks=[ |
222 | | - pytest.mark.skipif( |
223 | | - Version(TRANSFORMERS_VERSION) >= Version("4.48"), |
224 | | - reason="HF model is not compatible with transformers>=4.48", |
225 | | - ) |
226 | | - ], |
227 | 200 | ), |
228 | 201 | "fuyu": VLMTestInfo( |
229 | 202 | models=["adept/fuyu-8b"], |
|
336 | 309 | prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 |
337 | 310 | num_video_frames=16, |
338 | 311 | max_model_len=16384, |
| 312 | + hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 |
339 | 313 | auto_cls=AutoModelForVision2Seq, |
340 | 314 | vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, |
341 | 315 | custom_test_opts=[CustomTestOptions( |
|
365 | 339 | auto_cls=AutoModelForImageTextToText, |
366 | 340 | vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, |
367 | 341 | patch_hf_runner=model_utils.mantis_patch_hf_runner, |
368 | | - marks=[ |
369 | | - pytest.mark.skipif( |
370 | | - Version(TRANSFORMERS_VERSION) >= Version("4.48"), |
371 | | - reason="HF model is not compatible with transformers>=4.48", |
372 | | - ) |
373 | | - ], |
374 | 342 | ), |
375 | 343 | "minicpmv_25": VLMTestInfo( |
376 | 344 | models=["openbmb/MiniCPM-Llama3-V-2_5"], |
|
450 | 418 | vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, |
451 | 419 | prompt_path_encoder=model_utils.qwen_prompt_path_encoder, |
452 | 420 | ), |
| 421 | + "qwen2_vl": VLMTestInfo( |
| 422 | + models=["Qwen/Qwen2-VL-2B-Instruct"], |
| 423 | + test_type=( |
| 424 | + VLMTestType.IMAGE, |
| 425 | + VLMTestType.MULTI_IMAGE, |
| 426 | + VLMTestType.VIDEO |
| 427 | + ), |
| 428 | + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 |
| 429 | + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 |
| 430 | + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 |
| 431 | + max_model_len=4096, |
| 432 | + max_num_seqs=2, |
| 433 | + auto_cls=AutoModelForVision2Seq, |
| 434 | + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, |
| 435 | + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], |
| 436 | + marks=[pytest.mark.cpu_model], |
| 437 | + ), |
453 | 438 | "skywork_r1v": VLMTestInfo( |
454 | 439 | models=["Skywork/Skywork-R1V-38B"], |
455 | 440 | test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), |
|
515 | 500 | max_model_len=16384, |
516 | 501 | max_num_seqs=2, |
517 | 502 | auto_cls=AutoModelForVision2Seq, |
| 503 | + hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 |
518 | 504 | vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, |
519 | 505 | custom_test_opts=[CustomTestOptions( |
520 | 506 | inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( |
|
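Note: the `skipif` marks deleted above (for `deepseek_vl2` and `mantis`) both used the same `packaging`-based version gate against the installed `transformers` release; removing them is what also lets the first hunk drop the `Version` and `TRANSFORMERS_VERSION` imports. For reference, a minimal standalone sketch of that gating pattern is shown below — the test function name is illustrative only and not part of this file:

```python
import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION

# Skip the test when the installed transformers release is too new for the
# HF reference model -- the same guard style the removed marks expressed
# inline inside their VLMTestInfo entries.
requires_transformers_below_4_48 = pytest.mark.skipif(
    Version(TRANSFORMERS_VERSION) >= Version("4.48"),
    reason="HF model is not compatible with transformers>=4.48",
)


@requires_transformers_below_4_48
def test_hf_reference_model():  # hypothetical test, for illustration only
    ...
```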