diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index ae851c35e626..a158ae0511cd 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -893,10 +893,6 @@ Currently the PaliGemma model series is implemented without PrefixLM attention m
 `mistral-community/pixtral-12b` does not support V1 yet.
 :::
 
-:::{note}
-To use Qwen2.5-VL series models, you have to install Huggingface `transformers` library from source via `pip install git+https://github.com/huggingface/transformers`.
-:::
-
 ### Pooling Models
 
 See [this page](pooling-models) for more information on how to use pooling models.
diff --git a/requirements-common.txt b/requirements-common.txt
index 0514bf8adcaf..23b31c78f444 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,7 +6,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.48.2 # Required for Bamba model and Transformers backend.
+transformers >= 4.49.0 # Required for Qwen2.5-VL
 tokenizers >= 0.19.1 # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.107.0, < 0.113.0; python_version < '3.9'
diff --git a/requirements-test.in b/requirements-test.in
index 53c531360d87..bf44bece4f76 100644
--- a/requirements-test.in
+++ b/requirements-test.in
@@ -28,7 +28,7 @@ matplotlib # required for qwen-vl test
 mistral_common[opencv] >= 1.5.0 # required for pixtral test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.4 # required for model evaluation test
-transformers==4.48.2
+transformers==4.49.0
 # quantization
 bitsandbytes>=0.45.0
 buildkite-test-collector==0.1.9
@@ -38,4 +38,4 @@ tritonclient==2.51.0
 
 numpy < 2.0.0
 runai-model-streamer==0.11.0
-runai-model-streamer-s3==0.11.0
\ No newline at end of file
+runai-model-streamer-s3==0.11.0
diff --git a/requirements-test.txt b/requirements-test.txt
index 11f0e10969a6..19b5776b60de 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -628,7 +628,7 @@ tqdm==4.66.6
     #   transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.48.2
+transformers==4.49.0
     # via
     #   -r requirements-test.in
     #   genai-perf
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 2c66edb539dc..8484576e533e 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -121,25 +121,6 @@
                else ("half", "float")),
         marks=[pytest.mark.core_model],
     ),
-    # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
-    # once we upgraded to transformers>=4.49.0.
-    "qwen2_vl": VLMTestInfo(
-        models=["Qwen/Qwen2-VL-2B-Instruct"],
-        test_type=(
-            VLMTestType.IMAGE,
-            VLMTestType.MULTI_IMAGE,
-            VLMTestType.VIDEO
-        ),
-        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
-        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
-        max_model_len=4096,
-        max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
-        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
-    ),
     "qwen2_5_vl": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
         test_type=(
@@ -442,6 +423,23 @@
         vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
         prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
     ),
+    "qwen2_vl": VLMTestInfo(
+        models=["Qwen/Qwen2-VL-2B-Instruct"],
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[pytest.mark.cpu_model],
+    ),
     ### Tensor parallel / multi-gpu broadcast tests
     "chameleon-broadcast": VLMTestInfo(
         models=["facebook/chameleon-7b"],