 
 import os
 import re
+from collections.abc import Sequence
 from typing import Optional
 
+import librosa
 import pytest
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
 
+from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 
-from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
+                          PromptImageInput, VllmRunner)
 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
 
@@ -29,6 +33,8 @@
 # Since the vision-lora and speech-lora co-exist with the base model,
 # we have to manually specify the path of the lora weights.
 vision_lora_path = os.path.join(model_path, "vision-lora")
+speech_question = os.path.join(model_path, "examples",
+                               "what_is_shown_in_this_image.wav")
 models = [model_path]
 
 
@@ -64,7 +70,8 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str,
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    inputs: list[tuple[list[str], PromptImageInput]],
+    inputs: Sequence[tuple[list[str], PromptImageInput,
+                           Optional[PromptAudioInput]]],
     model: str,
     *,
     max_model_len: int,
@@ -104,28 +111,49 @@ def run_test(
             enforce_eager=True,
     ) as vllm_model:
         lora_request = LoRARequest("vision", 1, vision_lora_path)
-        vllm_model.model.llm_engine.add_lora(lora_request=lora_request)
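+        # pass the vision LoRA adapter with each generate call below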
         vllm_outputs_per_case = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
-                                                images=images)
-            for prompts, images in inputs
+                                                images=images,
+                                                audios=audios,
+                                                lora_request=lora_request)
+            for prompts, images, audios in inputs
         ]
 
-    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-    hf_model_kwargs = {"_attn_implementation": "eager"}
+    hf_model_kwargs = {"_attn_implementation": "sdpa"}
     with hf_runner(model, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
-        eos_token_id = hf_model.processor.tokenizer.eos_token_id
+
+        hf_processor = hf_model.processor
+        eos_token_id = hf_processor.tokenizer.eos_token_id
+
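+        # the HF runner passes a single `audio` array plus `sampling_rate`;
+        # wrap the processor so the pair is repackaged into the list of
+        # (waveform, sampling_rate) tuples that the processor consumes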
+        def patch_hf_processor(*args,
+                               text="",
+                               images=None,
+                               audio=None,
+                               sampling_rate=None,
+                               **kwargs):
+            audios = None
+            if audio is not None and sampling_rate is not None:
+                audios = [(audio, sampling_rate)]
+            return hf_processor(*args,
+                                text=text,
+                                images=images,
+                                audios=audios,
+                                **kwargs)
+
+        hf_model.processor = patch_hf_processor
+
         hf_outputs_per_case = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                      max_tokens,
                                                      num_logprobs=num_logprobs,
                                                      images=images,
+                                                     audios=audios,
                                                      eos_token_id=eos_token_id,
                                                      num_logits_to_keep=0)
-            for prompts, images in inputs
+            for prompts, images, audios in inputs
         ]
 
     for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
@@ -138,8 +166,6 @@ def run_test(
     )
 
 
-# Since we use _attn_implementation="eager" for hf_runner, there is more
-# significant numerical difference. The basic `logprobs=5` fails to pass.
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
     "size_factors",
@@ -151,7 +177,7 @@ def run_test(
         # Single-scale, batched
         [1.0, 1.0, 1.0],
         # Multi-scale
-        [0.7, 0.75, 1.0],
+        [0.25, 0.5, 1.0],
     ],
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
@@ -166,6 +192,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
     inputs_per_image = [(
         [prompt for _ in size_factors],
         [rescale_image_size(image, factor) for factor in size_factors],
+        None,
     ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     run_test(
@@ -201,17 +228,18 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
 @pytest.mark.parametrize("max_model_len", [10000])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [10])
-@pytest.mark.xfail(
-    reason="Phi-4-MM multi-image inference is divergent with hf model.")
 def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                              size_factors, dtype: str, max_model_len: int,
                              max_tokens: int, num_logprobs: int) -> None:
     images = [asset.pil_image for asset in image_assets]
 
     inputs_per_case = [
-        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-         [[rescale_image_size(image, factor) for image in images]
-          for factor in size_factors])
+        (
+            [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+            [[rescale_image_size(image, factor) for image in images]
+             for factor in size_factors],
+            None,
+        ),
     ]
 
     run_test(
@@ -226,3 +254,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
         mm_limit=2,
         tensor_parallel_size=1,
     )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
+                              max_model_len: int, max_tokens: int,
+                              num_logprobs: int) -> None:
+
+    # use the example speech question so that the model outputs are reasonable
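+    # librosa.load returns a (waveform, sample_rate) tuple; the whole tuple
+    # is passed as a single audio input below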
+    audio = librosa.load(speech_question, sr=None)
+    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+
+    inputs_vision_speech = [
+        (
+            ["<|user|><|image_1|><|audio_1|><|end|><|assistant|>"],
+            [image],
+            [audio],
+        ),
+    ]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_vision_speech,
+        model,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )