@@ -27,6 +27,7 @@
 import vllm  # noqa: F401
 from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
+from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 
 import vllm_ascend  # noqa: F401
3637 "Qwen/Qwen2.5-0.5B-Instruct" ,
3738 "Qwen/Qwen3-0.6B-Base" ,
3839]
39- MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct" ]
40+ MULTIMODALITY_VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct" ]
41+ MULTIMODALITY_AUDIO_MODELS = ["Qwen/Qwen2-Audio-7B-Instruct" ]
4042
4143QUANTIZATION_MODELS = [
4244 "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8" ,
4749 "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8" ,
4850]
4951
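+# Two short audio clips bundled with vLLM's test assets.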
+AUDIO_ASSETS = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
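+# Question to ask about a given number of audio clips in the prompt.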
+AUDIO_PROMPT_TEMPLATES = {
+    1: "What is recited in the audio?",
+    2: "What sport and what nursery rhyme are referenced?",
+}
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half", "float16"])
@@ -88,8 +95,8 @@ def test_quantization_models(model: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
-@pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
-def test_multimodal(model, prompt_template, vllm_runner):
+@pytest.mark.parametrize("model", MULTIMODALITY_VL_MODELS)
+def test_multimodal_vl(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \
         .pil_image.convert("RGB")
     img_questions = [
@@ -131,3 +138,52 @@ def test_models_topk() -> None:
                     enforce_eager=True,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate(example_prompts, sampling_params)
+
+
+def prepare_audio_inputs(audio_count: int):
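+    """Build a chat-style prompt and audio inputs for ``audio_count`` clips."""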
+    audio_prompt = "".join([
+        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+        for idx in range(audio_count)
+    ])
+    question = AUDIO_PROMPT_TEMPLATES[audio_count]
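+    # ChatML-style prompt for Qwen2-Audio: a system turn followed by a user
+    # turn carrying one <|AUDIO|> placeholder per clip.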
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    mm_data = {
+        "audio":
+        [asset.audio_and_sample_rate for asset in AUDIO_ASSETS[:audio_count]]
+    }
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+    return inputs
+
+
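+# Smoke test for audio inputs: run Qwen2-Audio over both clips with a combined
+# question and check only that generation completes.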
+@pytest.mark.parametrize("model", MULTIMODALITY_AUDIO_MODELS)
+@pytest.mark.parametrize("audio_count", [2])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_multimodal_audio(model: str, audio_count: int,
+                          max_tokens: int) -> None:
+    inputs = prepare_audio_inputs(audio_count)
+
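+    # Low temperature keeps sampling close to greedy decoding.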
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=max_tokens,
+                                     stop_token_ids=None)
+
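+    # limit_mm_per_prompt must admit at least `audio_count` audio items per
+    # prompt, or vLLM rejects the request during input processing.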
+    with VllmRunner(model,
+                    max_model_len=4096,
+                    max_num_seqs=5,
+                    enforce_eager=False,
+                    dtype="bfloat16",
+                    limit_mm_per_prompt={"audio": audio_count},
+                    gpu_memory_utilization=0.9) as vllm_model:
+        vllm_model.generate(inputs, sampling_params=sampling_params)