 import vllm  # noqa: F401
 from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
+from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset

 import vllm_ascend  # noqa: F401
     "Qwen/Qwen2.5-0.5B-Instruct",
     "Qwen/Qwen3-0.6B-Base",
 ]
-MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+MULTIMODALITY_VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+MULTIMODALITY_AUDIO_MODELS = ["Qwen/Qwen2-Audio-7B-Instruct"]

 QUANTIZATION_MODELS = [
     "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
 ]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
+AUDIO_ASSETS = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
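+# Question to ask for each supported number of audio clips in a prompt.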
+AUDIO_PROMPT_TEMPLATES = {
+    1: "What is recited in the audio?",
+    2: "What sport and what nursery rhyme are referenced?"
+}


 @pytest.mark.parametrize("model", MODELS)
@@ -84,8 +91,8 @@ def test_quantization_models(model: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
-def test_multimodal(model, prompt_template, vllm_runner):
+@pytest.mark.parametrize("model", MULTIMODALITY_VL_MODELS)
+def test_multimodal_vl(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \
         .pil_image.convert("RGB")
     img_questions = [
@@ -108,6 +115,44 @@ def test_multimodal(model, prompt_template, vllm_runner):
                                    max_tokens=64)


+def prepare_audio_inputs(audio_count: int):
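+    """Build the Qwen2-Audio prompt and mm data for audio_count clips."""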
+    audio_prompt = "".join([
+        f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+        for idx in range(audio_count)
+    ])
+    question = AUDIO_PROMPT_TEMPLATES[audio_count]
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    mm_data = {
+        "audio":
+        [asset.audio_and_sample_rate for asset in AUDIO_ASSETS[:audio_count]]
+    }
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+    return inputs
+
+
+@pytest.mark.parametrize("model", MULTIMODALITY_AUDIO_MODELS)
+@pytest.mark.parametrize("audio_count", [2])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_multimodal_audio(model: str, audio_count: int,
+                          max_tokens: int) -> None:
+    inputs = prepare_audio_inputs(audio_count)
+
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=max_tokens,
+                                     stop_token_ids=None)
+
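+    # limit_mm_per_prompt must allow as many audio items as the prompt uses.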
+    with VllmRunner(model,
+                    max_model_len=4096,
+                    max_num_seqs=5,
+                    enforce_eager=False,
+                    limit_mm_per_prompt={"audio": audio_count},
+                    gpu_memory_utilization=0.9) as vllm_model:
+        vllm_model.generate(inputs, sampling_params=sampling_params)
+
+
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": "1"})
 def test_models_topk() -> None:
     example_prompts = [