# SPDX-License-Identifier: Apache-2.0
"""
-This example shows how to use vLLM for running offline inference
+This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
+
import os
from dataclasses import asdict
from typing import NamedTuple, Optional
question_per_audio_count = {
    0: "What is 1+1?",
    1: "What is recited in the audio?",
-    2: "What sport and what nursery rhyme are referenced?"
+    2: "What sport and what nursery rhyme are referenced?",
}

@@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
# MiniCPM-O
def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
    model_name = "openbmb/MiniCPM-o-2_6"
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
@@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
        limit_mm_per_prompt={"audio": audio_count},
    )

-    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    audio_placeholder = "(<audio>./</audio>)" * audio_count
    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
-    messages = [{
-        'role': 'user',
-        'content': f'{audio_placeholder}\n{question}'
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True,
-                                           chat_template=audio_chat_template)
+    messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}]
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        chat_template=audio_chat_template,
+    )

    return ModelRequestData(
        engine_args=engine_args,
@@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    speech_lora_path = os.path.join(model_path, "speech-lora")
-    placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
+    placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)])

    prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

@@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
        limit_mm_per_prompt={"audio": audio_count},
    )

-    audio_in_prompt = "".join([
-        f"Audio {idx+1}: "
-        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
-    ])
+    audio_in_prompt = "".join(
+        [
+            f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+            for idx in range(audio_count)
+        ]
+    )

-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              "<|im_start|>user\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )

    return ModelRequestData(
        engine_args=engine_args,
@@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int):
        limit_mm_per_prompt={"audio": audio_count},
    )

-    audio_in_prompt = "".join([
-        "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
-    ])
+    audio_in_prompt = "".join(
+        ["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)]
+    )

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
-        "generating text and speech.")
+        "generating text and speech."
+    )

-    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
-              "<|im_start|>user\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
@@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [{
-        'role': 'user',
-        'content': "<|audio|>\n" * audio_count + question
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )

    engine_args = EngineArgs(
        model=model_name,
@@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:

# Whisper
def run_whisper(question: str, audio_count: int) -> ModelRequestData:
-    assert audio_count == 1, (
-        "Whisper only support single audio input per prompt")
+    assert audio_count == 1, "Whisper only support single audio input per prompt"
    model_name = "openai/whisper-large-v3-turbo"

    prompt = "<|startoftranscript|>"
@@ -252,27 +254,33 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:

def parse_args():
    parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'audio language models')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="ultravox",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=1,
-                        help='Number of prompts to run.')
-    parser.add_argument("--num-audios",
-                        type=int,
-                        default=1,
-                        choices=[0, 1, 2],
-                        help="Number of audio items per prompt.")
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "audio language models"
+    )
+    parser.add_argument(
+        "--model-type",
+        "-m",
+        type=str,
+        default="ultravox",
+        choices=model_example_map.keys(),
+        help='Huggingface "model_type".',
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=1, help="Number of prompts to run."
+    )
+    parser.add_argument(
+        "--num-audios",
+        type=int,
+        default=1,
+        choices=[0, 1, 2],
+        help="Number of audio items per prompt.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )

    return parser.parse_args()

@@ -283,29 +291,30 @@ def main(args):
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
-    req_data = model_example_map[model](question_per_audio_count[audio_count],
-                                        audio_count)
+    req_data = model_example_map[model](
+        question_per_audio_count[audio_count], audio_count
+    )

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=64,
-                                     stop_token_ids=req_data.stop_token_ids)
+    sampling_params = SamplingParams(
+        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+    )

    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
-                asset.audio_and_sample_rate
-                for asset in audio_assets[:audio_count]
+                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
            ]
        }

@@ -315,8 +324,9 @@ def main(args):
    # Batch inference
    inputs = [inputs] * args.num_prompts
    # Add LoRA request if applicable
-    lora_request = (req_data.lora_requests *
-                    args.num_prompts if req_data.lora_requests else None)
+    lora_request = (
+        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
+    )

    outputs = llm.generate(
        inputs,
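
Since the hunks above only show the reformatted fragments, here is a minimal, self-contained sketch of how the pieces fit together at run time. It assumes the default "ultravox" model used by run_ultravox() and one clip from vLLM's bundled audio assets; the max_model_len value and the question string are illustrative choices, not taken from this commit.

from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

# Build the prompt the same way run_ultravox() does: one "<|audio|>"
# placeholder per audio item, rendered through the model's chat template.
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{"role": "user", "content": "<|audio|>\nWhat is recited in the audio?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Allow exactly one audio item and disable the other modalities,
# mirroring the default_limits handling in main().
llm = LLM(
    model=model_name,
    max_model_len=4096,  # illustrative value
    limit_mm_per_prompt={"image": 0, "video": 0, "audio": 1},
)

# audio_and_sample_rate is the same (waveform, sample_rate) tuple used above.
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"audio": [audio]}},
    SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)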