
Commit b469f95

Isotr0py authored and Akshat-Tripathi committed

[Doc] Consolidate whisper and florence2 examples (vllm-project#14050)

1 parent 0daae74 · commit b469f95

File tree

5 files changed: +210 -148 lines changed


examples/offline_inference/audio_language.py

Lines changed: 50 additions & 32 deletions
@@ -24,25 +24,30 @@
 # Unless specified, these settings have been tested to work on a single L4.
 
 
-# Ultravox 0.5-1B
-def run_ultravox(question: str, audio_count: int):
-    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
+# MiniCPM-O
+def run_minicpmo(question: str, audio_count: int):
+    model_name = "openbmb/MiniCPM-o-2_6"
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    llm = LLM(model=model_name,
+              trust_remote_code=True,
+              max_model_len=4096,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    audio_placeholder = "(<audio>./</audio>)" * audio_count
+    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
     messages = [{
         'role': 'user',
-        'content': "<|audio|>\n" * audio_count + question
+        'content': f'{audio_placeholder}\n{question}'
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
-                                           add_generation_prompt=True)
-
-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=5,
-              trust_remote_code=True,
-              limit_mm_per_prompt={"audio": audio_count})
-    stop_token_ids = None
+                                           add_generation_prompt=True,
+                                           chat_template=audio_chat_template)
     return llm, prompt, stop_token_ids
 
 
@@ -68,36 +73,49 @@ def run_qwen2_audio(question: str, audio_count: int):
     return llm, prompt, stop_token_ids
 
 
-def run_minicpmo(question: str, audio_count: int):
-    model_name = "openbmb/MiniCPM-o-2_6"
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    llm = LLM(model=model_name,
-              trust_remote_code=True,
-              max_model_len=4096,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
-
-    stop_tokens = ['<|im_end|>', '<|endoftext|>']
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+# Ultravox 0.5-1B
+def run_ultravox(question: str, audio_count: int):
+    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
-    audio_placeholder = "(<audio>./</audio>)" * audio_count
-    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     messages = [{
         'role': 'user',
-        'content': f'{audio_placeholder}\n{question}'
+        'content': "<|audio|>\n" * audio_count + question
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
-                                           add_generation_prompt=True,
-                                           chat_template=audio_chat_template)
+                                           add_generation_prompt=True)
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=5,
+              trust_remote_code=True,
+              limit_mm_per_prompt={"audio": audio_count})
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# Whisper
+def run_whisper(question: str, audio_count: int):
+    assert audio_count == 1, (
+        "Whisper only support single audio input per prompt")
+    model_name = "openai/whisper-large-v3-turbo"
+
+    prompt = "<|startoftranscript|>"
+
+    llm = LLM(model=model_name,
+              max_model_len=448,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
+    stop_token_ids = None
     return llm, prompt, stop_token_ids
 
 
 model_example_map = {
-    "ultravox": run_ultravox,
+    "minicpmo": run_minicpmo,
     "qwen2_audio": run_qwen2_audio,
-    "minicpmo": run_minicpmo
+    "ultravox": run_ultravox,
+    "whisper": run_whisper,
 }
 
 
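For orientation (not part of this diff): each runner in audio_language.py returns an (llm, prompt, stop_token_ids) triple that the example's unchanged main() feeds into LLM.generate() together with the audio data. A minimal hedged sketch of that pattern, reusing the AudioAsset helper that also appears in this commit's new encoder/decoder example:

# Illustrative sketch only; mirrors how audio_language.py drives a runner.
# Assumes the run_whisper() defined above is importable in this context.
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset

llm, prompt, stop_token_ids = run_whisper("Transcribe this audio.",
                                          audio_count=1)
sampling_params = SamplingParams(temperature=0.2,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"audio": audio},
    },
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)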
Lines changed: 158 additions & 0 deletions (new file)
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+This example shows how to use vLLM for running offline inference with
+the explicit/implicit prompt format on enc-dec LMMs for text generation.
+"""
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+from vllm.utils import FlexibleArgumentParser
+
+
+def run_florence2():
+    # Create a Florence-2 encoder/decoder model instance
+    llm = LLM(
+        model="microsoft/Florence-2-large",
+        tokenizer="facebook/bart-large",
+        max_num_seqs=8,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # implicit prompt with task token
+            "prompt": "<DETAILED_CAPTION>",
+            "multi_modal_data": {
+                "image": ImageAsset("stop_sign").pil_image
+            },
+        },
+        {   # explicit encoder/decoder prompt
+            "encoder_prompt": {
+                "prompt": "Describe in detail what is shown in the image.",
+                "multi_modal_data": {
+                    "image": ImageAsset("cherry_blossom").pil_image
+                },
+            },
+            "decoder_prompt": "",
+        },
+    ]
+    return llm, prompts
+
+
+def run_mllama():
+    # Create a Mllama encoder/decoder model instance
+    llm = LLM(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # Implicit prompt
+            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
+            "multi_modal_data": {
+                "image": ImageAsset("stop_sign").pil_image,
+            },
+        },
+        {   # Explicit prompt
+            "encoder_prompt": {
+                "prompt": "<|image|>",
+                "multi_modal_data": {
+                    "image": ImageAsset("stop_sign").pil_image,
+                },
+            },
+            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
+        },
+    ]
+    return llm, prompts
+
+
+def run_whisper():
+    # Create a Whisper encoder/decoder model instance
+    llm = LLM(
+        model="openai/whisper-large-v3-turbo",
+        max_model_len=448,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"audio": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # Test implicit prompt
+            "prompt": "<|startoftranscript|>",
+            "multi_modal_data": {
+                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+            },
+        },
+        {   # Test explicit encoder/decoder prompt
+            "encoder_prompt": {
+                "prompt": "",
+                "multi_modal_data": {
+                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
+                },
+            },
+            "decoder_prompt": "<|startoftranscript|>",
+        }
+    ]
+    return llm, prompts
+
+
+model_example_map = {
+    "florence2": run_florence2,
+    "mllama": run_mllama,
+    "whisper": run_whisper,
+}
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
+    llm, prompts = model_example_map[model]()
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(
+        temperature=0,
+        top_p=1.0,
+        max_tokens=64,
+    )
+
+    start = time.time()
+
+    # Generate output tokens from the prompts. The output is a list of
+    # RequestOutput objects that contain the prompt, generated
+    # text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Decoder prompt: {prompt!r}, "
+              f"Generated text: {generated_text!r}")
+
+    duration = time.time() - start
+
+    print("Duration:", duration)
+    print("RPS:", len(prompts) / duration)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for text generation')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="mllama",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+
+    args = parser.parse_args()
+    main(args)
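As a usage note (an illustrative sketch, not part of the commit): the runner functions in the new consolidated example can also be driven directly instead of through main(), for instance captioning a single image with Florence-2 using the implicit task-token prompt shown above.

# Illustrative sketch only; reuses run_florence2() and the assets from the
# diff above, and supplies a custom single-image prompt to llm.generate().
from vllm import SamplingParams
from vllm.assets.image import ImageAsset

llm, _ = run_florence2()  # ignore the bundled example prompts
prompt = {
    "prompt": "<DETAILED_CAPTION>",
    "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
}
outputs = llm.generate(prompt, SamplingParams(temperature=0, max_tokens=64))
print(outputs[0].outputs[0].text)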

examples/offline_inference/florence2_inference.py

Lines changed: 0 additions & 53 deletions
This file was deleted.

examples/offline_inference/whisper.py

Lines changed: 0 additions & 61 deletions
This file was deleted.

0 commit comments
