
Commit b469f95

Isotr0py authored and Akshat-Tripathi committed

[Doc] Consolidate whisper and florence2 examples (vllm-project#14050)

1 parent 0daae74 · commit b469f95

File tree

5 files changed: +210 -148 lines changed


examples/offline_inference/audio_language.py

Lines changed: 50 additions & 32 deletions
@@ -24,25 +24,30 @@
 # Unless specified, these settings have been tested to work on a single L4.
 
 
-# Ultravox 0.5-1B
-def run_ultravox(question: str, audio_count: int):
-    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
+# MiniCPM-O
+def run_minicpmo(question: str, audio_count: int):
+    model_name = "openbmb/MiniCPM-o-2_6"
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    llm = LLM(model=model_name,
+              trust_remote_code=True,
+              max_model_len=4096,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    audio_placeholder = "(<audio>./</audio>)" * audio_count
+    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
     messages = [{
         'role': 'user',
-        'content': "<|audio|>\n" * audio_count + question
+        'content': f'{audio_placeholder}\n{question}'
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
-                                           add_generation_prompt=True)
-
-    llm = LLM(model=model_name,
-              max_model_len=4096,
-              max_num_seqs=5,
-              trust_remote_code=True,
-              limit_mm_per_prompt={"audio": audio_count})
-    stop_token_ids = None
+                                           add_generation_prompt=True,
+                                           chat_template=audio_chat_template)
     return llm, prompt, stop_token_ids
 
 
@@ -68,36 +73,49 @@ def run_qwen2_audio(question: str, audio_count: int):
     return llm, prompt, stop_token_ids
 
 
-def run_minicpmo(question: str, audio_count: int):
-    model_name = "openbmb/MiniCPM-o-2_6"
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    llm = LLM(model=model_name,
-              trust_remote_code=True,
-              max_model_len=4096,
-              max_num_seqs=5,
-              limit_mm_per_prompt={"audio": audio_count})
-
-    stop_tokens = ['<|im_end|>', '<|endoftext|>']
-    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+# Ultravox 0.5-1B
+def run_ultravox(question: str, audio_count: int):
+    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
-    audio_placeholder = "(<audio>./</audio>)" * audio_count
-    audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     messages = [{
         'role': 'user',
-        'content': f'{audio_placeholder}\n{question}'
+        'content': "<|audio|>\n" * audio_count + question
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
-                                           add_generation_prompt=True,
-                                           chat_template=audio_chat_template)
+                                           add_generation_prompt=True)
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=5,
+              trust_remote_code=True,
+              limit_mm_per_prompt={"audio": audio_count})
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# Whisper
+def run_whisper(question: str, audio_count: int):
+    assert audio_count == 1, (
+        "Whisper only support single audio input per prompt")
+    model_name = "openai/whisper-large-v3-turbo"
+
+    prompt = "<|startoftranscript|>"
+
+    llm = LLM(model=model_name,
+              max_model_len=448,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
+    stop_token_ids = None
     return llm, prompt, stop_token_ids
 
 
 model_example_map = {
-    "ultravox": run_ultravox,
+    "minicpmo": run_minicpmo,
     "qwen2_audio": run_qwen2_audio,
-    "minicpmo": run_minicpmo
+    "ultravox": run_ultravox,
+    "whisper": run_whisper,
 }
 
 
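For orientation (not part of this diff): each runner in audio_language.py returns an (llm, prompt, stop_token_ids) triple that the example's unchanged main() feeds into LLM.generate() together with the audio data. A minimal hedged sketch of that pattern, reusing the AudioAsset helper that also appears in this commit's new encoder/decoder example:

# Illustrative sketch only; mirrors how audio_language.py drives a runner.
# Assumes the run_whisper() defined above is importable in this context.
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset

llm, prompt, stop_token_ids = run_whisper("Transcribe this audio.",
                                          audio_count=1)
sampling_params = SamplingParams(temperature=0.2,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)
audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"audio": audio},
    },
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)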
Lines changed: 158 additions & 0 deletions (new file)
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+This example shows how to use vLLM for running offline inference with
+the explicit/implicit prompt format on enc-dec LMMs for text generation.
+"""
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+from vllm.utils import FlexibleArgumentParser
+
+
+def run_florence2():
+    # Create a Florence-2 encoder/decoder model instance
+    llm = LLM(
+        model="microsoft/Florence-2-large",
+        tokenizer="facebook/bart-large",
+        max_num_seqs=8,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # implicit prompt with task token
+            "prompt": "<DETAILED_CAPTION>",
+            "multi_modal_data": {
+                "image": ImageAsset("stop_sign").pil_image
+            },
+        },
+        {   # explicit encoder/decoder prompt
+            "encoder_prompt": {
+                "prompt": "Describe in detail what is shown in the image.",
+                "multi_modal_data": {
+                    "image": ImageAsset("cherry_blossom").pil_image
+                },
+            },
+            "decoder_prompt": "",
+        },
+    ]
+    return llm, prompts
+
+
+def run_mllama():
+    # Create a Mllama encoder/decoder model instance
+    llm = LLM(
+        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # Implicit prompt
+            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
+            "multi_modal_data": {
+                "image": ImageAsset("stop_sign").pil_image,
+            },
+        },
+        {   # Explicit prompt
+            "encoder_prompt": {
+                "prompt": "<|image|>",
+                "multi_modal_data": {
+                    "image": ImageAsset("stop_sign").pil_image,
+                },
+            },
+            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
+        },
+    ]
+    return llm, prompts
+
+
+def run_whisper():
+    # Create a Whisper encoder/decoder model instance
+    llm = LLM(
+        model="openai/whisper-large-v3-turbo",
+        max_model_len=448,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"audio": 1},
+        dtype="half",
+    )
+
+    prompts = [
+        {   # Test implicit prompt
+            "prompt": "<|startoftranscript|>",
+            "multi_modal_data": {
+                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+            },
+        },
+        {   # Test explicit encoder/decoder prompt
+            "encoder_prompt": {
+                "prompt": "",
+                "multi_modal_data": {
+                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
+                },
+            },
+            "decoder_prompt": "<|startoftranscript|>",
+        }
+    ]
+    return llm, prompts
+
+
+model_example_map = {
+    "florence2": run_florence2,
+    "mllama": run_mllama,
+    "whisper": run_whisper,
+}
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
+    llm, prompts = model_example_map[model]()
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(
+        temperature=0,
+        top_p=1.0,
+        max_tokens=64,
+    )
+
+    start = time.time()
+
+    # Generate output tokens from the prompts. The output is a list of
+    # RequestOutput objects that contain the prompt, generated
+    # text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Decoder prompt: {prompt!r}, "
+              f"Generated text: {generated_text!r}")
+
+    duration = time.time() - start
+
+    print("Duration:", duration)
+    print("RPS:", len(prompts) / duration)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for text generation')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="mllama",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+
+    args = parser.parse_args()
+    main(args)
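As a usage note (an illustrative sketch, not part of the commit): the runner functions in the new consolidated example can also be driven directly instead of through main(), for instance captioning a single image with Florence-2 using the implicit task-token prompt shown above.

# Illustrative sketch only; reuses run_florence2() and the assets from the
# diff above, and supplies a custom single-image prompt to llm.generate().
from vllm import SamplingParams
from vllm.assets.image import ImageAsset

llm, _ = run_florence2()  # ignore the bundled example prompts
prompt = {
    "prompt": "<DETAILED_CAPTION>",
    "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
}
outputs = llm.generate(prompt, SamplingParams(temperature=0, max_tokens=64))
print(outputs[0].outputs[0].text)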

examples/offline_inference/florence2_inference.py

Lines changed: 0 additions & 53 deletions
This file was deleted.

examples/offline_inference/whisper.py

Lines changed: 0 additions & 61 deletions
This file was deleted.

0 commit comments
