From 879d1e2a0f706a302c751f26c15aa8b08819c371 Mon Sep 17 00:00:00 2001
From: Linkun Chen
Date: Mon, 3 Mar 2025 23:47:18 +0000
Subject: [PATCH 1/5] [V1][Molmo] Fix get_multimodal_embeddings() in molmo.py

Expected: get_multimodal_embeddings() should return list[Tensor] for
`GPUModelRunner` to iterate over.
Actual: prior to this PR, molmo's _get_mm_embeds() returns a list, so
get_multimodal_embeddings() returns a list of lists.

This is reproducible when all of the following are satisfied:
* more than one request
* the trailing part of each request is slightly different, to trigger a
  partial cache hit

This PR also updates vision_language.py to help reproduce the issue.

Tested with:
```
VLLM_USE_V1=1 \
python examples/offline_inference/vision_language.py \
  --model molmo \
  --num-prompts=2 \
  --use-different-prompt-per-request
```

Signed-off-by: Linkun Chen
---
 examples/offline_inference/vision_language.py | 299 +++++++++++-------
 vllm/model_executor/models/molmo.py           |   4 +-
 2 files changed, 190 insertions(+), 113 deletions(-)

diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 5f05389faf80..819df62ef46b 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -21,7 +21,7 @@
 
 
 # Aria
-def run_aria(question: str, modality: str):
+def run_aria(questions: list[str], modality: str):
     assert modality == "image"
 
     model_name = "rhymes-ai/Aria"
@@ -32,41 +32,50 @@ def run_aria(question: str, modality: str):
               dtype="bfloat16",
               disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
 
-    prompt = (f"<|im_start|>user\n<|img|>{question}"
-              "<|im_end|>\n<|im_start|>assistant\n")
+    prompts = [
+        (f"<|im_start|>user\n<|img|>{question}"
+         "<|im_end|>\n<|im_start|>assistant\n")
+        for question in questions
+    ]
 
     stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
-    return llm, prompt, stop_token_ids
+    return llm, prompts, stop_token_ids
 
 
 # BLIP-2
-def run_blip2(question: str, modality: str):
+def run_blip2(questions: list[str], modality: str):
     assert modality == "image"
 
     # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa - prompt = f"Question: {question} Answer:" + prompts = [ + f"Question: {question} Answer:" + for question in questions + ] llm = LLM(model="Salesforce/blip2-opt-2.7b", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # Chameleon -def run_chameleon(question: str, modality: str): +def run_chameleon(questions: list[str], modality: str): assert modality == "image" - prompt = f"{question}" + prompts = [ + f"{question}" + for question in questions + ] llm = LLM(model="facebook/chameleon-7b", max_model_len=4096, max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # Deepseek-VL2 -def run_deepseek_vl2(question: str, modality: str): +def run_deepseek_vl2(questions: list[str], modality: str): assert modality == "image" model_name = "deepseek-ai/deepseek-vl2-tiny" @@ -77,26 +86,32 @@ def run_deepseek_vl2(question: str, modality: str): disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}) - prompt = f"<|User|>: \n{question}\n\n<|Assistant|>:" + prompts = [ + f"<|User|>: \n{question}\n\n<|Assistant|>:" + for question in questions + ] stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # Fuyu -def run_fuyu(question: str, modality: str): +def run_fuyu(questions: list[str], modality: str): assert modality == "image" - prompt = f"{question}\n" + prompts = [ + f"{question}\n" + for question in questions + ] llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # GLM-4v -def run_glm4v(question: str, modality: str): +def run_glm4v(questions: list[str], modality: str): assert modality == "image" model_name = "THUDM/glm-4v-9b" @@ -108,15 +123,18 @@ def run_glm4v(question: str, modality: str): hf_overrides={"architectures": ["GLM4VForCausalLM"]}, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ + prompts = [ + f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ {question}<|assistant|>" + for question in questions + ] stop_token_ids = [151329, 151336, 151338] - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # H2OVL-Mississippi -def run_h2ovl(question: str, modality: str): +def run_h2ovl(questions: list[str], modality: str): assert modality == "image" model_name = "h2oai/h2ovl-mississippi-800m" @@ -130,19 +148,22 @@ def run_h2ovl(question: str, modality: str): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [{'role': 'user', 'content': f"\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompts = [ + tokenizer.apply_chat_template( + [{'role': 'user', 'content': f"\n{question}"}], + tokenize=False, + add_generation_prompt=True) + for question in questions + ] # Stop tokens for H2OVL-Mississippi # https://huggingface.co/h2oai/h2ovl-mississippi-800m stop_token_ids = [tokenizer.eos_token_id] - return llm, prompt, stop_token_ids + return llm, prompts, 
stop_token_ids # Idefics3-8B-Llama3 -def run_idefics3(question: str, modality: str): +def run_idefics3(questions: list[str], modality: str): assert modality == "image" model_name = "HuggingFaceM4/Idefics3-8B-Llama3" @@ -160,15 +181,18 @@ def run_idefics3(question: str, modality: str): }, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) - prompt = ( - f"<|begin_of_text|>User:{question}\nAssistant:" - ) + prompts = [ + ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + for question in questions + ] stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # InternVL -def run_internvl(question: str, modality: str): +def run_internvl(questions: list[str], modality: str): assert modality == "image" model_name = "OpenGVLab/InternVL2-2B" @@ -182,10 +206,13 @@ def run_internvl(question: str, modality: str): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [{'role': 'user', 'content': f"\n{question}"}] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) + prompts = [ + tokenizer.apply_chat_template( + [{'role': 'user', 'content': f"\n{question}"}], + tokenize=False, + add_generation_prompt=True) + for question in questions + ] # Stop tokens for InternVL # models variants may have different stop tokens @@ -193,71 +220,89 @@ def run_internvl(question: str, modality: str): # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # LLaVA-1.5 -def run_llava(question: str, modality: str): +def run_llava(questions: list[str], modality: str): assert modality == "image" - prompt = f"USER: \n{question}\nASSISTANT:" + prompts = [ + f"USER: \n{question}\nASSISTANT:" + for question in questions + ] llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # LLaVA-1.6/LLaVA-NeXT -def run_llava_next(question: str, modality: str): +def run_llava_next(questions: list[str], modality: str): assert modality == "image" - prompt = f"[INST] \n{question} [/INST]" + prompts = [ + f"[INST] \n{question} [/INST]" + for question in questions + ] llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None - return llm, prompt, stop_token_ids + return llm, prompts, stop_token_ids # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(question: str, modality: str): +def run_llava_next_video(questions: list[str], modality: str): assert modality == "video" - prompt = f"USER:
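For context on the commit message above: the bug is a shape mismatch between what `GPUModelRunner` iterates over (a flat `list[Tensor]`, one entry per multimodal item) and what molmo's get_multimodal_embeddings() was producing (a list of per-request lists). The sketch below is illustrative only and is not the molmo.py hunk from this patch; the function names and the flatten-style fix are assumptions made for the example.

```
# Illustrative sketch only -- not the molmo.py change in this patch.
# It shows the list[Tensor] vs. list[list[Tensor]] mismatch described in the
# commit message, assuming the fix flattens per-item embeddings.
import torch


def _get_mm_embeds_per_request(num_items: int) -> list[torch.Tensor]:
    # Stand-in for a per-request _get_mm_embeds(), which already returns a
    # list with one embedding tensor per multimodal item.
    return [torch.zeros(4, 8) for _ in range(num_items)]


def get_multimodal_embeddings_before() -> list[list[torch.Tensor]]:
    # Before the fix: collecting the per-request lists yields a nested list,
    # which the model runner cannot iterate item by item.
    return [_get_mm_embeds_per_request(n) for n in (2, 3)]


def get_multimodal_embeddings_after() -> list[torch.Tensor]:
    # After the fix (sketched): flatten so every element is a single tensor.
    return [
        embed
        for request_embeds in (_get_mm_embeds_per_request(n) for n in (2, 3))
        for embed in request_embeds
    ]


if __name__ == "__main__":
    nested = get_multimodal_embeddings_before()
    flat = get_multimodal_embeddings_after()
    assert isinstance(nested[0], list)        # the buggy, nested shape
    assert isinstance(flat[0], torch.Tensor)  # the flat shape the runner expects
    assert len(flat) == sum(len(item) for item in nested)
```

The nested-list path only shows up with more than one request and partially differing suffixes, which is what the --num-prompts=2 and --use-different-prompt-per-request flags in the test command above exercise.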