
Commit 4062a17

Isotr0py authored and epwalsh committed
[VLM] Support HF format Phi-4-MM model (vllm-project#17121)
Signed-off-by: Isotr0py <2037008807@qq.com>
1 parent e80579d commit 4062a17

File tree: 10 files changed, +1847 −5 lines


docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions

@@ -614,6 +614,7 @@ Specified using `--task generate`.
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
 | `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
 | `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |

examples/offline_inference/audio_language.py

Lines changed: 32 additions & 0 deletions

@@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     )


+def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
+    """
+    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+    show how to process audio inputs.
+    """
+    model_path = snapshot_download(
+        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+    )
+    # Since the vision-lora and speech-lora co-exist with the base model,
+    # we have to manually specify the path of the lora weights.
+    speech_lora_path = os.path.join(model_path, "speech-lora")
+    placeholders = "<|audio|>" * audio_count
+
+    prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
+
+    engine_args = EngineArgs(
+        model=model_path,
+        max_model_len=12800,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_lora_rank=320,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompts,
+        lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+    )
+
+
 # Qwen2-Audio
 def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
     model_name = "Qwen/Qwen2-Audio-7B-Instruct"

@@ -303,6 +334,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
     "granite_speech": run_granite_speech,
     "minicpmo": run_minicpmo,
     "phi4_mm": run_phi4mm,
+    "phi4_multimodal": run_phi4_multimodal,
     "qwen2_audio": run_qwen2_audio,
     "qwen2_5_omni": run_qwen2_5_omni,
     "ultravox": run_ultravox,
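As a usage note, the new `phi4_multimodal` entry plugs into the script's existing main loop. A standalone sketch following the same pattern is shown below; the use of librosa, the audio file path, and the sampling settings are assumptions for illustration rather than part of the commit.

from dataclasses import asdict

import librosa
from vllm import LLM, SamplingParams

req = run_phi4_multimodal("What is said in this clip?", audio_count=1)
llm = LLM(**asdict(req.engine_args))
# librosa.load returns a (waveform, sample_rate) tuple, which is the shape
# vLLM expects for audio inputs; the file path here is a placeholder.
audio = librosa.load("sample.wav", sr=None)
outputs = llm.generate(
    {"prompt": req.prompt, "multi_modal_data": {"audio": [audio]}},
    SamplingParams(max_tokens=64),
    lora_request=req.lora_requests[0],
)
print(outputs[0].outputs[0].text)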

examples/offline_inference/vision_language.py

Lines changed: 36 additions & 0 deletions

@@ -1097,6 +1097,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
     )


+# HF format Phi-4-multimodal-instruct
+def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
+    """
+    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+    show how to process image inputs.
+    """
+    assert modality == "image"
+    model_path = snapshot_download(
+        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+    )
+    # Since the vision-lora and speech-lora co-exist with the base model,
+    # we have to manually specify the path of the lora weights.
+    vision_lora_path = os.path.join(model_path, "vision-lora")
+    prompts = [
+        f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
+    ]
+    engine_args = EngineArgs(
+        model=model_path,
+        max_model_len=5120,
+        max_num_seqs=2,
+        max_num_batched_tokens=12800,
+        enable_lora=True,
+        max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 16},
+        limit_mm_per_prompt={"image": 1},
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
+    )
+
+
 # Pixtral HF-format
 def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"

@@ -1356,6 +1391,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     "paligemma2": run_paligemma2,
     "phi3_v": run_phi3v,
     "phi4_mm": run_phi4mm,
+    "phi4_multimodal": run_phi4_multimodal,
     "pixtral_hf": run_pixtral_hf,
     "qwen_vl": run_qwen_vl,
     "qwen2_vl": run_qwen2_vl,
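The inline note in the hunk above points out that `mm_processor_kwargs` can also be supplied per request rather than only via `EngineArgs`. A sketch of that usage follows; the per-request `dynamic_hd` value, image path, and sampling settings are illustrative assumptions, not taken from the commit.

from dataclasses import asdict

from PIL import Image
from vllm import LLM, SamplingParams

req = run_phi4_multimodal(["What is in this image?"], modality="image")
llm = LLM(**asdict(req.engine_args))
outputs = llm.generate(
    {
        "prompt": req.prompts[0],
        "multi_modal_data": {"image": Image.open("example.jpg")},
        # Per-request override of the processor setting configured above.
        "mm_processor_kwargs": {"dynamic_hd": 8},
    },
    SamplingParams(max_tokens=64),
    lora_request=req.lora_requests[0],
)
print(outputs[0].outputs[0].text)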

examples/offline_inference/vision_language_multi_image.py

Lines changed: 35 additions & 0 deletions

@@ -760,6 +760,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
+    """
+    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
+    show how to process multi images inputs.
+    """
+
+    model_path = snapshot_download(
+        "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+    )
+    # Since the vision-lora and speech-lora co-exist with the base model,
+    # we have to manually specify the path of the lora weights.
+    vision_lora_path = os.path.join(model_path, "vision-lora")
+    engine_args = EngineArgs(
+        model=model_path,
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        enable_lora=True,
+        max_lora_rank=320,
+        # Note - mm_processor_kwargs can also be passed to generate/chat calls
+        mm_processor_kwargs={"dynamic_hd": 4},
+    )
+
+    placeholders = "<|image|>" * len(image_urls)
+    prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
+    )
+
+
 def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     engine_args = EngineArgs(

@@ -988,6 +1022,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
     "ovis": load_ovis,
     "phi3_v": load_phi3v,
     "phi4_mm": load_phi4mm,
+    "phi4_multimodal": load_phi4_multimodal,
     "pixtral_hf": load_pixtral_hf,
     "qwen_vl_chat": load_qwen_vl_chat,
     "qwen2_vl": load_qwen2_vl,
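To show how the multi-image loader composes, here is a sketch that drives `load_phi4_multimodal` end to end, following the offline pattern used elsewhere in these examples; the image URLs and sampling settings are placeholders for illustration.

from dataclasses import asdict

from vllm import LLM, SamplingParams

# Placeholder URLs; any publicly reachable images work.
image_urls = [
    "https://example.com/cat.jpg",
    "https://example.com/dog.jpg",
]
req = load_phi4_multimodal("What do these two images have in common?", image_urls)
llm = LLM(**asdict(req.engine_args))
outputs = llm.generate(
    # image_data is already a list of PIL images fetched by the loader above.
    {"prompt": req.prompt, "multi_modal_data": {"image": req.image_data}},
    SamplingParams(max_tokens=64),
    lora_request=req.lora_requests[0],
)
print(outputs[0].outputs[0].text)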
