From 7217a27e8ad2347d613d0e8764cb6bd95082dcdb Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 00:16:50 +0000
Subject: [PATCH 01/11] add image support

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 1054b969cd3b..365b8db7b328 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -662,6 +662,11 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
+    def parse_direct_image(self, image: object) -> None:
+        # Directly add the PIL image without URL processing
+        placeholder = self._tracker.add("image", image)
+        self._add_placeholder(placeholder)
+
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         if isinstance(image_embeds, dict):
@@ -869,6 +874,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
+    "image": lambda part: part.get("image", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -938,7 +944,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds",
+                                       "image_embeds", "image",
                                        "audio_url", "input_audio", "video_url")
 
@@ -1017,6 +1023,10 @@ def _parse_chat_message_content_part(
         content = cast(Union[str, dict[str, str]], content)
         mm_parser.parse_image_embeds(content)
         return {'type': 'image'} if wrap_dicts else None
+    if part_type == "image":
+        image_content = cast(object, content)  # PIL image or similar
+        mm_parser.parse_direct_image(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "audio_url":
         str_content = cast(str, content)
         mm_parser.parse_audio(str_content)
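For context, a minimal sketch of what this first patch enables at the user level: passing an already-loaded PIL image straight to `LLM.chat` with the new `"image"` part type, skipping any URL or base64 round-trip. The model name and image path below are illustrative placeholders, not part of the patch, and the `"image"` key shown here is renamed to `"image_pil"` later in the series (see PATCH 09):

```python
# Sketch only: the "image" part key matches PATCH 01; later patches rename it
# to "image_pil". Model name and image path are illustrative placeholders.
from PIL import Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
pil_image = Image.open("example.jpg")  # any in-memory PIL image works

conversation = [{
    "role": "user",
    "content": [
        {"type": "image", "image": pil_image},  # new: raw PIL image part
        {"type": "text", "text": "What's in the image?"},
    ],
}]

outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```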
From b1bbb46c4dab4b074fe1ff575c0ee7e57fcb8cd1 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 01:25:46 +0000
Subject: [PATCH 02/11] Support async

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 tests/entrypoints/test_chat_utils.py | 24 ++++++++----------------
 vllm/entrypoints/chat_utils.py       | 12 +++++++++++-
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 49294664275a..8700911563e8 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -136,10 +136,8 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": [{
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image_url": image_url
         }, {
             "type": "text",
             "text": "What's in the image?"
@@ -228,10 +226,8 @@ async def test_parse_chat_messages_single_image_async(
         "role": "user",
         "content": [{
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in the image?"
@@ -264,10 +260,8 @@ def test_parse_chat_messages_multiple_images(
                 "url": image_url
             }
         }, {
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
@@ -303,10 +297,8 @@ async def test_parse_chat_messages_multiple_images_async(
                 "url": image_url
             }
         }, {
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 365b8db7b328..3d702ca1f825 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -626,6 +626,10 @@ def mm_placeholder_counts(self) -> dict[str, int]:
     def parse_image(self, image_url: str) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_direct_image(self, image: object) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
@@ -663,7 +667,6 @@ def parse_image(self, image_url: str) -> None:
         self._add_placeholder(placeholder)
 
     def parse_direct_image(self, image: object) -> None:
-        # Directly add the PIL image without URL processing
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
@@ -719,6 +722,13 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)
 
+    def parse_direct_image(self, image: object) -> None:
+        future: asyncio.Future[object] = asyncio.Future()
+        future.set_result(image)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
+
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
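The async parser has nothing to fetch for a raw PIL image, so `parse_direct_image` wraps the already-available object in a pre-resolved `asyncio.Future`: the placeholder tracker awaits every media item uniformly, and an item resolved via `set_result` before the first `await` completes immediately. A self-contained sketch of that pattern:

```python
import asyncio


async def main() -> None:
    # Create a Future and resolve it up front, mirroring parse_direct_image.
    future: asyncio.Future[str] = asyncio.Future()
    future.set_result("already-loaded image")

    # Awaiting an already-completed Future returns instantly; no I/O happens.
    print(await future)


asyncio.run(main())
```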
From e781aed9270a28e0b5d7eb8099b65377a4142089 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 02:58:21 +0000
Subject: [PATCH 03/11] Fix test

Signed-off-by: Flora Feng <4florafeng@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 tests/entrypoints/test_chat_utils.py | 13 ++++++++----
 vllm/entrypoints/chat_utils.py       | 30 ++++++++++++++--------------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 8700911563e8..fb903477e5bf 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -136,8 +136,10 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": [{
-            "type": "image",
-            "image_url": image_url
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
         }, {
             "type": "text",
             "text": "What's in the image?"
@@ -152,6 +154,7 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": "<|image_1|>\nWhat's in the image?"
     }]
+    print('mm data', mm_data)
 
     _assert_mm_data_is_image_input(mm_data, 1)
 
@@ -226,8 +229,10 @@ async def test_parse_chat_messages_single_image_async(
         "role": "user",
         "content": [{
-            "type": "image",
-            "image": ImageAsset('cherry_blossom').pil_image
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
         }, {
             "type": "text",
             "text": "What's in the image?"
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 3d702ca1f825..a76450258945 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -627,12 +627,12 @@ def parse_image(self, image_url: str) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_direct_image(self, image: object) -> None:
+    def parse_image_embeds(self,
+                           image_embeds: Union[str, dict[str, str]]) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
+    def parse_image_object(self, image: object) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -666,10 +666,6 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
-    def parse_direct_image(self, image: object) -> None:
-        placeholder = self._tracker.add("image", image)
-        self._add_placeholder(placeholder)
-
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         if isinstance(image_embeds, dict):
@@ -685,6 +681,10 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
+    def parse_image_object(self, image: object) -> None:
+        placeholder = self._tracker.add("image", image)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio = self._connector.fetch_audio(audio_url)
 
@@ -722,13 +722,6 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)
 
-    def parse_direct_image(self, image: object) -> None:
-        future: asyncio.Future[object] = asyncio.Future()
-        future.set_result(image)
-
-        placeholder = self._tracker.add("image", future)
-        self._add_placeholder(placeholder)
-
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
@@ -748,6 +741,13 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
+    def parse_image_object(self, image: object) -> None:
+        future: asyncio.Future[object] = asyncio.Future()
+        future.set_result(image)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
 
@@ -1035,7 +1035,7 @@ def _parse_chat_message_content_part(
         return {'type': 'image'} if wrap_dicts else None
     if part_type == "image":
         image_content = cast(object, content)  # PIL image or similar
-        mm_parser.parse_direct_image(image_content)
+        mm_parser.parse_image_object(image_content)
         return {'type': 'image'} if wrap_dicts else None
     if part_type == "audio_url":
         str_content = cast(str, content)
From b53ed8828d4f0e69770691977876b0aa24e5947d Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 03:00:24 +0000
Subject: [PATCH 04/11] Format

Signed-off-by: Flora Feng <4florafeng@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 tests/entrypoints/test_chat_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index fb903477e5bf..5003ccab0fe8 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -154,7 +154,6 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": "<|image_1|>\nWhat's in the image?"
     }]
-    print('mm data', mm_data)
 
     _assert_mm_data_is_image_input(mm_data, 1)
 

From 93dd1bb0d888456e45e85b83cb91f209b4ccc19d Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 03:47:30 +0000
Subject: [PATCH 05/11] test mistral

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 examples/offline_inference/mistral-small.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index 330103d5818a..f50e485feb67 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,6 +6,7 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
+from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -55,7 +56,7 @@
 
 
 def run_simple_demo(args: argparse.Namespace):
-    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    model_name = "mistral-community/pixtral-12b"
     sampling_params = SamplingParams(max_tokens=8192)
 
     llm = LLM(
@@ -66,7 +67,7 @@ def run_simple_demo(args: argparse.Namespace):
         limit_mm_per_prompt={"image": 1},
         max_model_len=4096,
         max_num_seqs=2,
-        tensor_parallel_size=2,
+        tensor_parallel_size=1,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
@@ -78,7 +79,7 @@ def run_simple_demo(args: argparse.Namespace):
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "image", "image": ImageAsset('cherry_blossom').pil_image},
             ],
         },
     ]
 
@@ -89,7 +90,7 @@ def run_simple_demo(args: argparse.Namespace):
 
 
 def run_advanced_demo(args: argparse.Namespace):
-    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    model_name = "mistral-community/pixtral-12b"
     max_img_per_msg = 3
     max_tokens_per_img = 4096
 
From eb7b1eefabb28d9230be08cdc1b38b25f7f19ce6 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 06:25:46 +0000
Subject: [PATCH 06/11] Fix precommit and update type

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 44 +++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a76450258945..e582f0a77768 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -8,6 +8,7 @@
 from collections.abc import Awaitable, Iterable
 from functools import cached_property, lru_cache, partial
 from pathlib import Path
+from PIL import Image
 from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
                     cast)
 
@@ -28,7 +29,7 @@
     ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
-from pydantic import TypeAdapter
+from pydantic import TypeAdapter, ConfigDict, BaseModel
 # yapf: enable
 from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                           ProcessorMixin)
@@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
     """The type of the content part."""
 
 
+class PILImage(BaseModel):
+    """
+    A PIL.Image.Image object.
+    """ 
+    image: Image.Image
+    model_config = ConfigDict(arbitrary_types_allowed=True) 
+
+
+class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
+    """A simpler version of the param that only accepts a PIL image.
+
+    Example:
+    {
+        "image": ImageAsset('cherry_blossom').pil_image
+    }
+    """
+    image: Required[PILImage]
+
+
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain image_url.
     This is supported by OpenAI API, although it is not documented.
@@ -129,6 +149,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
+    CustomChatCompletionContentPILImageParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
     CustomChatCompletionContentSimpleAudioParam,
@@ -632,7 +653,7 @@ def parse_image_embeds(self,
         raise NotImplementedError
 
     @abstractmethod
-    def parse_image_object(self, image: object) -> None:
+    def parse_pil_image(self, image: Image.Image) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -681,7 +702,7 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
-    def parse_image_object(self, image: object) -> None:
+    def parse_pil_image(self, image: Image.Image) -> None:
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
@@ -741,8 +762,8 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
-    def parse_image_object(self, image: object) -> None:
-        future: asyncio.Future[object] = asyncio.Future()
+    def parse_pil_image(self, image: Image.Image) -> None:
+        future: asyncio.Future[Image.Image] = asyncio.Future()
         future.set_result(image)
 
         placeholder = self._tracker.add("image", future)
@@ -866,12 +887,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
+_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
 # Need to validate url objects
 _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
-_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
+_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: dict[
@@ -884,7 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
-    "image": lambda part: part.get("image", None),
+    "image": lambda part: _PILImageParser(part).get("image", None), 
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -1025,6 +1047,10 @@ def _parse_chat_message_content_part(
     else:
         return str_content
 
+    if part_type == "image":
+        image_content = cast(Image.Image, content)
+        mm_parser.parse_pil_image(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
@@ -1033,10 +1059,6 @@ def _parse_chat_message_content_part(
         content = cast(Union[str, dict[str, str]], content)
         mm_parser.parse_image_embeds(content)
         return {'type': 'image'} if wrap_dicts else None
-    if part_type == "image":
-        image_content = cast(object, content)  # PIL image or similar
-        mm_parser.parse_image_object(image_content)
-        return {'type': 'image'} if wrap_dicts else None
     if part_type == "audio_url":
         str_content = cast(str, content)
         mm_parser.parse_audio(str_content)
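The new `PILImage` model above relies on pydantic's `arbitrary_types_allowed` escape hatch: pydantic has no built-in validator for `PIL.Image.Image`, so without that config it rejects the field at class-definition time; with it, validation falls back to a plain `isinstance` check. A minimal standalone sketch of the same pattern:

```python
from PIL import Image
from pydantic import BaseModel, ConfigDict


class PILImage(BaseModel):
    """Carries a PIL image through pydantic without a custom validator."""
    image: Image.Image
    model_config = ConfigDict(arbitrary_types_allowed=True)


# Accepted because the value is an instance of PIL.Image.Image.
wrapped = PILImage(image=Image.new("RGB", (8, 8)))
print(type(wrapped.image))  # <class 'PIL.Image.Image'>
```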
From 036981bfed702e8607555d2371f5751cdd53291a Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 06:36:39 +0000
Subject: [PATCH 07/11] Revert "test"

This reverts commit 4faa6cce809a1375b0d380ac67eee76d6e521017.

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 examples/offline_inference/mistral-small.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index f50e485feb67..330103d5818a 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,7 +6,6 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
-from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -56,7 +55,7 @@
 
 
 def run_simple_demo(args: argparse.Namespace):
-    model_name = "mistral-community/pixtral-12b"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
     sampling_params = SamplingParams(max_tokens=8192)
 
     llm = LLM(
@@ -67,7 +66,7 @@ def run_simple_demo(args: argparse.Namespace):
         limit_mm_per_prompt={"image": 1},
         max_model_len=4096,
         max_num_seqs=2,
-        tensor_parallel_size=1,
+        tensor_parallel_size=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
@@ -79,7 +78,7 @@ def run_simple_demo(args: argparse.Namespace):
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image", "image": ImageAsset('cherry_blossom').pil_image},
+                {"type": "image_url", "image_url": {"url": image_url}},
             ],
         },
     ]
 
@@ -90,7 +89,7 @@ def run_simple_demo(args: argparse.Namespace):
 
 
 def run_advanced_demo(args: argparse.Namespace):
-    model_name = "mistral-community/pixtral-12b"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
     max_img_per_msg = 3
     max_tokens_per_img = 4096
 
From 6a34e59c6d7c8964f3593e5c8b8f361eb799595b Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 07:22:02 +0000
Subject: [PATCH 08/11] Format

Signed-off-by: Flora Feng <4florafeng@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index e582f0a77768..a7c6361882ea 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -8,7 +8,6 @@
 from collections.abc import Awaitable, Iterable
 from functools import cached_property, lru_cache, partial
 from pathlib import Path
-from PIL import Image
 from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
                     cast)
 
@@ -29,7 +28,8 @@
     ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
-from pydantic import TypeAdapter, ConfigDict, BaseModel
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
 from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                           ProcessorMixin)
@@ -95,9 +95,9 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
 class PILImage(BaseModel):
     """
     A PIL.Image.Image object.
-    """ 
+    """
     image: Image.Image
-    model_config = ConfigDict(arbitrary_types_allowed=True) 
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
 
 class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
@@ -702,9 +702,9 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None: 
-        placeholder = self._tracker.add("image", image) 
-        self._add_placeholder(placeholder) 
+    def parse_pil_image(self, image: Image.Image) -> None:
+        placeholder = self._tracker.add("image", image)
+        self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
         audio = self._connector.fetch_audio(audio_url)
@@ -762,12 +762,12 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None: 
-        future: asyncio.Future[Image.Image] = asyncio.Future() 
-        future.set_result(image) 
- 
-        placeholder = self._tracker.add("image", future) 
-        self._add_placeholder(placeholder) 
+    def parse_pil_image(self, image: Image.Image) -> None:
+        future: asyncio.Future[Image.Image] = asyncio.Future()
+        future.set_result(image)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
@@ -906,7 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
-    "image": lambda part: _PILImageParser(part).get("image", None), 
+    "image": lambda part: _PILImageParser(part).get("image", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -1048,9 +1048,9 @@ def _parse_chat_message_content_part(
         return str_content
 
     if part_type == "image":
-        image_content = cast(Image.Image, content) 
-        mm_parser.parse_pil_image(image_content) 
-        return {'type': 'image'} if wrap_dicts else None 
+        image_content = cast(Image.Image, content)
+        mm_parser.parse_pil_image(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
From 18b99e2a9883cc89d01a24efa3f068d9b49224e8 Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Fri, 20 Jun 2025 05:59:25 +0000
Subject: [PATCH 09/11] Address feedback

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 examples/offline_inference/mistral-small.py |  7 +++++--
 tests/entrypoints/test_chat_utils.py        |  8 ++++----
 vllm/entrypoints/chat_utils.py              | 22 +++++++++++-----------
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index 330103d5818a..a38fc9216d40 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,6 +6,7 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
+from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
     )
 
     prompt = "Describe this image in one sentence."
-    image_url = "https://picsum.photos/id/237/200/300"
 
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
             ],
         },
     ]
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 5003ccab0fe8..e41ea686e992 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -264,8 +264,8 @@ def test_parse_chat_messages_multiple_images(
                 "url": image_url
             }
         }, {
-            "type": "image",
-            "image": ImageAsset('cherry_blossom').pil_image
+            "type": "image_pil",
+            "image_pil": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
@@ -301,8 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
                 "url": image_url
             }
         }, {
-            "type": "image",
-            "image": ImageAsset('cherry_blossom').pil_image
+            "type": "image_pil",
+            "image_pil": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a7c6361882ea..aac827f97362 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -105,10 +105,10 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
 
     Example:
     {
-        "image": ImageAsset('cherry_blossom').pil_image
+        "image_pil": ImageAsset('cherry_blossom').pil_image
     }
     """
-    image: Required[PILImage]
+    image_pil: Required[PILImage]
 
 
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
@@ -653,7 +653,7 @@ def parse_image_embeds(self,
         raise NotImplementedError
 
     @abstractmethod
-    def parse_pil_image(self, image: Image.Image) -> None:
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -702,8 +702,8 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None:
-        placeholder = self._tracker.add("image", image)
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        placeholder = self._tracker.add("image", image_pil)
         self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
@@ -762,9 +762,9 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None:
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
         future: asyncio.Future[Image.Image] = asyncio.Future()
-        future.set_result(image)
+        future.set_result(image_pil)
 
         placeholder = self._tracker.add("image", future)
         self._add_placeholder(placeholder)
@@ -906,7 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
-    "image": lambda part: _PILImageParser(part).get("image", None),
+    "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -976,7 +976,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds", "image",
+                                       "image_embeds", "image_pil",
                                        "audio_url", "input_audio", "video_url")
 
@@ -1047,9 +1047,9 @@ def _parse_chat_message_content_part(
     else:
         return str_content
 
-    if part_type == "image":
+    if part_type == "image_pil":
         image_content = cast(Image.Image, content)
-        mm_parser.parse_pil_image(image_content)
+        mm_parser.parse_image_pil(image_content)
         return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
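Both this patch and PATCH 06 touch `MM_PARSER_MAP`, which is why a new modality costs only one table entry: `_parse_chat_message_content_mm_part` looks up the part's `"type"` key and calls the matching extractor. A stripped-down sketch of that dispatch pattern (the table and payload below are illustrative, not vLLM's actual objects):

```python
from typing import Any, Callable

# One extractor per part type; each pulls its payload out of the raw dict.
PARSER_MAP: dict[str, Callable[[dict[str, Any]], Any]] = {
    "image_url": lambda part: part.get("image_url", {}).get("url"),
    "image_pil": lambda part: part.get("image_pil"),
}

part = {"type": "image_pil", "image_pil": "<PIL.Image.Image object>"}
content = PARSER_MAP[part["type"]](part)  # dispatch on the "type" key
print(content)
```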
From 870ef7901ca4065e38c92eb06b97d8c170f3e21d Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Tue, 1 Jul 2025 06:35:35 +0000
Subject: [PATCH 10/11] Update doc

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 docs/features/multimodal_inputs.md | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index c45d620dc8e0..ed11d2836037 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -101,6 +101,50 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
 
 Full example:
 
+If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
+
+```python
+import torch
+from vllm import LLM
+from vllm.assets.image import ImageAsset
+
+llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+image_url = "https://picsum.photos/id/32/512/512"
+image_pil = ImageAsset('cherry_blossom').pil_image
+image_embeds = torch.load(...)
+
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant"},
+    {"role": "user", "content": "Hello"},
+    {"role": "assistant", "content": "Hello! How can I assist you today?"},
+    {
+        "role": "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_pil",
+            "image_pil": image_pil
+        }, {
+            "type": "image_embeds",
+            "image_embeds": image_embeds
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }],
+    },
+]
+
+# Perform inference and log output.
+outputs = llm.chat(conversation)
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
 ??? Code
From a178d9ae6d6177fe18c291e150401723c043678a Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Wed, 2 Jul 2025 02:23:57 +0000
Subject: [PATCH 11/11] Update field name

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index aac827f97362..4b6c50526b10 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -96,7 +96,7 @@ class PILImage(BaseModel):
     """
     A PIL.Image.Image object.
     """
-    image: Image.Image
+    image_pil: Image.Image
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
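To close the loop, a hypothetical pytest-style check of the final `"image_pil"` part type, mirroring the existing tests in `tests/entrypoints/test_chat_utils.py`. The `parse_chat_messages` signature, the `phi3v_*` fixtures, and the `_assert_mm_data_is_image_input` helper are assumed from that module and are not shown in this series:

```python
from vllm.assets.image import ImageAsset
from vllm.entrypoints.chat_utils import parse_chat_messages


def test_parse_chat_messages_single_image_pil(phi3v_model_config,
                                              phi3v_tokenizer):
    # Assumed call signature, following the surrounding test module.
    conversation, mm_data = parse_chat_messages(
        [{
            "role": "user",
            "content": [{
                "type": "image_pil",
                "image_pil": ImageAsset("cherry_blossom").pil_image
            }, {
                "type": "text",
                "text": "What's in the image?"
            }]
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    # The PIL image should surface as exactly one image placeholder.
    assert conversation == [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(mm_data, 1)
```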