From 7217a27e8ad2347d613d0e8764cb6bd95082dcdb Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 00:16:50 +0000
Subject: [PATCH 01/11] add image support

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 1054b969cd3b..365b8db7b328 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -662,6 +662,11 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
+    def parse_direct_image(self, image: object) -> None:
+        # Directly add the PIL image without URL processing
+        placeholder = self._tracker.add("image", image)
+        self._add_placeholder(placeholder)
+
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         if isinstance(image_embeds, dict):
@@ -869,6 +874,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
+    "image": lambda part: part.get("image", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -938,7 +944,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds",
+                                       "image_embeds", "image",
                                        "audio_url", "input_audio", "video_url")
 
@@ -1017,6 +1023,10 @@ def _parse_chat_message_content_part(
         content = cast(Union[str, dict[str, str]], content)
         mm_parser.parse_image_embeds(content)
         return {'type': 'image'} if wrap_dicts else None
+    if part_type == "image":
+        image_content = cast(object, content)  # PIL image or similar
+        mm_parser.parse_direct_image(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "audio_url":
         str_content = cast(str, content)
         mm_parser.parse_audio(str_content)
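For context, a minimal sketch of what this first patch enables at the user level: passing an already-loaded PIL image straight to `LLM.chat` with the new `"image"` part type, skipping any URL or base64 round-trip. The model name and image path below are illustrative placeholders, not part of the patch, and the `"image"` key shown here is renamed to `"image_pil"` later in the series (see PATCH 09):

```python
# Sketch only: the "image" part key matches PATCH 01; later patches rename it
# to "image_pil". Model name and image path are illustrative placeholders.
from PIL import Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
pil_image = Image.open("example.jpg")  # any in-memory PIL image works

conversation = [{
    "role": "user",
    "content": [
        {"type": "image", "image": pil_image},  # new: raw PIL image part
        {"type": "text", "text": "What's in the image?"},
    ],
}]

outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```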
From b1bbb46c4dab4b074fe1ff575c0ee7e57fcb8cd1 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 01:25:46 +0000
Subject: [PATCH 02/11] Support async

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 tests/entrypoints/test_chat_utils.py | 24 ++++++++----------------
 vllm/entrypoints/chat_utils.py       | 12 +++++++++++-
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 49294664275a..8700911563e8 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -136,10 +136,8 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": [{
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image_url": image_url
         }, {
             "type": "text",
             "text": "What's in the image?"
@@ -228,10 +226,8 @@ async def test_parse_chat_messages_single_image_async(
         "role": "user",
         "content": [{
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in the image?"
@@ -264,10 +260,8 @@ def test_parse_chat_messages_multiple_images(
                 "url": image_url
             }
         }, {
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
@@ -303,10 +297,8 @@ async def test_parse_chat_messages_multiple_images_async(
                 "url": image_url
             }
         }, {
-            "type": "image_url",
-            "image_url": {
-                "url": image_url
-            }
+            "type": "image",
+            "image": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 365b8db7b328..3d702ca1f825 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -626,6 +626,10 @@ def mm_placeholder_counts(self) -> dict[str, int]:
     def parse_image(self, image_url: str) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_direct_image(self, image: object) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
@@ -663,7 +667,6 @@ def parse_image(self, image_url: str) -> None:
         self._add_placeholder(placeholder)
 
     def parse_direct_image(self, image: object) -> None:
-        # Directly add the PIL image without URL processing
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
@@ -719,6 +722,13 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)
 
+    def parse_direct_image(self, image: object) -> None:
+        future: asyncio.Future[object] = asyncio.Future()
+        future.set_result(image)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
+
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
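The async parser has nothing to fetch for a raw PIL image, so `parse_direct_image` wraps the already-available object in a pre-resolved `asyncio.Future`: the placeholder tracker awaits every media item uniformly, and an item resolved via `set_result` before the first `await` completes immediately. A self-contained sketch of that pattern:

```python
import asyncio


async def main() -> None:
    # Create a Future and resolve it up front, mirroring parse_direct_image.
    future: asyncio.Future[str] = asyncio.Future()
    future.set_result("already-loaded image")

    # Awaiting an already-completed Future returns instantly; no I/O happens.
    print(await future)


asyncio.run(main())
```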
From e781aed9270a28e0b5d7eb8099b65377a4142089 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 02:58:21 +0000
Subject: [PATCH 03/11] Fix test

Signed-off-by: Flora Feng <4florafeng@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 tests/entrypoints/test_chat_utils.py | 13 ++++++++----
 vllm/entrypoints/chat_utils.py       | 30 ++++++++++++++--------------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 8700911563e8..fb903477e5bf 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -136,8 +136,10 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": [{
-            "type": "image",
-            "image_url": image_url
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
         }, {
             "type": "text",
             "text": "What's in the image?"
@@ -152,6 +154,7 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": "<|image_1|>\nWhat's in the image?"
     }]
+    print('mm data', mm_data)
 
     _assert_mm_data_is_image_input(mm_data, 1)
 
@@ -226,8 +229,10 @@ async def test_parse_chat_messages_single_image_async(
         "role": "user",
         "content": [{
-            "type": "image",
-            "image": ImageAsset('cherry_blossom').pil_image
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
         }, {
             "type": "text",
             "text": "What's in the image?"
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 3d702ca1f825..a76450258945 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -627,12 +627,12 @@ def parse_image(self, image_url: str) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_direct_image(self, image: object) -> None:
+    def parse_image_embeds(self,
+                           image_embeds: Union[str, dict[str, str]]) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_image_embeds(self,
-                           image_embeds: Union[str, dict[str, str]]) -> None:
+    def parse_image_object(self, image: object) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -666,10 +666,6 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
-    def parse_direct_image(self, image: object) -> None:
-        placeholder = self._tracker.add("image", image)
-        self._add_placeholder(placeholder)
-
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         if isinstance(image_embeds, dict):
@@ -685,6 +681,10 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
+    def parse_image_object(self, image: object) -> None:
+        placeholder = self._tracker.add("image", image)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio = self._connector.fetch_audio(audio_url)
 
@@ -722,13 +722,6 @@ def parse_image(self, image_url: str) -> None:
         placeholder = self._tracker.add("image", image_coro)
         self._add_placeholder(placeholder)
 
-    def parse_direct_image(self, image: object) -> None:
-        future: asyncio.Future[object] = asyncio.Future()
-        future.set_result(image)
-
-        placeholder = self._tracker.add("image", future)
-        self._add_placeholder(placeholder)
-
     def parse_image_embeds(self,
                            image_embeds: Union[str, dict[str, str]]) -> None:
         future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
@@ -748,6 +741,13 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
+    def parse_image_object(self, image: object) -> None:
+        future: asyncio.Future[object] = asyncio.Future()
+        future.set_result(image)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
 
@@ -1035,7 +1035,7 @@ def _parse_chat_message_content_part(
         return {'type': 'image'} if wrap_dicts else None
     if part_type == "image":
         image_content = cast(object, content)  # PIL image or similar
-        mm_parser.parse_direct_image(image_content)
+        mm_parser.parse_image_object(image_content)
         return {'type': 'image'} if wrap_dicts else None
     if part_type == "audio_url":
         str_content = cast(str, content)
From b53ed8828d4f0e69770691977876b0aa24e5947d Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 03:00:24 +0000
Subject: [PATCH 04/11] Format

Signed-off-by: Flora Feng <4florafeng@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 tests/entrypoints/test_chat_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index fb903477e5bf..5003ccab0fe8 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -154,7 +154,6 @@ def test_parse_chat_messages_single_image(
         "role": "user",
         "content": "<|image_1|>\nWhat's in the image?"
     }]
-    print('mm data', mm_data)
 
     _assert_mm_data_is_image_input(mm_data, 1)
 

From 93dd1bb0d888456e45e85b83cb91f209b4ccc19d Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 03:47:30 +0000
Subject: [PATCH 05/11] test mistral

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 examples/offline_inference/mistral-small.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index 330103d5818a..f50e485feb67 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,6 +6,7 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
+from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -55,7 +56,7 @@
 
 
 def run_simple_demo(args: argparse.Namespace):
-    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    model_name = "mistral-community/pixtral-12b"
     sampling_params = SamplingParams(max_tokens=8192)
 
     llm = LLM(
@@ -66,7 +67,7 @@ def run_simple_demo(args: argparse.Namespace):
         limit_mm_per_prompt={"image": 1},
         max_model_len=4096,
         max_num_seqs=2,
-        tensor_parallel_size=2,
+        tensor_parallel_size=1,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
@@ -78,7 +79,7 @@ def run_simple_demo(args: argparse.Namespace):
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {"type": "image", "image": ImageAsset('cherry_blossom').pil_image},
             ],
         },
     ]
 
@@ -89,7 +90,7 @@ def run_simple_demo(args: argparse.Namespace):
 
 
 def run_advanced_demo(args: argparse.Namespace):
-    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    model_name = "mistral-community/pixtral-12b"
     max_img_per_msg = 3
     max_tokens_per_img = 4096
 
From eb7b1eefabb28d9230be08cdc1b38b25f7f19ce6 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 06:25:46 +0000
Subject: [PATCH 06/11] Fix precommit and update type

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 44 +++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a76450258945..e582f0a77768 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -8,6 +8,7 @@
 from collections.abc import Awaitable, Iterable
 from functools import cached_property, lru_cache, partial
 from pathlib import Path
+from PIL import Image
 from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
                     cast)
 
@@ -28,7 +29,7 @@
     ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
-from pydantic import TypeAdapter
+from pydantic import TypeAdapter, ConfigDict, BaseModel
 # yapf: enable
 from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                           ProcessorMixin)
@@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
     """The type of the content part."""
 
 
+class PILImage(BaseModel):
+    """
+    A PIL.Image.Image object.
+    """ 
+    image: Image.Image
+    model_config = ConfigDict(arbitrary_types_allowed=True) 
+
+
+class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
+    """A simpler version of the param that only accepts a PIL image.
+
+    Example:
+    {
+        "image": ImageAsset('cherry_blossom').pil_image
+    }
+    """
+    image: Required[PILImage]
+
+
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain image_url.
     This is supported by OpenAI API, although it is not documented.
@@ -129,6 +149,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
+    CustomChatCompletionContentPILImageParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
     CustomChatCompletionContentSimpleAudioParam,
@@ -632,7 +653,7 @@ def parse_image_embeds(self,
         raise NotImplementedError
 
     @abstractmethod
-    def parse_image_object(self, image: object) -> None:
+    def parse_pil_image(self, image: Image.Image) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -681,7 +702,7 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
-    def parse_image_object(self, image: object) -> None:
+    def parse_pil_image(self, image: Image.Image) -> None:
         placeholder = self._tracker.add("image", image)
         self._add_placeholder(placeholder)
 
@@ -741,8 +762,8 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
-    def parse_image_object(self, image: object) -> None:
-        future: asyncio.Future[object] = asyncio.Future()
+    def parse_pil_image(self, image: Image.Image) -> None:
+        future: asyncio.Future[Image.Image] = asyncio.Future()
         future.set_result(image)
 
         placeholder = self._tracker.add("image", future)
@@ -866,12 +887,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
+_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
 # Need to validate url objects
 _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
-_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
+_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: dict[
@@ -884,7 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
-    "image": lambda part: part.get("image", None),
+    "image": lambda part: _PILImageParser(part).get("image", None), 
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -1025,6 +1047,10 @@ def _parse_chat_message_content_part(
     else:
         return str_content
 
+    if part_type == "image":
+        image_content = cast(Image.Image, content)
+        mm_parser.parse_pil_image(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
@@ -1033,10 +1059,6 @@ def _parse_chat_message_content_part(
         content = cast(Union[str, dict[str, str]], content)
         mm_parser.parse_image_embeds(content)
         return {'type': 'image'} if wrap_dicts else None
-    if part_type == "image":
-        image_content = cast(object, content)  # PIL image or similar
-        mm_parser.parse_image_object(image_content)
-        return {'type': 'image'} if wrap_dicts else None
     if part_type == "audio_url":
         str_content = cast(str, content)
         mm_parser.parse_audio(str_content)
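The new `PILImage` model above relies on pydantic's `arbitrary_types_allowed` escape hatch: pydantic has no built-in validator for `PIL.Image.Image`, so without that config it rejects the field at class-definition time; with it, validation falls back to a plain `isinstance` check. A minimal standalone sketch of the same pattern:

```python
from PIL import Image
from pydantic import BaseModel, ConfigDict


class PILImage(BaseModel):
    """Carries a PIL image through pydantic without a custom validator."""
    image: Image.Image
    model_config = ConfigDict(arbitrary_types_allowed=True)


# Accepted because the value is an instance of PIL.Image.Image.
wrapped = PILImage(image=Image.new("RGB", (8, 8)))
print(type(wrapped.image))  # <class 'PIL.Image.Image'>
```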
From 036981bfed702e8607555d2371f5751cdd53291a Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 06:36:39 +0000
Subject: [PATCH 07/11] Revert "test"

This reverts commit 4faa6cce809a1375b0d380ac67eee76d6e521017.

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 examples/offline_inference/mistral-small.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index f50e485feb67..330103d5818a 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,7 +6,6 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
-from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -56,7 +55,7 @@
 
 
 def run_simple_demo(args: argparse.Namespace):
-    model_name = "mistral-community/pixtral-12b"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
     sampling_params = SamplingParams(max_tokens=8192)
 
     llm = LLM(
@@ -67,7 +66,7 @@ def run_simple_demo(args: argparse.Namespace):
         limit_mm_per_prompt={"image": 1},
         max_model_len=4096,
         max_num_seqs=2,
-        tensor_parallel_size=1,
+        tensor_parallel_size=2,
         disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
 
@@ -79,7 +78,7 @@ def run_simple_demo(args: argparse.Namespace):
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image", "image": ImageAsset('cherry_blossom').pil_image},
+                {"type": "image_url", "image_url": {"url": image_url}},
             ],
         },
     ]
 
@@ -90,7 +89,7 @@ def run_simple_demo(args: argparse.Namespace):
 
 
 def run_advanced_demo(args: argparse.Namespace):
-    model_name = "mistral-community/pixtral-12b"
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
     max_img_per_msg = 3
     max_tokens_per_img = 4096
 
From 6a34e59c6d7c8964f3593e5c8b8f361eb799595b Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 14 Jun 2025 07:22:02 +0000
Subject: [PATCH 08/11] Format

Signed-off-by: Flora Feng <4florafeng@gmail.com>
Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index e582f0a77768..a7c6361882ea 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -8,7 +8,6 @@
 from collections.abc import Awaitable, Iterable
 from functools import cached_property, lru_cache, partial
 from pathlib import Path
-from PIL import Image
 from typing import (Any, Callable, Generic, Literal, Optional, TypeVar, Union,
                     cast)
 
@@ -29,7 +28,8 @@
     ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
-from pydantic import TypeAdapter, ConfigDict, BaseModel
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
 from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                           ProcessorMixin)
@@ -95,9 +95,9 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
 class PILImage(BaseModel):
     """
     A PIL.Image.Image object.
-    """ 
+    """
     image: Image.Image
-    model_config = ConfigDict(arbitrary_types_allowed=True) 
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
 
 class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
@@ -702,9 +702,9 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None: 
-        placeholder = self._tracker.add("image", image) 
-        self._add_placeholder(placeholder) 
+    def parse_pil_image(self, image: Image.Image) -> None:
+        placeholder = self._tracker.add("image", image)
+        self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
         audio = self._connector.fetch_audio(audio_url)
@@ -762,12 +762,12 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None: 
-        future: asyncio.Future[Image.Image] = asyncio.Future() 
-        future.set_result(image) 
- 
-        placeholder = self._tracker.add("image", future) 
-        self._add_placeholder(placeholder) 
+    def parse_pil_image(self, image: Image.Image) -> None:
+        future: asyncio.Future[Image.Image] = asyncio.Future()
+        future.set_result(image)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
@@ -906,7 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
-    "image": lambda part: _PILImageParser(part).get("image", None), 
+    "image": lambda part: _PILImageParser(part).get("image", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -1048,9 +1048,9 @@ def _parse_chat_message_content_part(
         return str_content
 
     if part_type == "image":
-        image_content = cast(Image.Image, content) 
-        mm_parser.parse_pil_image(image_content) 
-        return {'type': 'image'} if wrap_dicts else None 
+        image_content = cast(Image.Image, content)
+        mm_parser.parse_pil_image(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
From 18b99e2a9883cc89d01a24efa3f068d9b49224e8 Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Fri, 20 Jun 2025 05:59:25 +0000
Subject: [PATCH 09/11] Address feedback

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 examples/offline_inference/mistral-small.py |  7 +++++--
 tests/entrypoints/test_chat_utils.py        |  8 ++++----
 vllm/entrypoints/chat_utils.py              | 22 +++++++++++-----------
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index 330103d5818a..a38fc9216d40 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -6,6 +6,7 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
+from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
     )
 
     prompt = "Describe this image in one sentence."
-    image_url = "https://picsum.photos/id/237/200/300"
 
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
             ],
         },
     ]
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 5003ccab0fe8..e41ea686e992 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -264,8 +264,8 @@ def test_parse_chat_messages_multiple_images(
                 "url": image_url
             }
         }, {
-            "type": "image",
-            "image": ImageAsset('cherry_blossom').pil_image
+            "type": "image_pil",
+            "image_pil": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
@@ -301,8 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
                 "url": image_url
             }
         }, {
-            "type": "image",
-            "image": ImageAsset('cherry_blossom').pil_image
+            "type": "image_pil",
+            "image_pil": ImageAsset('cherry_blossom').pil_image
         }, {
             "type": "text",
             "text": "What's in these images?"
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index a7c6361882ea..aac827f97362 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -105,10 +105,10 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
 
     Example:
     {
-        "image": ImageAsset('cherry_blossom').pil_image
+        "image_pil": ImageAsset('cherry_blossom').pil_image
     }
     """
-    image: Required[PILImage]
+    image_pil: Required[PILImage]
 
 
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
@@ -653,7 +653,7 @@ def parse_image_embeds(self,
         raise NotImplementedError
 
     @abstractmethod
-    def parse_pil_image(self, image: Image.Image) -> None:
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
         raise NotImplementedError
 
     @abstractmethod
@@ -702,8 +702,8 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None:
-        placeholder = self._tracker.add("image", image)
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        placeholder = self._tracker.add("image", image_pil)
         self._add_placeholder(placeholder)
 
     def parse_audio(self, audio_url: str) -> None:
@@ -762,9 +762,9 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
-    def parse_pil_image(self, image: Image.Image) -> None:
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
         future: asyncio.Future[Image.Image] = asyncio.Future()
-        future.set_result(image)
+        future.set_result(image_pil)
 
         placeholder = self._tracker.add("image", future)
         self._add_placeholder(placeholder)
@@ -906,7 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
-    "image": lambda part: _PILImageParser(part).get("image", None),
+    "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
@@ -976,7 +976,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds", "image",
+                                       "image_embeds", "image_pil",
                                        "audio_url", "input_audio", "video_url")
 
@@ -1047,9 +1047,9 @@ def _parse_chat_message_content_part(
     else:
         return str_content
 
-    if part_type == "image":
+    if part_type == "image_pil":
         image_content = cast(Image.Image, content)
-        mm_parser.parse_pil_image(image_content)
+        mm_parser.parse_image_pil(image_content)
         return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
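Both this patch and PATCH 06 touch `MM_PARSER_MAP`, which is why a new modality costs only one table entry: `_parse_chat_message_content_mm_part` looks up the part's `"type"` key and calls the matching extractor. A stripped-down sketch of that dispatch pattern (the table and payload below are illustrative, not vLLM's actual objects):

```python
from typing import Any, Callable

# One extractor per part type; each pulls its payload out of the raw dict.
PARSER_MAP: dict[str, Callable[[dict[str, Any]], Any]] = {
    "image_url": lambda part: part.get("image_url", {}).get("url"),
    "image_pil": lambda part: part.get("image_pil"),
}

part = {"type": "image_pil", "image_pil": "<PIL.Image.Image object>"}
content = PARSER_MAP[part["type"]](part)  # dispatch on the "type" key
print(content)
```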
From 870ef7901ca4065e38c92eb06b97d8c170f3e21d Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Tue, 1 Jul 2025 06:35:35 +0000
Subject: [PATCH 10/11] Update doc

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 docs/features/multimodal_inputs.md | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
index c45d620dc8e0..ed11d2836037 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -101,6 +101,50 @@ To substitute multiple images inside the same text prompt, you can pass in a lis
 
 Full example:
 
+If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
+
+```python
+import torch
+from vllm import LLM
+from vllm.assets.image import ImageAsset
+
+llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+image_url = "https://picsum.photos/id/32/512/512"
+image_pil = ImageAsset('cherry_blossom').pil_image
+image_embeds = torch.load(...)
+
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant"},
+    {"role": "user", "content": "Hello"},
+    {"role": "assistant", "content": "Hello! How can I assist you today?"},
+    {
+        "role": "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_pil",
+            "image_pil": image_pil
+        }, {
+            "type": "image_embeds",
+            "image_embeds": image_embeds
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }],
+    },
+]
+
+# Perform inference and log output.
+outputs = llm.chat(conversation)
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
 ??? Code
From a178d9ae6d6177fe18c291e150401723c043678a Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Wed, 2 Jul 2025 02:23:57 +0000
Subject: [PATCH 11/11] Update field name

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 vllm/entrypoints/chat_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index aac827f97362..4b6c50526b10 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -96,7 +96,7 @@ class PILImage(BaseModel):
     """
     A PIL.Image.Image object.
     """
-    image: Image.Image
+    image_pil: Image.Image
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
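To close the loop, a hypothetical pytest-style check of the final `"image_pil"` part type, mirroring the existing tests in `tests/entrypoints/test_chat_utils.py`. The `parse_chat_messages` signature, the `phi3v_*` fixtures, and the `_assert_mm_data_is_image_input` helper are assumed from that module and are not shown in this series:

```python
from vllm.assets.image import ImageAsset
from vllm.entrypoints.chat_utils import parse_chat_messages


def test_parse_chat_messages_single_image_pil(phi3v_model_config,
                                              phi3v_tokenizer):
    # Assumed call signature, following the surrounding test module.
    conversation, mm_data = parse_chat_messages(
        [{
            "role": "user",
            "content": [{
                "type": "image_pil",
                "image_pil": ImageAsset("cherry_blossom").pil_image
            }, {
                "type": "text",
                "text": "What's in the image?"
            }]
        }],
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    # The PIL image should surface as exactly one image placeholder.
    assert conversation == [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    _assert_mm_data_is_image_input(mm_data, 1)
```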