From c0447dd2741a69c6706899c616c8861b21bbdc02 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 07:09:08 +0000 Subject: [PATCH 01/11] Indicate more information about supported modalities --- docs/source/models/supported_models.rst | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 2c20b6e48407d..de21bcc1012c2 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -194,12 +194,12 @@ Multimodal Language Models * - Architecture - Models - - Supported Modalities + - Modalities - Example HuggingFace Models - :ref:`LoRA ` * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - - Image + - Image\ :sup:`E` - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - * - :code:`ChameleonForConditionalGeneration` @@ -214,40 +214,43 @@ Multimodal Language Models - * - :code:`InternVLChatModel` - InternVL2 - - Image + - Image\ :sup:`E` - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. - * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - - Image + - Image\ :sup:`E` - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. - * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT - - Image + - Image\ :sup:`E+` - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - - Image + - Image\ :sup:`E` - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. - * - :code:`Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - - Image + - Image\ :sup:`E+` - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. - * - :code:`MiniCPMV` - MiniCPM-V - - Image + - Image\ :sup:`+` - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - * - :code:`UltravoxModel` - Ultravox - - Audio + - Audio\ :sup:`E+` - :code:`fixie-ai/ultravox-v0_3` - +| :sup:`E` Pre-computed embeddings can be inputted for this modality. +| :sup:`+` Multiple items can be inputted for this modality. + .. note:: For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 From 1b974988aa6c392c2c7ebf88bc19eed3957877d6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 07:11:02 +0000 Subject: [PATCH 02/11] Clarify --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index de21bcc1012c2..084be1e2a4f8e 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -249,7 +249,7 @@ Multimodal Language Models - | :sup:`E` Pre-computed embeddings can be inputted for this modality. -| :sup:`+` Multiple items can be inputted for this modality. +| :sup:`+` Multiple items can be inputted per text prompt for this modality. .. note:: For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. 
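To make the new :sup:`E` and :sup:`+` footnotes concrete, the snippet below sketches what "multiple items per text prompt" looks like in offline inference. It is only a sketch based on the multi-image API documented later in this series (``limit_mm_per_prompt`` plus a list of images in ``multi_modal_data``); the model name, prompt format, and image file names are illustrative assumptions rather than part of these patches.

.. code-block:: python

    from PIL import Image
    from vllm import LLM

    # Assumption: a model marked "+" in the table (e.g. Phi-3.5-vision) accepts
    # more than one image per prompt once limit_mm_per_prompt is raised.
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,            # Required to load Phi-3.5-vision
        max_model_len=4096,                # Otherwise, it may not fit in smaller GPUs
        limit_mm_per_prompt={"image": 2},  # Opt in to two images per prompt
    )

    # Prompt format follows the model's HuggingFace card: one <|image_i|>
    # placeholder per image. The image files here are placeholders.
    prompt = ("<|user|>\n<|image_1|>\n<|image_2|>\n"
              "What is shown in each image?<|end|>\n<|assistant|>\n")

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            # A list of images instead of a single image: this is what "+" enables.
            "image": [Image.open("duck.jpg"), Image.open("lion.jpg")],
        },
    })

    for o in outputs:
        print(o.outputs[0].text)
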
From d170164d05ba77a9a1bcf1eebc4d250cb2a3302d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 07:15:09 +0000 Subject: [PATCH 03/11] Remove outdated note about single-image input --- docs/source/models/vlm.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 236e37b51d470..e7ba357f06ff6 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -9,10 +9,6 @@ This document shows you how to run and serve these models using vLLM. .. important:: We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation. - Currently, the support for vision language models on vLLM has the following limitations: - - * Only single image input is supported per text prompt. - We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub `_ if you have any feedback or feature requests. Offline Batched Inference From 61905b8efe2b7018921521bf1a454fc503dfe434 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:08:19 +0000 Subject: [PATCH 04/11] Update docs and add example --- docs/source/models/vlm.rst | 133 ++++++++++++++---- ...e_inference_vision_language_multi_image.py | 50 +++++++ examples/openai_vision_api_client.py | 4 +- 3 files changed, 158 insertions(+), 29 deletions(-) create mode 100644 examples/offline_inference_vision_language_multi_image.py diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index e7ba357f06ff6..4ec0c403e364d 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -14,17 +14,18 @@ This document shows you how to run and serve these models using vLLM. Offline Batched Inference ------------------------- -To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine. +Single-image input +^^^^^^^^^^^^^^^^^^ + +The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models. .. code-block:: python llm = LLM(model="llava-hf/llava-1.5-7b-hf") -.. important:: +.. note:: We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow - the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that - internally for each model. - + the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that internally for each model. To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`: @@ -82,61 +83,139 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI A code example can be found in `examples/offline_inference_vision_language.py `_. +Multi-image input +^^^^^^^^^^^^^^^^^^ + +Multi-image input is only supported for a subset of VLMs, as shown :ref:`here `. + +To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class. + +.. 
code-block:: python + + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + # Set the value to the maximum number you want to support + limit_mm_per_prompt={"image": 2}, + ) + + # It's quite tedious to create the prompt with multiple image placeholders + # Let's instead use the chat template that is built into Phi-3.5-vision + image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" + image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + outputs = llm.chat([{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the animals in these images?" + }, + { + "type": "image_url", + "image_url": {"url": image_url_duck}, + }, + { + "type": "image_url", + "image_url": {"url": image_url_lion}, + }, + ], + }]) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. Online OpenAI Vision API Compatible Inference ---------------------------------------------- You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. -.. note:: - Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be - added in the future. - -Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server. - -.. important:: - Since OpenAI Vision API is based on `Chat `_ API, a chat template - is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the - HuggingFace Llava chat template that you can find in the example folder `here `_. +Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server. .. code-block:: bash - vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja + vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ + --trust-remote-code --limit-mm-per-prompt image=2 .. important:: - We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow - the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that - internally for each model. + Since OpenAI Vision API is based on `Chat Completions `_ API, + a chat template is **required** to launch the API server. + + Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. + The chat template can be inferred based on the documentation on the model's HuggingFace repo. + For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `_. To consume the server, you can use the OpenAI client like in the example below: .. code-block:: python from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. 
openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, base_url=openai_api_base, ) + + models = client.models.list() + model = models.data[0].id + assert model == "microsoft/Phi-3.5-vision-instruct" + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + # Use image url in the payload + chat_response = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?" + }, + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + ], + }], + model=model, + max_tokens=64, + ) + print("Chat completion output:", chat_response.choices[0].message.content) + + # Multi-image input inference + image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" + image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" chat_response = client.chat.completions.create( - model="llava-hf/llava-1.5-7b-hf", messages=[{ "role": "user", "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What's in this image?"}, + { + "type": "text", + "text": "What are the animals in these images?" + }, + { + "type": "image_url", + "image_url": {"url": image_url_duck}, + }, { "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - }, + "image_url": {"url": image_url_lion}, }, ], }], + model=model, + max_tokens=64, ) - print("Chat response:", chat_response) + print("Chat completion output:", chat_response.choices[0].message.content) + A full code example can be found in `examples/openai_vision_api_client.py `_. diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py new file mode 100644 index 0000000000000..db3a47627144e --- /dev/null +++ b/examples/offline_inference_vision_language_multi_image.py @@ -0,0 +1,50 @@ +""" +This example shows how to use vLLM for running offline inference with +multi-image input on vision language models, using the chat template defined +by the model. +""" +from typing import List + +from vllm import LLM + +IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", +] + + +def run_phi3v(image_urls: List[str]): + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + # Set the value to the maximum number you want to support + limit_mm_per_prompt={"image": 2}, + ) + + # It's quite tedious to create the prompt with multiple image placeholders + # Let's instead use the chat template that is built into Phi-3.5-vision + outputs = llm.chat([{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the animals in these images?" 
+ }, + *( + { + "type": "image_url", + "image_url": {"url": image_url}, + } + for image_url in image_urls + ), + ], + }]) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + run_phi3v(IMAGE_URLS) diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index e1d4055763e5f..694b8227fcd65 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -52,7 +52,7 @@ ) result = chat_completion_from_url.choices[0].message.content -print(f"Chat completion output:{result}") +print("Chat completion output:", result) # Use base64 encoded image in the payload @@ -122,4 +122,4 @@ def encode_image_base64_from_url(image_url: str) -> str: ) result = chat_completion_from_url.choices[0].message.content -print(f"Chat completion output:{result}") +print("Chat completion output:", result) From 53d4bea65d1485bddeca85adedef840f4a025d9a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:09:26 +0000 Subject: [PATCH 05/11] Add missing references --- docs/source/getting_started/debugging.rst | 2 +- docs/source/getting_started/quickstart.rst | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 117a9dd666481..31ecca1332e5d 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -21,7 +21,7 @@ If you have already taken care of the above issues, but the vLLM instance still With more logging, hopefully you can find the root cause of the issue. -If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. +If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. Here are some common issues that can cause hangs: diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 89bdc247c5e8e..80b19ac672936 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -24,7 +24,9 @@ Offline Batched Inference We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. -Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. +Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM. +The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine. +The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process. .. 
code-block:: python @@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. +Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. .. code-block:: python From 1861b424daa1423794153c0fae26b486f3f00565 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:14:34 +0000 Subject: [PATCH 06/11] Clean up --- docs/source/models/vlm.rst | 23 +++++++++++-------- ...e_inference_vision_language_multi_image.py | 3 +-- examples/openai_vision_api_client.py | 5 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 4ec0c403e364d..e3e5dd4a6d35b 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -11,8 +11,8 @@ This document shows you how to run and serve these models using vLLM. We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub `_ if you have any feedback or feature requests. -Offline Batched Inference -------------------------- +Offline Inference +----------------- Single-image input ^^^^^^^^^^^^^^^^^^ @@ -25,7 +25,7 @@ The :class:`~vllm.LLM` class can be instantiated in much the same way as languag .. note:: We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow - the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that internally for each model. + the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model. To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`: @@ -84,7 +84,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI A code example can be found in `examples/offline_inference_vision_language.py `_. Multi-image input -^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^ Multi-image input is only supported for a subset of VLMs, as shown :ref:`here `. @@ -96,14 +96,15 @@ To enable multiple multi-modal items per text prompt, you have to set ``limit_mm model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, # Required to load Phi-3.5-vision max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - # Set the value to the maximum number you want to support - limit_mm_per_prompt={"image": 2}, + limit_mm_per_prompt={"image": 2}, # The maximum number to accept ) # It's quite tedious to create the prompt with multiple image placeholders # Let's instead use the chat template that is built into Phi-3.5-vision image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + + # Multi-image input inference outputs = llm.chat([{ "role": "user", "content": [ @@ -128,8 +129,11 @@ To enable multiple multi-modal items per text prompt, you have to set ``limit_mm A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. 
-Online OpenAI Vision API Compatible Inference ----------------------------------------------- +Online Inference +---------------- + +OpenAI Vision API +^^^^^^^^^^^^^^^^^ You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. @@ -168,9 +172,9 @@ To consume the server, you can use the OpenAI client like in the example below: model = models.data[0].id assert model == "microsoft/Phi-3.5-vision-instruct" + # Single-image input inference image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - # Use image url in the payload chat_response = client.chat.completions.create( messages=[{ "role": "user", @@ -193,6 +197,7 @@ To consume the server, you can use the OpenAI client like in the example below: # Multi-image input inference image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + chat_response = client.chat.completions.create( messages=[{ "role": "user", diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index db3a47627144e..94e46d8e911fa 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -18,8 +18,7 @@ def run_phi3v(image_urls: List[str]): model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, # Required to load Phi-3.5-vision max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - # Set the value to the maximum number you want to support - limit_mm_per_prompt={"image": 2}, + limit_mm_per_prompt={"image": 2}, # The maximum number to accept ) # It's quite tedious to create the prompt with multiple image placeholders diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index 694b8227fcd65..1ba702ef019e4 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -27,9 +27,10 @@ models = client.models.list() model = models.data[0].id +# Single-image input inference image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -# Use image url in the payload +## Use image url in the payload chat_completion_from_url = client.chat.completions.create( messages=[{ "role": @@ -55,7 +56,7 @@ print("Chat completion output:", result) -# Use base64 encoded image in the payload +## Use base64 encoded image in the payload def encode_image_base64_from_url(image_url: str) -> str: """Encode an image retrieved from a remote url to base64 format.""" From 5df8488a6ac6f96420c3af0c2c7af8313007963b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:41:06 +0000 Subject: [PATCH 07/11] Use generate method --- docs/source/models/vlm.rst | 38 ++++----- ...e_inference_vision_language_multi_image.py | 79 +++++++++++++++---- 2 files changed, 78 insertions(+), 39 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index e3e5dd4a6d35b..5eee08e8bb9a9 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -99,29 +99,23 @@ To enable multiple multi-modal items per text prompt, you have to set ``limit_mm 
limit_mm_per_prompt={"image": 2}, # The maximum number to accept ) - # It's quite tedious to create the prompt with multiple image placeholders - # Let's instead use the chat template that is built into Phi-3.5-vision - image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" - image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" +Instead of passing in a single image, you can pass in a list of images. - # Multi-image input inference - outputs = llm.chat([{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What are the animals in these images?" - }, - { - "type": "image_url", - "image_url": {"url": image_url_duck}, - }, - { - "type": "image_url", - "image_url": {"url": image_url_lion}, - }, - ], - }]) +.. code-block:: python + + # Refer to the HuggingFace repo for the correct format to use + prompt = "<|user|>\n\n\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + + # Load the images using PIL.Image + image1 = PIL.Image.open(...) + image2 = PIL.Image.open(...) + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, + }) for o in outputs: generated_text = o.outputs[0].text diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 94e46d8e911fa..e64555b8fdf6a 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -3,40 +3,65 @@ multi-image input on vision language models, using the chat template defined by the model. """ +from argparse import Namespace from typing import List from vllm import LLM +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser +QUESTION = "What is the content of each image?" IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", ] -def run_phi3v(image_urls: List[str]): - llm = LLM( +def _load_phi3v(image_urls: List[str]): + return LLM( model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, # Required to load Phi-3.5-vision - max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - limit_mm_per_prompt={"image": 2}, # The maximum number to accept + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, ) - # It's quite tedious to create the prompt with multiple image placeholders - # Let's instead use the chat template that is built into Phi-3.5-vision + +def run_phi3v_generate(question: str, image_urls: List[str]): + llm = _load_phi3v(image_urls) + + placeholders = "\n".join(f"<|image_{i}|>" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [fetch_image(url) for url in image_urls] + }, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def run_phi3v_chat(question: str, image_urls: List[str]): + llm = _load_phi3v(image_urls) + outputs = llm.chat([{ - "role": "user", + "role": + "user", "content": [ { "type": "text", - "text": "What are the animals in these images?" 
+ "text": question, }, - *( - { - "type": "image_url", - "image_url": {"url": image_url}, - } - for image_url in image_urls - ), + *({ + "type": "image_url", + "image_url": { + "url": image_url + }, + } for image_url in image_urls), ], }]) @@ -45,5 +70,25 @@ def run_phi3v(image_urls: List[str]): print(generated_text) +def main(args: Namespace): + method = args.method + + if method == "generate": + run_phi3v_generate(QUESTION, IMAGE_URLS) + elif method == "chat": + run_phi3v_chat(QUESTION, IMAGE_URLS) + else: + raise ValueError(f"Invalid method: {method}") + + if __name__ == "__main__": - run_phi3v(IMAGE_URLS) + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models that support multi-image input') + parser.add_argument("method", + type=str, + choices=["generate", "chat"], + help="The method to run in `vllm.LLM`.") + + args = parser.parse_args() + main(args) From 9cdbf0e9963a89dfbf94180d55af749175f9e595 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:42:54 +0000 Subject: [PATCH 08/11] Remove some unnecessary lines --- docs/source/models/vlm.rst | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 5eee08e8bb9a9..0ab944c68e6d4 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -152,24 +152,19 @@ To consume the server, you can use the OpenAI client like in the example below: from openai import OpenAI - # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") api_key=openai_api_key, base_url=openai_api_base, ) - models = client.models.list() - model = models.data[0].id - assert model == "microsoft/Phi-3.5-vision-instruct" - # Single-image input inference image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_response = client.chat.completions.create( + "microsoft/Phi-3.5-vision-instruct", messages=[{ "role": "user", "content": [ @@ -183,8 +178,6 @@ To consume the server, you can use the OpenAI client like in the example below: }, ], }], - model=model, - max_tokens=64, ) print("Chat completion output:", chat_response.choices[0].message.content) @@ -193,6 +186,7 @@ To consume the server, you can use the OpenAI client like in the example below: image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" chat_response = client.chat.completions.create( + "microsoft/Phi-3.5-vision-instruct", messages=[{ "role": "user", "content": [ @@ -210,8 +204,6 @@ To consume the server, you can use the OpenAI client like in the example below: }, ], }], - model=model, - max_tokens=64, ) print("Chat completion output:", chat_response.choices[0].message.content) From f7476786827b5076d66ea45e7d744967e13ec063 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:44:43 +0000 Subject: [PATCH 09/11] Further compress the lines --- docs/source/models/vlm.rst | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 0ab944c68e6d4..08db891665044 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -164,18 +164,14 @@ To consume the 
server, you can use the OpenAI client like in the example below: image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" chat_response = client.chat.completions.create( - "microsoft/Phi-3.5-vision-instruct", + model="microsoft/Phi-3.5-vision-instruct", messages=[{ "role": "user", "content": [ - { - "type": "text", - "text": "What’s in this image?" - }, - { - "type": "image_url", - "image_url": {"url": image_url}, - }, + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. + {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, ], }], ) @@ -186,22 +182,13 @@ To consume the server, you can use the OpenAI client like in the example below: image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" chat_response = client.chat.completions.create( - "microsoft/Phi-3.5-vision-instruct", + model="microsoft/Phi-3.5-vision-instruct", messages=[{ "role": "user", "content": [ - { - "type": "text", - "text": "What are the animals in these images?" - }, - { - "type": "image_url", - "image_url": {"url": image_url_duck}, - }, - { - "type": "image_url", - "image_url": {"url": image_url_lion}, - }, + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, ], }], ) From 7950c27e183fcd1aaaaf284f1599da1fa3440feb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:48:05 +0000 Subject: [PATCH 10/11] Add new example to the tests --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b2874750a777e..6f9802899205e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -158,6 +158,7 @@ steps: - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - python3 offline_inference_vision_language.py + - python3 offline_inference_vision_language_multi_image.py generate - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py From 97b6006f5452199de6c57275aebecca1e41e1ea8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 5 Sep 2024 08:49:34 +0000 Subject: [PATCH 11/11] Use a default argument --- .buildkite/test-pipeline.yaml | 2 +- examples/offline_inference_vision_language_multi_image.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6f9802899205e..d0317b2fc48c9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -158,7 +158,7 @@ steps: - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - python3 offline_inference_vision_language.py - - python3 offline_inference_vision_language_multi_image.py generate + - python3 offline_inference_vision_language_multi_image.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py 
--model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index e64555b8fdf6a..73543ab5da2b4 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -85,8 +85,9 @@ def main(args: Namespace): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models that support multi-image input') - parser.add_argument("method", + parser.add_argument("--method", type=str, + default="generate", choices=["generate", "chat"], help="The method to run in `vllm.LLM`.")
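
For context on the final change above, the short sketch below shows the effect of making ``--method`` optional with a default value: the CI entry in ``.buildkite/test-pipeline.yaml`` can then invoke the example without any arguments. Plain ``argparse`` is used here as a stand-in for vLLM's ``FlexibleArgumentParser``; that substitution is an assumption made only for illustration.

.. code-block:: python

    from argparse import ArgumentParser

    # Stand-in for FlexibleArgumentParser; the option mirrors the patched example.
    parser = ArgumentParser(description="multi-image example (sketch)")
    parser.add_argument("--method",
                        type=str,
                        default="generate",
                        choices=["generate", "chat"],
                        help="The method to run in `vllm.LLM`.")

    # No arguments, as in the CI pipeline: falls back to the default.
    print(parser.parse_args([]))                    # Namespace(method='generate')

    # Explicit selection still works.
    print(parser.parse_args(["--method", "chat"]))  # Namespace(method='chat')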