[Frontend] Support GPT-4V Chat Completions API #4200

Closed
wants to merge 25 commits

Changes from 18 commits

Commits (25)
ce770f4
Use discriminated union in prompt parsing
DarkLight1337 Apr 12, 2024
6b016bc
Fix some type errors along the way
DarkLight1337 Apr 12, 2024
7620354
Some more fixes
DarkLight1337 Apr 12, 2024
7c3e6d9
Apply formatter
DarkLight1337 Apr 12, 2024
7bdc84e
Refactor prompt parsing so that it can be shared between Chat Complet…
DarkLight1337 Apr 12, 2024
a7d1098
Make code more readable
DarkLight1337 Apr 12, 2024
8b9d636
Move assertion to a more appropriate place
DarkLight1337 Apr 12, 2024
c48c13a
Add code documentation
DarkLight1337 Apr 12, 2024
3530362
Decompose `_validate_prompt_and_tokenize`
DarkLight1337 Apr 12, 2024
b8feec9
Fix missing import due to renaming
DarkLight1337 Apr 12, 2024
89d9086
Merge branch 'upstream' into openai-typing
DarkLight1337 Apr 13, 2024
cc1a5b3
Fix bug when parsing array of tokens
DarkLight1337 Apr 13, 2024
f9c1135
Add token array to batch completions testing
DarkLight1337 Apr 13, 2024
f2e8180
Replace legacy `conint` with `Annotated` field
DarkLight1337 Apr 14, 2024
797326b
Merge branch 'upstream' into openai-typing
DarkLight1337 Apr 19, 2024
a26badd
Support image processor
DarkLight1337 Apr 19, 2024
8f991a3
Merge branch 'mm-data-processor' into openai-gpt4v
DarkLight1337 Apr 19, 2024
32aa3c7
Support GPT-4V Chat Completions API - Update VLM docs accordingly
DarkLight1337 Apr 19, 2024
5e099be
Chat template usage is already documented so no need to mention it again
DarkLight1337 May 8, 2024
6883061
Merge branch 'upstream' into openai-gpt4v
DarkLight1337 Jun 4, 2024
3d376bf
Fix some merge issues
DarkLight1337 Jun 4, 2024
81676b4
Update doc
DarkLight1337 Jun 4, 2024
a8d4875
Code cleanup and fix wrong inputs
DarkLight1337 Jun 4, 2024
57d65eb
Fix tests w.r.t. #5026
DarkLight1337 Jun 4, 2024
ddf3f06
Fix wrong number of expected tokens
DarkLight1337 Jun 4, 2024
3 changes: 3 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -82,6 +82,9 @@ steps:
- label: LogitsProcessor Test
command: pytest -v -s test_logits_processor.py

- label: Utils Test
command: pytest -v -s test_utils.py

- label: Worker Test
command: pytest -v -s worker

1 change: 1 addition & 0 deletions README.md
@@ -70,6 +70,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA, Llama 2, and Meta Llama 3 (`meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- LLavA-1.5 (`llava-hf/llava-1.5-7b-hf`, `llava-hf/llava-1.5-13b-hf`, etc.)
- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -85,6 +85,7 @@ Documentation
models/adding_model
models/engine_args
models/lora
models/vlm

.. toctree::
:maxdepth: 1
18 changes: 18 additions & 0 deletions docs/source/models/supported_models.rst
@@ -83,6 +83,24 @@ Alongside each architecture, we include some popular models that use it.
- LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
- :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
- ✅︎
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc.
-

.. note::

Models with an asterisk (\*) are missing a :code:`chat_template` in their HuggingFace :code:`tokenizer_config.json`. A predefined template can be found in our repo (:code:`examples/template_llava.jinja`). To host the OpenAI-compatible server, pass the chat template explicitly via the :code:`--chat-template` argument. You also need to provide the :code:`VisionLanguageConfig` arguments to initialize the model. See the following example:

.. code-block:: shell

$ python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--chat-template examples/template_llava.jinja \
--image-input-type pixel_values \
--image-token-id 32000 \
--image-input-shape 1,3,336,336 \
--image-feature-size 576
* - :code:`MiniCPMForCausalLM`
- MiniCPM
- :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
118 changes: 118 additions & 0 deletions docs/source/models/vlm.rst
@@ -0,0 +1,118 @@
.. _vlm:

Using VLMs
==========

This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.

Additional Engine Arguments
---------------------------

Apart from the :ref:`basic engine arguments <engine_args>`, VLMs require the following additional engine arguments.

.. option:: --image-input-type {pixel_values,image_features}

The image input type passed into vLLM. Should be one of "pixel_values" or "image_features".

.. option:: --image-token-id <id>

Input ID for image token.

.. option:: --image-input-shape <tuple>

The largest image input shape (the worst case for memory footprint) for the given input type. This is only used for vLLM's :code:`profile_run`.

For example, if the image tensor has shape :code:`(1, 3, 336, 336)`, then you should pass :code:`--image-input-shape 1,3,336,336`. See the sketch after this list for one way to derive suitable values.

.. option:: --image-feature-size <size>

The image feature size along the context dimension.

.. option:: --image-processor <name or path>

Name or path of the HuggingFace image processor to use.

.. option:: --image-processor-revision <revision>

The specific image processor version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

.. option:: --no-image-processor

Disables the use of the image processor, even if one is defined for the model on HuggingFace.
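
The values for :code:`--image-input-shape` and :code:`--image-feature-size` can usually be derived from the model's HuggingFace configuration. The snippet below is a minimal sketch (not part of vLLM itself) that assumes a CLIP-style ViT vision tower, as used by LLaVA-1.5:

.. code-block:: python

from transformers import AutoConfig, AutoImageProcessor

model_id = "llava-hf/llava-1.5-7b-hf"

# The image processor's crop size gives the spatial dimensions of the input tensor.
processor = AutoImageProcessor.from_pretrained(model_id)
height, width = processor.crop_size["height"], processor.crop_size["width"]
print(f"--image-input-shape 1,3,{height},{width}")  # 1,3,336,336

# The number of image feature tokens equals the number of ViT patches.
config = AutoConfig.from_pretrained(model_id)
patches_per_side = config.vision_config.image_size // config.vision_config.patch_size
print(f"--image-feature-size {patches_per_side ** 2}")  # 576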

Offline Batched Inference
-------------------------

To initialize a VLM, the above engine arguments must be passed to the ``LLM`` class when instantiating the engine.

.. code-block:: python

llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    image_input_type="pixel_values",
    image_token_id=32000,
    image_input_shape="1,3,336,336",
    image_feature_size=576,
)

For now, we only support a single image per text prompt when calling ``llm.generate``. To pass an image to the model, note the following parameters:

* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
* ``multi_modal_datas``: This should be an instance of ``ImagePixelData``.

.. code-block:: python

prompt = "<image>" * 576 + (
    "\nUSER: What is the content of this image?\nASSISTANT:")

# Load the image using PIL.Image
image = ...

outputs = llm.generate(prompt, multi_modal_datas=ImagePixelData(image))

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)

A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
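
Putting the pieces together, a self-contained sketch might look like the following (the image path is illustrative):

.. code-block:: python

from PIL import Image

from vllm import LLM
from vllm.sequence import ImagePixelData

llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    image_input_type="pixel_values",
    image_token_id=32000,
    image_input_shape="1,3,336,336",
    image_feature_size=576,
)

# The prompt must contain `image_feature_size` copies of the image token.
prompt = "<image>" * 576 + (
    "\nUSER: What is the content of this image?\nASSISTANT:")

image = Image.open("stop_sign.jpg")  # illustrative local image path

outputs = llm.generate(prompt, multi_modal_datas=ImagePixelData(image))
for o in outputs:
    print(o.outputs[0].text)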

OpenAI-Compatible Server
------------------------

We support image inputs to the OpenAI Chat API, as described in `GPT-4 with Vision <https://platform.openai.com/docs/guides/vision>`_.

Here is a simple example using the :code:`openai` package:

.. code-block:: python

from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Note that this model expects the image to come before the main text
chat_response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                },
            },
            {"type": "text", "text": "What's in this image?"},
        ],
    }],
)
print("Chat response:", chat_response)
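
The response object follows the standard OpenAI chat completion schema, so the reply text itself can be read with the usual accessor:

.. code-block:: python

# Print only the generated reply rather than the full response object.
print(chat_response.choices[0].message.content)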

.. note::

For now, we only support a single image per API call. Also, the ``detail`` parameter is ignored since it may not be applicable to other models.
15 changes: 6 additions & 9 deletions examples/llava_example.py
@@ -3,9 +3,10 @@
import subprocess

import torch
from PIL import Image

from vllm import LLM
from vllm.sequence import MultiModalData
from vllm.sequence import ImageFeatureData, ImagePixelData

# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.

@@ -23,11 +24,9 @@ def run_llava_pixel_values():
"\nUSER: What is the content of this image?\nASSISTANT:")

# This should be provided by another online or offline component.
images = torch.load("images/stop_sign_pixel_values.pt")
image = Image.open("images/stop_sign.jpg")

outputs = llm.generate(prompt,
multi_modal_data=MultiModalData(
type=MultiModalData.Type.IMAGE, data=images))
outputs = llm.generate(prompt, multi_modal_datas=ImagePixelData(image))
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
@@ -46,11 +45,9 @@ def run_llava_image_features():
"\nUSER: What is the content of this image?\nASSISTANT:")

# This should be provided by another online or offline component.
images = torch.load("images/stop_sign_image_features.pt")
image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")

outputs = llm.generate(prompt,
multi_modal_data=MultiModalData(
type=MultiModalData.Type.IMAGE, data=images))
outputs = llm.generate(prompt, multi_modal_datas=ImageFeatureData(image))
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
11 changes: 11 additions & 0 deletions examples/template_llava.jinja
@@ -0,0 +1,11 @@
{%- for message in messages -%}
{{ message['role'].upper() + ': ' + message['content'] }}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
@jamt9000 commented on May 20, 2024:

I think there should actually be no '\n'.

Going by the vicuna_v1 conversation style used for llava-v1.5-13b (e.g. here), which has sep2="</s>":

https://github.com/haotian-liu/LLaVA/blob/c121f0432da27facab705978f83c4ada465e46fd/llava/conversation.py#L242-L252

The initial prompt will look like this:

In [3]: from llava import conversation

In [4]: conv = conversation.conv_vicuna_v1.copy()

In [5]: conv.append_message(conv.roles[0], "Hi")

In [6]: conv.append_message(conv.roles[1], None)

In [7]: conv.get_prompt()
Out[7]: "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi ASSISTANT:"

And then continues like this with a </s> after each assistant response.

In [9]: conv.messages[-1][-1] = " Hello I am LLaVA".strip()  # The model's reply starts with a space, which looks like it gets stripped, e.g. here
In [10]: conv.get_prompt()
Out[10]: "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi ASSISTANT: Hello I am LLaVA</s>"
In [11]: conv.append_message(conv.roles[0], "What is the capital of France?")
In [12]: conv.append_message(conv.roles[1], None)
In [13]: conv.get_prompt()
Out[13]: "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi ASSISTANT: Hello I am LLaVA</s>USER: What is the capital of France? ASSISTANT:"

@DarkLight1337 (Member, Author) replied:

I modeled the chat template according to their HF repo. Their example used a newline right before ASSISTANT.

@jamt9000 replied:

Interesting, llama.cpp also has the same newline (before ASSISTANT and before the first USER after the system prompt). I guess it is mostly compatible with the original LLaVA style if the user messages could end with a newline during training, although the Jinja template also isn't handling the system prompt (which seems to be added with no SYSTEM: prefix in llama.cpp and the LLaVA repo's conv_vicuna_v1).

Will (and should?) </s> also get added after the ASSISTANT answer? I guess it will have been output from the model since it's the EOS token, but I'm not sure whether it gets stripped at some point before making it to Jinja.

@DarkLight1337 (Member, Author) replied:

In the HuggingFace code (above), you can see that the EOS token is included in the output. However, this is removed in vLLM, presumably in favor of returning the generated text in a more user-friendly manner.

{%- endif -%}
{%- endfor -%}


{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- 'ASSISTANT:' -}}
{% endif %}
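
To make the rendering behaviour discussed in the thread above concrete, the template can be rendered directly with jinja2. This is an illustrative sketch, not part of this PR, and the message content is made up:

# Render examples/template_llava.jinja to inspect the resulting prompt string.
# vLLM's server applies chat templates through the tokenizer, so this is only a
# stand-alone illustration of the formatting.
from jinja2 import Template

with open("examples/template_llava.jinja") as f:
    template = Template(f.read())

messages = [{"role": "user", "content": "What's in this image?"}]
prompt = template.render(messages=messages, add_generation_prompt=True)

print(repr(prompt))
# "USER: What's in this image?\nASSISTANT:"
# i.e. a single newline separates the last user turn from the generation prompt.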
6 changes: 5 additions & 1 deletion requirements-common.txt
@@ -9,10 +9,14 @@ transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
tiktoken == 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.9.3
outlines == 0.0.34 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4

# OpenAI server
openai
pydantic >= 2.0
pillow
4 changes: 0 additions & 4 deletions requirements-dev.txt
@@ -21,7 +21,6 @@ pytest-rerunfailures
pytest-shard
httpx
einops # required for MPT
openai
requests
ray
peft
@@ -30,6 +29,3 @@ ai2-olmo # required for OLMo

# Benchmarking
aiohttp

# Multimodal
pillow