vllm-project · ywang96 · Jul 14, 2024 · Apr 8, 2024 · Apr 9, 2024 · Apr 9, 2024
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
@@ -51,6 +51,10 @@ Alongside each architecture, we include some popular models that use it.
     - Falcon
     - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
     -
+  * - :code:`FuyuForCausalLM`
+    - Fuyu
+    - :code:`adept/fuyu-8b`, etc.
+    -
   * - :code:`GemmaForCausalLM`
     - Gemma
     - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
@@ -139,6 +143,10 @@ Alongside each architecture, we include some popular models that use it.
     - Phi-3-Small
     - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
     -
+  * - :code:`PersimmonForCausalLM`
+    - Persimmon
+    - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
+    -
   * - :code:`Phi3VForCausalLM`
     - Phi-3-Vision
     - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.

diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py
@@ -0,0 +1,63 @@
+import math
+import os
+import subprocess
+
+from PIL import Image
+
+from vllm import LLM, SamplingParams
+from vllm.multimodal.image import ImagePixelData
+
+
+def run_fuyu_pixel_values():  # single-image Fuyu demo; prints generated text
+    llm = LLM(
+        model="adept/fuyu-8b",
+        max_model_len=4096,
+        image_input_type="pixel_values",
+        image_token_id=71011,
+        image_input_shape="1,3,1080,1920",
+        image_feature_size=2304,  # (1080 // 30) * (1920 // 30) = 36 * 64
+    )
+
+    # Load the image and size the placeholder grid: one slot per 30 px step.
+    image = Image.open("images/stop_sign.jpg")
+    W, H = image.size  # PIL reports (width, height)
+
+    nrow = math.ceil(min(H, 1080) / 30)  # capped at the configured height
+    ncol = math.ceil(min(W, 1920) / 30)  # capped at the configured width
+
+    # Expand "<image>" into nrow rows of ncol "|SPEAKER|" plus "|NEWLINE|".
+    prompt = "<image>\nWhat is the content of this image?\n"
+    prompt = prompt.replace("<image>",
+                            ("|SPEAKER|" * ncol + "|NEWLINE|") * nrow)
+
+    sampling_params = SamplingParams(temperature=0, max_tokens=64)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": ImagePixelData(image),
+        },
+        sampling_params=sampling_params)
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    # Fetch the example images from S3 into a local directory.
+    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
+    local_directory = "images"
+
+    # Create the local directory if it does not already exist.
+    os.makedirs(local_directory, exist_ok=True)
+
+    # Sync with the AWS CLI; --no-sign-request means anonymous access.
+    subprocess.check_call([  # raises CalledProcessError on non-zero exit
+        "aws",
+        "s3",
+        "sync",
+        s3_bucket_path,
+        local_directory,
+        "--no-sign-request",
+    ])
+    run_fuyu_pixel_values()
@@ -0,0 +1,115 @@
+from typing import List, Tuple
+
+import pytest
+
+from vllm.config import VisionLanguageConfig
+from vllm.utils import is_cpu
+
+from ..conftest import IMAGE_ASSETS
+
+pytestmark = pytest.mark.vlm
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "What color is the stop sign?\n",  # noqa: E501
+    "cherry_blossom":
+    "What is the season?\n",  # noqa: E501
+})
+
+
+def iter_fuyu_configs(model_name: str):  # yields (model_name, vl_config)
+    image_hw_to_feature_size = {
+        (420, 660): 308,  # (420 // 30) * (660 // 30) = 14 * 22
+    }
+
+    for (h, w), f in image_hw_to_feature_size.items():
+        for input_type, input_shape in [
+            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
+        ]:
+            yield (model_name,
+                   VisionLanguageConfig(image_input_type=input_type,
+                                        image_feature_size=f,
+                                        image_token_id=71011,
+                                        image_input_shape=input_shape,
+                                        image_processor=model_name,
+                                        image_processor_revision=None))
+
+
+model_and_vl_config = [
+    *iter_fuyu_configs("adept/fuyu-8b"),
+]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str]):
+    """Sanitize vllm output to be comparable with hf output.
+    The function drops the first two entries of `input_ids` and returns
+    `output_str` unchanged (no image placeholder text is stripped).
+    NOTE(review): presumably vLLM emits two leading ids HF omits - confirm.
+    """
+    input_ids, output_str = vllm_output
+
+    hf_input_ids = input_ids[2:]
+    hf_output_str = output_str
+
+    return hf_input_ids, hf_output_str
+
+
+target_dtype = "half"
+if is_cpu():
+    target_dtype = "bfloat16"
+
+
+# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test are under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalData objects and corresponding
+    vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The vllm output ids are trimmed so they can be compared with hf.
+    """
+    model_id, vlm_config = model_and_config
+    _, _, H, W = vlm_config.image_input_shape
+
+    # resize images to the model's input shape
+    hf_images = [asset.for_hf().resize((W, H)) for asset in image_assets]
+    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+    for vllm_image in vllm_images:
+        vllm_image.image = vllm_image.image.resize((W, H))
+
+    with hf_runner(model_id, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(
+            HF_IMAGE_PROMPTS,
+            max_tokens,
+            images=hf_images,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
+
+    ncol, nrow = W // 30, H // 30
+    image_prompts = ("|SPEAKER|" * ncol + "|NEWLINE|") * nrow
+    vllm_image_prompts = [
+        image_prompts + "<s> " + p + "\x04" for p in HF_IMAGE_PROMPTS
+    ]
+
+    with vllm_runner(model_id,
+                     max_model_len=1024,
+                     dtype=dtype,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                                  max_tokens,
+                                                  images=vllm_images)
+
+    for i in range(len(HF_IMAGE_PROMPTS)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        # Compare against the sanitized vllm output, not the raw tuple.
+        vllm_output_ids, vllm_output_str = vllm_to_hf_output(vllm_outputs[i])
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
@@ -22,6 +22,7 @@
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
     "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+    "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
     "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
     "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
@@ -48,6 +49,7 @@
     "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
     "OPTForCausalLM": ("opt", "OPTForCausalLM"),
     "OrionForCausalLM": ("orion", "OrionForCausalLM"),
+    "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
     "PhiForCausalLM": ("phi", "PhiForCausalLM"),
     "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),