Merge branch 'main' into refactor-moe
robertgshaw2-redhat authored Jul 2, 2024
2 parents 66de813 + c5832d2 commit d8d8be1
Showing 129 changed files with 2,679 additions and 1,701 deletions.
4 changes: 0 additions & 4 deletions .buildkite/download-images.sh
@@ -8,10 +8,6 @@ set -o pipefail
# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

10 changes: 10 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -74,6 +74,16 @@ steps:
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Pipeline Parallelism Test
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
- TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
- PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py


- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
4 changes: 4 additions & 0 deletions benchmarks/benchmark_latency.py
@@ -46,6 +46,7 @@ def main(args: argparse.Namespace):
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
)

sampling_params = SamplingParams(
@@ -220,6 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
8 changes: 4 additions & 4 deletions docs/requirements-docs.txt
@@ -1,7 +1,7 @@
sphinx == 6.2.1
sphinx-book-theme == 1.0.1
sphinx-copybutton == 0.5.2
myst-parser == 2.0.0
sphinx==6.2.1
sphinx-book-theme==1.0.1
sphinx-copybutton==0.5.2
myst-parser==2.0.0
sphinx-argparse

# packages to install to build the documentation
8 changes: 5 additions & 3 deletions docs/source/dev/multimodal/multimodal_index.rst
@@ -9,8 +9,10 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package,
which allows you to pass in multi-modal input alongside text and token prompts.

By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
you must decorate the model class with :meth:`InputRegistry.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`,
as well as :meth:`MULTIMODAL_REGISTRY.register_input_mapper <MultiModalRegistry.register_input_mapper>` for each modality type to support.

# TODO: Add more instructions on how to do that once embeddings is in.

Module Contents
+++++++++++++++
@@ -29,7 +31,7 @@ Registry
Base Classes
------------

.. autoclass:: vllm.multimodal.MultiModalData
.. autoclass:: vllm.multimodal.MultiModalDataDict
:members:
:show-inheritance:

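For readers tracking the registration change in the multimodal_index.rst diff above, here is a minimal sketch of the new flow. Only the decorator names (register_dummy_data, register_input_mapper) and MULTIMODAL_REGISTRY come from the updated docs; the INPUT_REGISTRY import path, the "image" modality key, the placeholder dummy-data factory, and the model class are illustrative assumptions rather than the exact vLLM API.

import torch.nn as nn

from vllm.inputs import INPUT_REGISTRY  # assumed location of the global InputRegistry
from vllm.multimodal import MULTIMODAL_REGISTRY


def dummy_data_for_my_model(ctx, seq_len):
    # Placeholder: return the dummy sequence / multi-modal data used for
    # memory profiling; see InputRegistry.register_dummy_data for the contract.
    raise NotImplementedError


# Hypothetical model class; decorator arguments are assumptions for illustration.
@MULTIMODAL_REGISTRY.register_input_mapper("image")  # one registration per modality
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_my_model)
class MyModelForConditionalGeneration(nn.Module):
    ...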
11 changes: 7 additions & 4 deletions docs/source/models/vlm.rst
@@ -36,7 +36,6 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="pixel_values",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=576,
@@ -49,7 +48,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:

* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`.

.. note::

``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
:class:`vllm.multimodal.MULTIMODAL_REGISTRY`.

.. code-block:: python
Expand All @@ -61,7 +65,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
"multi_modal_data": {"image": image},
})
for o in outputs:
@@ -93,7 +97,6 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--image-input-type pixel_values \
--image-token-id 32000 \
--image-input-shape 1,3,336,336 \
--image-feature-size 576 \
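As a usage note for the server launch shown in the vlm.rst diff above, the snippet below is a minimal sketch of a client request using the OpenAI Python client. It assumes the server is running locally on vLLM's default port 8000 with no API key configured, and the image URL is only a placeholder.

from openai import OpenAI

# Assumes the api_server launched above is listening on localhost:8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this image?"},
            # Placeholder URL; replace with any reachable image.
            {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
        ],
    }],
    max_tokens=64,
)
print(response.choices[0].message.content)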
56 changes: 8 additions & 48 deletions examples/llava_example.py
@@ -1,84 +1,44 @@
import argparse
import os
import subprocess

import torch
from PIL import Image

from vllm import LLM
from vllm.multimodal.image import ImageFeatureData, ImagePixelData

# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them


def run_llava_pixel_values(*, disable_image_processor: bool = False):
def run_llava():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="pixel_values",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=576,
disable_image_processor=disable_image_processor,
)

prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:")

if disable_image_processor:
image = torch.load("images/stop_sign_pixel_values.pt")
else:
image = Image.open("images/stop_sign.jpg")
image = Image.open("images/stop_sign.jpg")

outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
"multi_modal_data": {
"image": image
},
})

for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)


def run_llava_image_features():
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
image_input_type="image_features",
image_token_id=32000,
image_input_shape="1,576,1024",
image_feature_size=576,
)

prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:")

image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")

outputs = llm.generate({
"prompt": prompt,
"multi_modal_data": ImageFeatureData(image),
})

for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)


def main(args):
if args.type == "pixel_values":
run_llava_pixel_values()
else:
run_llava_image_features()
def main():
run_llava()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Demo on Llava")
parser.add_argument("--type",
type=str,
choices=["pixel_values", "image_features"],
default="pixel_values",
help="image input type")
args = parser.parse_args()
# Download from s3
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
local_directory = "images"
@@ -95,4 +55,4 @@ def main(args):
local_directory,
"--no-sign-request",
])
main(args)
main()
61 changes: 35 additions & 26 deletions examples/llava_next_example.py
@@ -4,35 +4,44 @@
from PIL import Image

from vllm import LLM, SamplingParams
from vllm.multimodal.image import ImagePixelData

# Dynamic image input is currently not supported and therefore
# a fixed image input shape and its corresponding feature size is required.
# See https://github.com/vllm-project/vllm/pull/4199 for the complete
# configuration matrix.

llm = LLM(
model="llava-hf/llava-v1.6-mistral-7b-hf",
image_input_type="pixel_values",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=1176,
)

prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
image = Image.open(BytesIO(requests.get(url).content))
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)

outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
},
sampling_params=sampling_params)

generated_text = ""
for o in outputs:
generated_text += o.outputs[0].text

print(f"LLM output:{generated_text}")

def run_llava_next():
llm = LLM(
model="llava-hf/llava-v1.6-mistral-7b-hf",
image_token_id=32000,
image_input_shape="1,3,336,336",
image_feature_size=1176,
)

prompt = "[INST] " + "<image>" * 1176 + (
"\nWhat is shown in this image? [/INST]")
url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
image = Image.open(BytesIO(requests.get(url).content))
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=100)

outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {
"image": image
}
},
sampling_params=sampling_params)

generated_text = ""
for o in outputs:
generated_text += o.outputs[0].text

print(f"LLM output:{generated_text}")


if __name__ == "__main__":
run_llava_next()
1 change: 0 additions & 1 deletion examples/openai_vision_api_client.py
@@ -3,7 +3,6 @@
Launch the vLLM server with the following command:
python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--image-input-type pixel_values \
--image-token-id 32000 \
--image-input-shape 1,3,336,336 \
--image-feature-size 576 \
6 changes: 3 additions & 3 deletions examples/phi3v_example.py
@@ -4,7 +4,6 @@
from PIL import Image

from vllm import LLM, SamplingParams
from vllm.multimodal.image import ImagePixelData


def run_phi3v():
@@ -17,7 +16,6 @@ def run_phi3v():
llm = LLM(
model=model_path,
trust_remote_code=True,
image_input_type="pixel_values",
image_token_id=32044,
image_input_shape="1,3,1008,1344",
image_feature_size=1921,
@@ -35,7 +33,9 @@ def run_phi3v():
outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": ImagePixelData(image),
"multi_modal_data": {
"image": image
},
},
sampling_params=sampling_params)
for o in outputs:
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers >= 4.42.0 # Required for Gemma 2.
transformers >= 4.42.0 # Required for Gemma 2 and for additional chat template parameters.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
14 changes: 13 additions & 1 deletion tests/async_engine/test_async_llm_engine.py
@@ -5,6 +5,7 @@
import torch

from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine

from ..utils import wait_for_gpu_memory_to_clear
@@ -23,15 +24,21 @@ def __init__(self):
self.add_request_calls = 0
self.abort_request_calls = 0
self.request_id = None
# Ugly, remove dependency when possible
self.parallel_config = ParallelConfig(1, 1, False)

async def step_async(self):
async def step_async(self, virtual_engine):
# PP size is 1, ignore virtual engine
self.step_calls += 1
return [RequestOutput(
request_id=self.request_id)] if self.request_id else []

async def process_model_inputs_async(self, *args, **kwargs):
pass

async def stop_remote_worker_execution_loop_async(self):
pass

def generate(self, request_id):
self.request_id = request_id

@@ -41,6 +48,7 @@ def stop_generating(self):
def add_request(self, **kwargs):
del kwargs # Unused
self.add_request_calls += 1
print(f'Request calls: {self.add_request_calls}')

async def add_request_async(self, **kwargs):
self.add_request_calls += 1
@@ -53,6 +61,9 @@ def abort_request(self, request_id):
def has_unfinished_requests(self):
return self.request_id is not None

def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
return self.request_id is not None


class MockAsyncLLMEngine(AsyncLLMEngine):

@@ -76,6 +87,7 @@ async def test_new_requests_event():
engine.engine.generate("2")
await asyncio.sleep(0)
await asyncio.sleep(0)
await asyncio.sleep(0)
assert engine.engine.add_request_calls == 2
assert engine.engine.step_calls >= 2
await asyncio.sleep(0.001)
