From 84237f5fb984949591fcea91d836af3669c18648 Mon Sep 17 00:00:00 2001
From: Linkun Chen
Date: Mon, 4 Nov 2024 15:22:20 -0800
Subject: [PATCH 1/4] rebase on top of upstream main

Signed-off-by: Linkun Chen
---
 benchmarks/benchmark_throughput.py | 84 +++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 18 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 262b8652e49ff..e4f0d2011133a 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -8,6 +8,7 @@
 
 import torch
 import uvloop
+from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
@@ -38,12 +39,33 @@ class SampleRequest:
     multi_modal_data: Optional[MultiModalDataDict] = None
 
 
-def sample_requests(
-    dataset_path: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
-) -> List[SampleRequest]:
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+        model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -52,23 +74,36 @@ def sample_requests(
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
-
     # Shuffle the dataset.
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
-    for i in range(len(dataset)):
+    for data in dataset:
         if len(filtered_dataset) == num_requests:
             break
 
+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
         # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
         prompt_token_ids = tokenizer(prompt).input_ids
-        completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
@@ -82,7 +117,8 @@ def sample_requests(
         filtered_dataset.append(
             SampleRequest(prompt=prompt,
                           prompt_len=prompt_len,
-                          expected_output_len=output_len))
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data))
 
     return filtered_dataset
 
@@ -99,7 +135,9 @@ def run_vllm(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
-        prompts.append(TextPrompt(prompt=request.prompt))
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -148,7 +186,9 @@ async def run_vllm_async(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
    for request in requests:
-        prompts.append(TextPrompt(prompt=request.prompt))
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -272,9 +312,10 @@ def main(args: argparse.Namespace):
             for _ in range(args.num_prompts)
         ]
     else:
-        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
-                                   args.output_len)
+        requests = sample_requests(tokenizer, args)
 
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
     if args.backend == "vllm":
         if args.async_engine:
             elapsed_time = uvloop.run(
@@ -300,6 +341,13 @@ def main(args: argparse.Namespace):
                            for request in requests)
     total_output_tokens = sum(request.expected_output_len
                               for request in requests)
+    if is_multi_modal:
+        print(
+            "\033[91mWARNING\033[0m: Multi-modal request detected. The "
+            "following metrics is not accurate because image tokens are not "
+            "counted. See vllm-project/vllm/issues/9778 for details."
+        )
+        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
           f"{total_output_tokens / elapsed_time:.2f} output tokens/s")

From 13c726cded0f5bd99dfd997a538a3233a8a82c2a Mon Sep 17 00:00:00 2001
From: Linkun Chen
Date: Mon, 4 Nov 2024 15:22:20 -0800
Subject: [PATCH 2/4] rebase on top of upstream main

Signed-off-by: Linkun Chen
---
 benchmarks/benchmark_throughput.py | 67 +++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index e4f0d2011133a..1b41f99ce9dbb 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -9,6 +9,7 @@
 import torch
 import uvloop
 from PIL import Image
+from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
@@ -60,6 +61,33 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
     raise ValueError(f"Unsupported model {model}")
 
 
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+        model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                     args: argparse.Namespace) -> List[SampleRequest]:
     dataset_path: str = args.dataset
@@ -79,6 +107,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
+    for data in dataset:
     for data in dataset:
         if len(filtered_dataset) == num_requests:
             break
@@ -87,6 +116,25 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
         prompt = data["conversations"][0]["value"]
         completion = data["conversations"][1]["value"]
 
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
         multi_modal_data: Optional[MultiModalDataDict] = None
         if "image" in data:
             multi_modal_data = multi_modal_data or {}
@@ -119,6 +167,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                           prompt_len=prompt_len,
                           expected_output_len=output_len,
                           multi_modal_data=multi_modal_data))
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data))
 
     return filtered_dataset
 
@@ -135,6 +185,9 @@ def run_vllm(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -186,6 +239,9 @@ async def run_vllm_async(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -313,7 +369,10 @@ def main(args: argparse.Namespace):
         ]
     else:
         requests = sample_requests(tokenizer, args)
+        requests = sample_requests(tokenizer, args)
 
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
     is_multi_modal = any(request.multi_modal_data is not None
                          for request in requests)
     if args.backend == "vllm":
@@ -342,11 +401,9 @@ def main(args: argparse.Namespace):
     total_output_tokens = sum(request.expected_output_len
                               for request in requests)
     if is_multi_modal:
-        print(
-            "\033[91mWARNING\033[0m: Multi-modal request detected. The "
-            "following metrics is not accurate because image tokens are not "
-            "counted. See vllm-project/vllm/issues/9778 for details."
-        )
+        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
The " + "following metrics is not accurate because image tokens are not " + "counted. See vllm-project/vllm/issues/9778 for details.") # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " From 06c36beafbb9fd04f485e36ddf451302f1c108a8 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 4 Nov 2024 15:50:28 -0800 Subject: [PATCH 3/4] Update README to include GPT4V Signed-off-by: Linkun Chen --- benchmarks/README.md | 11 ++++++ benchmarks/benchmark_throughput.py | 59 ------------------------------ 2 files changed, 11 insertions(+), 59 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 192d6c4022c83..2aa4a285021f1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -6,3 +6,14 @@ You can download the dataset by running: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1b41f99ce9dbb..9c6260f16ec5e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -9,7 +9,6 @@ import torch import uvloop from PIL import Image -from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -61,33 +60,6 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str: raise ValueError(f"Unsupported model {model}") -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: - dataset_path: str = args.dataset - num_requests: int = args.num_prompts - fixed_output_len: Optional[int] = args.output_len - model: str = args.model -def _get_prompt_for_image_model(question: str, *, model: str) -> str: - """Prepend and append special tokens around the question to form a prompt. 
-
-    Args:
-        question: The input question text to wrap with special tokens
-        model: The name of the model being used, to determine which special
-            tokens to add
-
-    Returns:
-        The formatted prompt string with appropriate special tokens for the
-        model
-
-    Raises:
-        ValueError: If an unsupported model name is provided
-    """
-    model = model.lower()
-    if "pixtral" in model:
-        return f"[INST]{question}\n[IMG][/INST]"
-    raise ValueError(f"Unsupported model {model}")
-
-
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                     args: argparse.Namespace) -> List[SampleRequest]:
     dataset_path: str = args.dataset
@@ -107,7 +79,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
-    for data in dataset:
     for data in dataset:
         if len(filtered_dataset) == num_requests:
             break
@@ -116,25 +87,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
         prompt = data["conversations"][0]["value"]
         completion = data["conversations"][1]["value"]
 
-        multi_modal_data: Optional[MultiModalDataDict] = None
-        if "image" in data:
-            multi_modal_data = multi_modal_data or {}
-            image_path = data["image"]
-            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
-            assert isinstance(image_path,
-                              str), "Only support single image input"
-            try:
-                multi_modal_data["image"] = Image.open(image_path).convert(
-                    "RGB")
-            except FileNotFoundError:
-                # Ignore datapoint where asset is missing
-                continue
-            prompt = _get_prompt_for_image_model(question=prompt, model=model)
-
-        # Only keep the first two turns of each conversation.
-        prompt = data["conversations"][0]["value"]
-        completion = data["conversations"][1]["value"]
-
         multi_modal_data: Optional[MultiModalDataDict] = None
         if "image" in data:
             multi_modal_data = multi_modal_data or {}
@@ -167,8 +119,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                           prompt_len=prompt_len,
                           expected_output_len=output_len,
                           multi_modal_data=multi_modal_data))
-                          expected_output_len=output_len,
-                          multi_modal_data=multi_modal_data))
 
     return filtered_dataset
 
@@ -185,9 +135,6 @@ def run_vllm(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
-        prompts.append(
-            TextPrompt(prompt=request.prompt,
-                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
@@ -239,9 +186,6 @@ async def run_vllm_async(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
-        prompts.append(
-            TextPrompt(prompt=request.prompt,
-                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
@@ -369,10 +313,7 @@ def main(args: argparse.Namespace):
         ]
     else:
         requests = sample_requests(tokenizer, args)
-        requests = sample_requests(tokenizer, args)
 
-    is_multi_modal = any(request.multi_modal_data is not None
-                         for request in requests)
     is_multi_modal = any(request.multi_modal_data is not None
                          for request in requests)
     if args.backend == "vllm":

From bb85f1d4c2d1829bb56774a78b286289dc7f38d1 Mon Sep 17 00:00:00 2001
From: Linkun Chen
Date: Tue, 5 Nov 2024 09:51:25 -0800
Subject: [PATCH 4/4] typo: s/is/are

Signed-off-by: Linkun Chen
---
 benchmarks/benchmark_throughput.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 9c6260f16ec5e..159cf055737ce 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -343,8 +343,8 @@ def main(args: argparse.Namespace):
                               for request in requests)
     if is_multi_modal:
         print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
-              "following metrics is not accurate because image tokens are not "
-              "counted. See vllm-project/vllm/issues/9778 for details.")
+              "following metrics are not accurate because image tokens are not"
+              " counted. See vllm-project/vllm/issues/9778 for details.")
         # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
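For context, a minimal end-to-end run after applying this series could look like the sketch below. The model name, file locations, and prompt count are illustrative assumptions rather than part of the patches: `_get_prompt_for_image_model` only recognizes model names containing "pixtral", so a Pixtral checkpoint is assumed here, and the dataset json is the one fetched in the README change from patch 3.

```bash
# Hypothetical invocation; the paths and model are assumptions, adjust to your setup.
# Run from the directory holding the ShareGPT4V json and the coco/ folder so the
# relative image paths inside the json can be opened by the benchmark script.
python3 benchmarks/benchmark_throughput.py \
    --backend vllm \
    --dataset sharegpt4v_instruct_gpt4-vision_cap100k.json \
    --model mistralai/Pixtral-12B-2409 \
    --num-prompts 100
```

Datapoints whose images are missing are skipped, and the reported token throughput does not count image tokens, as flagged by the warning added in patch 1.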