From 84237f5fb984949591fcea91d836af3669c18648 Mon Sep 17 00:00:00 2001
From: Linkun Chen
Date: Mon, 4 Nov 2024 15:22:20 -0800
Subject: [PATCH 1/4] rebase on top of upstream main

Signed-off-by: Linkun Chen
---
 benchmarks/benchmark_throughput.py | 84 +++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 18 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 262b8652e49ff..e4f0d2011133a 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -8,6 +8,7 @@
 
 import torch
 import uvloop
+from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
@@ -38,12 +39,33 @@ class SampleRequest:
     multi_modal_data: Optional[MultiModalDataDict] = None
 
 
-def sample_requests(
-    dataset_path: str,
-    num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
-) -> List[SampleRequest]:
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+        model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -52,23 +74,36 @@ def sample_requests(
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
-
     # Shuffle the dataset.
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
-    for i in range(len(dataset)):
+    for data in dataset:
         if len(filtered_dataset) == num_requests:
             break
 
+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
         # Tokenize the prompts and completions.
-        prompt = dataset[i][0]
         prompt_token_ids = tokenizer(prompt).input_ids
-        completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
@@ -82,7 +117,8 @@ def sample_requests(
         filtered_dataset.append(
             SampleRequest(prompt=prompt,
                           prompt_len=prompt_len,
-                          expected_output_len=output_len))
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data))
 
     return filtered_dataset
 
@@ -99,7 +135,9 @@ def run_vllm(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
-        prompts.append(TextPrompt(prompt=request.prompt))
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -148,7 +186,9 @@ async def run_vllm_async(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
    for request in requests:
-        prompts.append(TextPrompt(prompt=request.prompt))
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -272,9 +312,10 @@ def main(args: argparse.Namespace):
             for _ in range(args.num_prompts)
         ]
     else:
-        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
-                                   args.output_len)
+        requests = sample_requests(tokenizer, args)
 
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
     if args.backend == "vllm":
         if args.async_engine:
             elapsed_time = uvloop.run(
@@ -300,6 +341,13 @@ def main(args: argparse.Namespace):
                            for request in requests)
     total_output_tokens = sum(request.expected_output_len
                               for request in requests)
+    if is_multi_modal:
+        print(
+            "\033[91mWARNING\033[0m: Multi-modal request detected. The "
+            "following metrics is not accurate because image tokens are not "
+            "counted. See vllm-project/vllm/issues/9778 for details."
+        )
+        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
           f"{total_output_tokens / elapsed_time:.2f} output tokens/s")

From 13c726cded0f5bd99dfd997a538a3233a8a82c2a Mon Sep 17 00:00:00 2001
From: Linkun Chen
Date: Mon, 4 Nov 2024 15:22:20 -0800
Subject: [PATCH 2/4] rebase on top of upstream main

Signed-off-by: Linkun Chen
---
 benchmarks/benchmark_throughput.py | 67 +++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index e4f0d2011133a..1b41f99ce9dbb 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -9,6 +9,7 @@
 import torch
 import uvloop
 from PIL import Image
+from PIL import Image
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
@@ -60,6 +61,33 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
     raise ValueError(f"Unsupported model {model}")
 
 
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+                    args: argparse.Namespace) -> List[SampleRequest]:
+    dataset_path: str = args.dataset
+    num_requests: int = args.num_prompts
+    fixed_output_len: Optional[int] = args.output_len
+    model: str = args.model
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+    """Prepend and append special tokens around the question to form a prompt.
+
+    Args:
+        question: The input question text to wrap with special tokens
+        model: The name of the model being used, to determine which special
+            tokens to add
+
+    Returns:
+        The formatted prompt string with appropriate special tokens for the
+        model
+
+    Raises:
+        ValueError: If an unsupported model name is provided
+    """
+    model = model.lower()
+    if "pixtral" in model:
+        return f"[INST]{question}\n[IMG][/INST]"
+    raise ValueError(f"Unsupported model {model}")
+
+
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                     args: argparse.Namespace) -> List[SampleRequest]:
     dataset_path: str = args.dataset
@@ -79,6 +107,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
+    for data in dataset:
     for data in dataset:
         if len(filtered_dataset) == num_requests:
             break
@@ -87,6 +116,25 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
         prompt = data["conversations"][0]["value"]
         completion = data["conversations"][1]["value"]
 
+        multi_modal_data: Optional[MultiModalDataDict] = None
+        if "image" in data:
+            multi_modal_data = multi_modal_data or {}
+            image_path = data["image"]
+            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+            assert isinstance(image_path,
+                              str), "Only support single image input"
+            try:
+                multi_modal_data["image"] = Image.open(image_path).convert(
+                    "RGB")
+            except FileNotFoundError:
+                # Ignore datapoint where asset is missing
+                continue
+            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
+        # Only keep the first two turns of each conversation.
+        prompt = data["conversations"][0]["value"]
+        completion = data["conversations"][1]["value"]
+
         multi_modal_data: Optional[MultiModalDataDict] = None
         if "image" in data:
             multi_modal_data = multi_modal_data or {}
@@ -119,6 +167,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                           prompt_len=prompt_len,
                           expected_output_len=output_len,
                           multi_modal_data=multi_modal_data))
+                          expected_output_len=output_len,
+                          multi_modal_data=multi_modal_data))
 
     return filtered_dataset
 
@@ -135,6 +185,9 @@ def run_vllm(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -186,6 +239,9 @@ async def run_vllm_async(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
@@ -313,7 +369,10 @@ def main(args: argparse.Namespace):
         ]
     else:
         requests = sample_requests(tokenizer, args)
+        requests = sample_requests(tokenizer, args)
 
+    is_multi_modal = any(request.multi_modal_data is not None
+                         for request in requests)
     is_multi_modal = any(request.multi_modal_data is not None
                          for request in requests)
     if args.backend == "vllm":
@@ -342,11 +401,9 @@ def main(args: argparse.Namespace):
     total_output_tokens = sum(request.expected_output_len
                               for request in requests)
     if is_multi_modal:
-        print(
-            "\033[91mWARNING\033[0m: Multi-modal request detected. The "
-            "following metrics is not accurate because image tokens are not "
-            "counted. See vllm-project/vllm/issues/9778 for details."
-        )
+        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
The " + "following metrics is not accurate because image tokens are not " + "counted. See vllm-project/vllm/issues/9778 for details.") # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " From 06c36beafbb9fd04f485e36ddf451302f1c108a8 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Mon, 4 Nov 2024 15:50:28 -0800 Subject: [PATCH 3/4] Update README to include GPT4V Signed-off-by: Linkun Chen --- benchmarks/README.md | 11 ++++++ benchmarks/benchmark_throughput.py | 59 ------------------------------ 2 files changed, 11 insertions(+), 59 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 192d6c4022c83..2aa4a285021f1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -6,3 +6,14 @@ You can download the dataset by running: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1b41f99ce9dbb..9c6260f16ec5e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -9,7 +9,6 @@ import torch import uvloop from PIL import Image -from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -61,33 +60,6 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str: raise ValueError(f"Unsupported model {model}") -def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: - dataset_path: str = args.dataset - num_requests: int = args.num_prompts - fixed_output_len: Optional[int] = args.output_len - model: str = args.model -def _get_prompt_for_image_model(question: str, *, model: str) -> str: - """Prepend and append special tokens around the question to form a prompt. 
-
-    Args:
-        question: The input question text to wrap with special tokens
-        model: The name of the model being used, to determine which special
-            tokens to add
-
-    Returns:
-        The formatted prompt string with appropriate special tokens for the
-        model
-
-    Raises:
-        ValueError: If an unsupported model name is provided
-    """
-    model = model.lower()
-    if "pixtral" in model:
-        return f"[INST]{question}\n[IMG][/INST]"
-    raise ValueError(f"Unsupported model {model}")
-
-
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                     args: argparse.Namespace) -> List[SampleRequest]:
     dataset_path: str = args.dataset
@@ -107,7 +79,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 
     # Filter out sequences that are too long or too short
     filtered_dataset: List[SampleRequest] = []
-    for data in dataset:
     for data in dataset:
         if len(filtered_dataset) == num_requests:
             break
@@ -116,25 +87,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
         prompt = data["conversations"][0]["value"]
         completion = data["conversations"][1]["value"]
 
-        multi_modal_data: Optional[MultiModalDataDict] = None
-        if "image" in data:
-            multi_modal_data = multi_modal_data or {}
-            image_path = data["image"]
-            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
-            assert isinstance(image_path,
-                              str), "Only support single image input"
-            try:
-                multi_modal_data["image"] = Image.open(image_path).convert(
-                    "RGB")
-            except FileNotFoundError:
-                # Ignore datapoint where asset is missing
-                continue
-            prompt = _get_prompt_for_image_model(question=prompt, model=model)
-
-        # Only keep the first two turns of each conversation.
-        prompt = data["conversations"][0]["value"]
-        completion = data["conversations"][1]["value"]
-
         multi_modal_data: Optional[MultiModalDataDict] = None
         if "image" in data:
             multi_modal_data = multi_modal_data or {}
@@ -167,8 +119,6 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                           prompt_len=prompt_len,
                           expected_output_len=output_len,
                           multi_modal_data=multi_modal_data))
-                          expected_output_len=output_len,
-                          multi_modal_data=multi_modal_data))
 
     return filtered_dataset
 
@@ -185,9 +135,6 @@ def run_vllm(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
-        prompts.append(
-            TextPrompt(prompt=request.prompt,
-                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
@@ -239,9 +186,6 @@ async def run_vllm_async(
     prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
     for request in requests:
-        prompts.append(
-            TextPrompt(prompt=request.prompt,
-                       multi_modal_data=request.multi_modal_data))
         prompts.append(
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
@@ -369,10 +313,7 @@ def main(args: argparse.Namespace):
         ]
     else:
         requests = sample_requests(tokenizer, args)
-        requests = sample_requests(tokenizer, args)
 
-    is_multi_modal = any(request.multi_modal_data is not None
-                         for request in requests)
     is_multi_modal = any(request.multi_modal_data is not None
                          for request in requests)
     if args.backend == "vllm":

From bb85f1d4c2d1829bb56774a78b286289dc7f38d1 Mon Sep 17 00:00:00 2001
From: Linkun Chen
Date: Tue, 5 Nov 2024 09:51:25 -0800
Subject: [PATCH 4/4] typo: s/is/are

Signed-off-by: Linkun Chen
---
 benchmarks/benchmark_throughput.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 9c6260f16ec5e..159cf055737ce 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -343,8 +343,8 @@ def main(args: argparse.Namespace):
                               for request in requests)
     if is_multi_modal:
         print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
-              "following metrics is not accurate because image tokens are not "
-              "counted. See vllm-project/vllm/issues/9778 for details.")
+              "following metrics are not accurate because image tokens are not"
+              " counted. See vllm-project/vllm/issues/9778 for details.")
         # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
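For context, a minimal end-to-end run after applying this series could look like the sketch below. The model name, file locations, and prompt count are illustrative assumptions rather than part of the patches: `_get_prompt_for_image_model` only recognizes model names containing "pixtral", so a Pixtral checkpoint is assumed here, and the dataset json is the one fetched in the README change from patch 3.

```bash
# Hypothetical invocation; the paths and model are assumptions, adjust to your setup.
# Run from the directory holding the ShareGPT4V json and the coco/ folder so the
# relative image paths inside the json can be opened by the benchmark script.
python3 benchmarks/benchmark_throughput.py \
    --backend vllm \
    --dataset sharegpt4v_instruct_gpt4-vision_cap100k.json \
    --model mistralai/Pixtral-12B-2409 \
    --num-prompts 100
```

Datapoints whose images are missing are skipped, and the reported token throughput does not count image tokens, as flagged by the warning added in patch 1.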