
Commit adecd9a

JenZhao authored and ywang96 committed
[Feature] Add visionarena offline support for benchmark_throughput (vllm-project#14654)
Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Jennifer Zhao <JenZhao@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
1 parent 348f85f commit adecd9a

File tree

3 files changed: +291 -99 lines changed


benchmarks/README.md

Lines changed: 47 additions & 11 deletions
@@ -43,20 +43,26 @@ become available.
 <tr>
 <td><strong>HuggingFace</strong></td>
 <td style="text-align: center;">✅</td>
-<td style="text-align: center;">🚧</td>
+<td style="text-align: center;">🟡</td>
 <td>Specify your dataset path on HuggingFace</td>
 </tr>
 <tr>
 <td><strong>VisionArena</strong></td>
 <td style="text-align: center;">✅</td>
-<td style="text-align: center;">🚧</td>
+<td style="text-align: center;">✅</td>
 <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
 </tr>
 </tbody>
 </table>
-✅: supported
+
+✅: supported
+
 🚧: to be supported

+🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
+formats, please consider contributing.
+
 **Note**: VisionArena’s `dataset-name` should be set to `hf`

 ---
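
The 🟡 note above names `lmms-lab/LLaVA-OneVision-Data` as the supported format. Below is a minimal sketch of the record shape that note implies, inferred from the fields `HuggingFaceDataset.sample()` reads in this commit; the `"from"` keys are illustrative assumptions, since only `conversations`, `value`, and the optional `image` field are actually accessed.

```python
# Sketch of a HuggingFaceDataset-compatible record, inferred from the fields
# read by HuggingFaceDataset.sample() in this commit. The "from" keys are
# illustrative assumptions; only "conversations", "value", and the optional
# "image" field are accessed by the benchmark code.
example_record = {
    "conversations": [
        {"from": "human", "value": "What is shown in this image?"},
        {"from": "gpt", "value": "A cat sitting on a windowsill."},
    ],
    "image": None,  # optionally a PIL.Image.Image
}

# sample() takes the first turn as the prompt and the second as the completion
prompt = example_record["conversations"][0]["value"]
completion = example_record["conversations"][1]["value"]
```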
@@ -79,7 +85,7 @@ NUM_PROMPTS=10
 BACKEND="openai-chat"
 DATASET_NAME="sharegpt"
 DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
 ```

 If successful, you will see the following output
@@ -123,7 +129,7 @@ DATASET_NAME="hf"
 DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
 DATASET_SPLIT='train'

-python3 benchmarks/benchmark_serving.py \
+python3 vllm/benchmarks/benchmark_serving.py \
     --backend "${BACKEND}" \
     --model "${MODEL_NAME}" \
     --endpoint "/v1/chat/completions" \
@@ -140,35 +146,65 @@ python3 benchmarks/benchmark_serving.py \
 MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
 NUM_PROMPTS=10
 DATASET_NAME="sonnet"
-DATASET_PATH="benchmarks/sonnet.txt"
+DATASET_PATH="vllm/benchmarks/sonnet.txt"

-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
     --model "${MODEL_NAME}" \
     --dataset-name "${DATASET_NAME}" \
     --dataset-path "${DATASET_PATH}" \
     --num-prompts "${NUM_PROMPTS}"
-```
+```

 If successful, you will see the following output

 ```
-Throughput: 7.35 requests/s, 4789.20 total tokens/s, 1102.83 output tokens/s
+Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
+Total num prompt tokens: 5014
+Total num output tokens: 1500
+```
+
+### VisionArena Benchmark for Vision Language Models
+
+``` bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT="train"
+
+python3 vllm/benchmarks/benchmark_throughput.py \
+    --model "${MODEL_NAME}" \
+    --backend "vllm-chat" \
+    --dataset-name "${DATASET_NAME}" \
+    --dataset-path "${DATASET_PATH}" \
+    --num-prompts "${NUM_PROMPTS}" \
+    --hf-split "${DATASET_SPLIT}"
+```
+
+The `num prompt tokens` now includes image token counts
+
+```
+Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
+Total num prompt tokens: 14527
+Total num output tokens: 1280
 ```

 ### Benchmark with LoRA Adapters

 ``` bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 MODEL_NAME="meta-llama/Llama-2-7b-hf"
 BACKEND="vllm"
 DATASET_NAME="sharegpt"
-DATASET_PATH="/home/jovyan/data/vllm_benchmark_datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
 NUM_PROMPTS=10
 MAX_LORAS=2
 MAX_LORA_RANK=8
 ENABLE_LORA="--enable-lora"
 LORA_PATH="yard1/llama-2-7b-sql-lora-test"

-python3 benchmarks/benchmark_throughput.py \
+python3 vllm/benchmarks/benchmark_throughput.py \
     --model "${MODEL_NAME}" \
     --backend "${BACKEND}" \
     --dataset_path "${DATASET_PATH}" \
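
As a rough sanity check on the VisionArena throughput output added above, the reported metrics relate in the usual way; this is a sketch assuming the standard definitions, not the script's actual measurement, so the rounded inputs only approximately reproduce the reported values.

```python
# Back-of-the-envelope check of the VisionArena throughput figures above,
# assuming requests/s = num_prompts / elapsed and
# total tokens/s = (prompt + output tokens) / elapsed. The script measures
# elapsed time directly, so rounding makes this only approximate.
num_prompts = 10
requests_per_s = 2.55
prompt_tokens = 14527  # includes image tokens when chat prompts are used
output_tokens = 1280

elapsed = num_prompts / requests_per_s                   # ~3.92 s
total_tok_s = (prompt_tokens + output_tokens) / elapsed  # ~4031 (reported 4036.92)
output_tok_s = output_tokens / elapsed                   # ~326 (reported 326.90)
print(f"{total_tok_s:.2f} total tokens/s, {output_tok_s:.2f} output tokens/s")
```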

benchmarks/benchmark_dataset.py

Lines changed: 43 additions & 22 deletions
@@ -46,7 +46,7 @@ class SampleRequest:
     Represents a single inference request for benchmarking.
     """

-    prompt: str
+    prompt: Union[str, Any]
     prompt_len: int
     expected_output_len: int
     multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
@@ -84,6 +84,20 @@ def __init__(
                            if random_seed is not None else self.DEFAULT_SEED)
         self.data = None

+    def apply_multimodal_chat_transformation(
+            self,
+            prompt: str,
+            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
+        """
+        Transform a prompt and optional multimodal content into a chat format.
+        This method is used for chat models that expect a specific
+        conversation format.
+        """
+        content = [{"text": prompt, "type": "text"}]
+        if mm_content is not None:
+            content.append(mm_content)
+        return [{"role": "user", "content": content}]
+
     def load_data(self) -> None:
         """
         Load data from the dataset path into self.data.
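
The new helper wraps a plain prompt in an OpenAI-style chat message. Below is an illustrative result for a text prompt plus one image entry; the exact `mm_content` shape comes from `process_image()`, which is not part of this diff, so the `image_url` form shown is an assumption.

```python
# What apply_multimodal_chat_transformation("Describe this image.", mm_content)
# would return. The mm_content dict below is a hypothetical example of an
# OpenAI-style image entry; the real shape is produced by process_image().
mm_content = {"type": "image_url",
              "image_url": {"url": "data:image/jpeg;base64,..."}}
messages = [{
    "role": "user",
    "content": [
        {"text": "Describe this image.", "type": "text"},
        mm_content,
    ],
}]
```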
@@ -338,6 +352,7 @@ def sample(self,
                lora_path: Optional[str] = None,
                max_loras: Optional[int] = None,
                output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
                **kwargs) -> list:
         samples: list = []
         for entry in self.data:
@@ -358,6 +373,9 @@ def sample(self,
                     skip_min_output_len_check=output_len
                     is not None):
                 continue
+            if enable_multimodal_chat:
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, None)
             samples.append(
                 SampleRequest(
                     prompt=prompt,
@@ -550,34 +568,32 @@ def load_data(self) -> None:
             split=self.dataset_split,
             streaming=True,
         )
-
-        if "conversations" not in self.data.features:
-            raise ValueError("HF Dataset must have a 'conversations' column.")
-
+        if self.data.features is None or "conversations" \
+                not in self.data.features:
+            raise ValueError(
+                "HuggingFaceDataset currently only supports datasets with "
+                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+                "Please consider contributing if you would like to add "
+                "support for additional dataset formats.")
         # Shuffle and filter examples with at least 2 conversations.
         self.data = self.data.shuffle(seed=self.random_seed).filter(
             lambda x: len(x["conversations"]) >= 2)

     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
                num_requests: int,
-               lora_path: Optional[str] = None,
-               max_loras: Optional[int] = None,
                output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
                **kwargs) -> list:
         sampled_requests = []
         dynamic_output = output_len is None

         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-
             conv = item["conversations"]
             prompt, completion = conv[0]["value"], conv[1]["value"]

-            lora_request, tokenizer = self.get_random_lora_request(
-                tokenizer, lora_path=lora_path, max_loras=max_loras)
-
             prompt_ids = tokenizer(prompt).input_ids
             completion_ids = tokenizer(completion).input_ids
             prompt_len = len(prompt_ids)
@@ -587,16 +603,20 @@ def sample(self,
             if dynamic_output and not is_valid_sequence(
                     prompt_len, completion_len):
                 continue
-
             mm_content = process_image(
                 item["image"]) if "image" in item else None
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len and output len
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
-                    lora_request=lora_request,
                 ))
         return sampled_requests

@@ -606,7 +626,7 @@ def sample(self,
 # -----------------------------------------------------------------------------


-class VisionArenaDataset(BenchmarkDataset):
+class VisionArenaDataset(HuggingFaceDataset):
     """
     Vision Arena Dataset.
     """
@@ -617,14 +637,9 @@ class VisionArenaDataset(BenchmarkDataset):

     def __init__(
         self,
-        dataset_split: str,
-        dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
-        self.dataset_split = dataset_split
-        self.dataset_subset = dataset_subset
-
         if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
             raise ValueError(f"Only support Vision Arena dataset.\
                     This data path {self.dataset_path} is not valid.")
@@ -645,18 +660,24 @@ def load_data(self) -> None:
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
                num_requests: int,
-               output_len: int = DEFAULT_OUTPUT_LEN,
+               output_len: Optional[int] = None,
+               enable_multimodal_chat: bool = False,
                **kwargs) -> list:
-        # TODO (jenniferzhao): Add support for offline benchmark sampling
         output_len = (output_len
                       if output_len is not None else self.DEFAULT_OUTPUT_LEN)
         sampled_requests = []
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
             prompt = item["turns"][0][0]["content"]
-            prompt_len = len(tokenizer(prompt).input_ids)
             mm_content = process_image(item["images"][0])
+            prompt_len = len(tokenizer(prompt).input_ids)
+            if enable_multimodal_chat:
+                # Note: when chat is enabled the request prompt_len is no longer
+                # accurate and we will be using request output to count the
+                # actual prompt len
+                prompt = self.apply_multimodal_chat_transformation(
+                    prompt, mm_content)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
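
For reference, an illustrative record shape for `lmarena-ai/vision-arena-bench-v0.1`, inferred from the fields `sample()` reads above; any keys beyond `turns`, `content`, and `images` are not visible in this diff.

```python
# Hypothetical vision-arena-bench record, showing only the fields the
# benchmark touches: the first turn's content and the first image.
example_item = {
    "turns": [[{"content": "What breed is the dog in this photo?"}]],
    "images": [None],  # a PIL.Image.Image in the real dataset
}
prompt = example_item["turns"][0][0]["content"]
image = example_item["images"][0]  # passed through process_image()
```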

0 commit comments
