Skip to content

Commit fa95264

Browse files
Merge branch 'main' into woosuk-eagle
2 parents a7f0600 + 239b7be commit fa95264

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1407
-763
lines changed

.buildkite/nightly-benchmarks/tests/serving-tests.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,12 @@
6363
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
6464
"disable_log_requests": "",
6565
"tensor_parallel_size": 4,
66-
"swap_space": 16,
67-
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
68-
"num_speculative_tokens": 4,
69-
"speculative_draft_tensor_parallel_size": 1
66+
"swap_space": 16,
67+
"speculative_config": {
68+
"model": "turboderp/Qwama-0.5B-Instruct",
69+
"num_speculative_tokens": 4,
70+
"draft_tensor_parallel_size": 1
71+
}
7072
},
7173
"client_parameters": {
7274
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ set(VLLM_EXT_SRC
234234
"csrc/activation_kernels.cu"
235235
"csrc/layernorm_kernels.cu"
236236
"csrc/layernorm_quant_kernels.cu"
237+
"csrc/cuda_view.cu"
237238
"csrc/quantization/gptq/q_gemm.cu"
238239
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
239240
"csrc/quantization/fp8/common.cu"

benchmarks/README.md

Lines changed: 134 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -41,51 +41,55 @@ become available.
4141
<td><code>synthetic</code></td>
4242
</tr>
4343
<tr>
44-
<td><strong>HuggingFace</strong></td>
45-
<td style="text-align: center;">🟡</td>
46-
<td style="text-align: center;">🟡</td>
47-
<td>Specify your dataset path on HuggingFace</td>
44+
<td><strong>HuggingFace-VisionArena</strong></td>
45+
<td style="text-align: center;">✅</td>
46+
<td style="text-align: center;">✅</td>
47+
<td><code>lmarena-ai/VisionArena-Chat</code></td>
48+
</tr>
49+
<tr>
50+
<td><strong>HuggingFace-InstructCoder</strong></td>
51+
<td style="text-align: center;">✅</td>
52+
<td style="text-align: center;">✅</td>
53+
<td><code>likaixin/InstructCoder</code></td>
4854
</tr>
4955
<tr>
50-
<td><strong>VisionArena</strong></td>
56+
<td><strong>HuggingFace-Other</strong></td>
5157
<td style="text-align: center;">✅</td>
5258
<td style="text-align: center;">✅</td>
53-
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
59+
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
5460
</tr>
5561
</tbody>
5662
</table>
5763

5864
✅: supported
5965

60-
🚧: to be supported
66+
🟡: partial support
6167

62-
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
63-
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
64-
If you need support for other dataset formats, please consider contributing.
68+
🚧: to be supported
6569

66-
**Note**: VisionArena’s `dataset-name` should be set to `hf`
70+
**Note**: For any HuggingFace dataset, `--dataset-name` should be set to `hf`
6771

6872
---
6973
## Example - Online Benchmark
7074

7175
First start serving your model
7276

7377
```bash
74-
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
75-
vllm serve ${MODEL_NAME} --disable-log-requests
78+
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
7679
```
7780

7881
Then run the benchmarking script
7982

8083
```bash
8184
# download dataset
8285
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
83-
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
84-
NUM_PROMPTS=10
85-
BACKEND="vllm"
86-
DATASET_NAME="sharegpt"
87-
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
88-
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
86+
python3 vllm/benchmarks/benchmark_serving.py \
87+
--backend vllm \
88+
--model NousResearch/Hermes-3-Llama-3.1-8B \
89+
--endpoint /v1/completions \
90+
--dataset-name sharegpt \
91+
--dataset-path "<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json" \
92+
--num-prompts 10
8993
```
9094

9195
If successful, you will see the following output
@@ -122,88 +126,76 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
122126
```
123127

124128
```bash
125-
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
126-
NUM_PROMPTS=10
127-
BACKEND="openai-chat"
128-
DATASET_NAME="hf"
129-
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
130-
DATASET_SPLIT='train'
131-
132129
python3 vllm/benchmarks/benchmark_serving.py \
133-
--backend "${BACKEND}" \
134-
--model "${MODEL_NAME}" \
135-
--endpoint "/v1/chat/completions" \
136-
--dataset-name "${DATASET_NAME}" \
137-
--dataset-path "${DATASET_PATH}" \
138-
--hf-split "${DATASET_SPLIT}" \
139-
--num-prompts "${NUM_PROMPTS}"
130+
--backend openai-chat \
131+
--model Qwen/Qwen2-VL-7B-Instruct \
132+
--endpoint /v1/chat/completions \
133+
--dataset-name hf \
134+
--dataset-path lmarena-ai/VisionArena-Chat \
135+
--hf-split train \
136+
--num-prompts 1000
140137
```
141138

142-
### HuggingFaceDataset Examples
139+
### InstructCoder Benchmark with Speculative Decoding
143140

144-
Currently, HuggingFaceDataset only supports dataset formats
145-
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
146-
formats, please consider contributing.
141+
``` bash
142+
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
143+
--speculative-model "[ngram]" \
144+
--ngram-prompt-lookup-min 2 \
145+
--ngram-prompt-lookup-max 5 \
146+
--num-speculative-tokens 5
147+
```
148+
149+
``` bash
150+
python3 vllm/benchmarks/benchmark_serving.py \
151+
--model meta-llama/Meta-Llama-3-8B-Instruct \
152+
--dataset-name hf \
153+
--dataset-path likaixin/InstructCoder \
154+
--num-prompts 2048
155+
```
156+
157+
### Other HuggingFaceDataset Examples
147158

148159
```bash
149-
# need a model with vision capability here
150160
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
151161
```
152162

153163
**`lmms-lab/LLaVA-OneVision-Data`**
154164

155165
```bash
156-
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
157-
NUM_PROMPTS=10
158-
BACKEND="openai-chat"
159-
DATASET_NAME="hf"
160-
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
161-
DATASET_SPLIT='train'
162-
DATASET_SUBSET='chart2text(cauldron)'
163166
python3 vllm/benchmarks/benchmark_serving.py \
164-
--backend "${BACKEND}" \
165-
--model "${MODEL_NAME}" \
166-
--endpoint "/v1/chat/completions" \
167-
--dataset-name "${DATASET_NAME}" \
168-
--dataset-path "${DATASET_PATH}" \
169-
--hf-split "${DATASET_SPLIT}" \
170-
--num-prompts "${NUM_PROMPTS}" \
171-
--hf-subset "${DATASET_SUBSET}"
167+
--backend openai-chat \
168+
--model Qwen/Qwen2-VL-7B-Instruct \
169+
--endpoint /v1/chat/completions \
170+
--dataset-name hf \
171+
--dataset-path lmms-lab/LLaVA-OneVision-Data \
172+
--hf-split train \
173+
--hf-subset "chart2text(cauldron)" \
174+
--num-prompts 10
172175
```
173176

174177
**`Aeala/ShareGPT_Vicuna_unfiltered`**
175178

176179
```bash
177-
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
178-
NUM_PROMPTS=10
179-
BACKEND="openai-chat"
180-
DATASET_NAME="hf"
181-
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
182-
DATASET_SPLIT='train'
183180
python3 vllm/benchmarks/benchmark_serving.py \
184-
--backend "${BACKEND}" \
185-
--model "${MODEL_NAME}" \
186-
--endpoint "/v1/chat/completions" \
187-
--dataset-name "${DATASET_NAME}" \
188-
--dataset-path "${DATASET_PATH}" \
189-
--hf-split "${DATASET_SPLIT}" \
190-
--num-prompts "${NUM_PROMPTS}" \
181+
--backend openai-chat \
182+
--model Qwen/Qwen2-VL-7B-Instruct \
183+
--endpoint /v1/chat/completions \
184+
--dataset-name hf \
185+
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
186+
--hf-split train \
187+
--num-prompts 10
191188
```
192189

193190
---
194191
## Example - Offline Throughput Benchmark
195192

196193
```bash
197-
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
198-
NUM_PROMPTS=10
199-
DATASET_NAME="sonnet"
200-
DATASET_PATH="vllm/benchmarks/sonnet.txt"
201-
202194
python3 vllm/benchmarks/benchmark_throughput.py \
203-
--model "${MODEL_NAME}" \
204-
--dataset-name "${DATASET_NAME}" \
205-
--dataset-path "${DATASET_PATH}" \
206-
--num-prompts "${NUM_PROMPTS}"
195+
--model NousResearch/Hermes-3-Llama-3.1-8B \
196+
--dataset-name sonnet \
197+
--dataset-path vllm/benchmarks/sonnet.txt \
198+
--num-prompts 10
207199
```
208200

209201
If successful, you will see the following output
@@ -217,19 +209,13 @@ Total num output tokens: 1500
217209
### VisionArena Benchmark for Vision Language Models
218210

219211
``` bash
220-
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
221-
NUM_PROMPTS=10
222-
DATASET_NAME="hf"
223-
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
224-
DATASET_SPLIT="train"
225-
226212
python3 vllm/benchmarks/benchmark_throughput.py \
227-
--model "${MODEL_NAME}" \
228-
--backend "vllm-chat" \
229-
--dataset-name "${DATASET_NAME}" \
230-
--dataset-path "${DATASET_PATH}" \
231-
--num-prompts "${NUM_PROMPTS}" \
232-
--hf-split "${DATASET_SPLIT}"
213+
--model Qwen/Qwen2-VL-7B-Instruct \
214+
--backend vllm-chat \
215+
--dataset-name hf \
216+
--dataset-path lmarena-ai/VisionArena-Chat \
217+
--num-prompts 1000 \
218+
--hf-split train
233219
```
234220

235221
The `num prompt tokens` now includes image token counts
@@ -240,29 +226,71 @@ Total num prompt tokens: 14527
240226
Total num output tokens: 1280
241227
```
242228

229+
### InstructCoder Benchmark with Speculative Decoding
230+
231+
``` bash
232+
VLLM_WORKER_MULTIPROC_METHOD=spawn \
233+
VLLM_USE_V1=1 \
234+
python3 vllm/benchmarks/benchmark_throughput.py \
235+
--dataset-name=hf \
236+
--dataset-path=likaixin/InstructCoder \
237+
--model=meta-llama/Meta-Llama-3-8B-Instruct \
238+
--input-len=1000 \
239+
--output-len=100 \
240+
--num-prompts=2048 \
241+
--async-engine \
242+
--speculative-model="[ngram]" \
243+
--ngram-prompt-lookup-min=2 \
244+
--ngram-prompt-lookup-max=5 \
245+
--num-speculative-tokens=5
246+
```
247+
248+
```
249+
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
250+
Total num prompt tokens: 261136
251+
Total num output tokens: 204800
252+
```
253+
254+
### Other HuggingFaceDataset Examples
255+
256+
**`lmms-lab/LLaVA-OneVision-Data`**
257+
258+
```bash
259+
python3 vllm/benchmarks/benchmark_throughput.py \
260+
--model Qwen/Qwen2-VL-7B-Instruct \
261+
--backend vllm-chat \
262+
--dataset-name hf \
263+
--dataset-path lmms-lab/LLaVA-OneVision-Data \
264+
--hf-split train \
265+
--hf-subset "chart2text(cauldron)" \
266+
--num-prompts 10
267+
```
268+
269+
**`Aeala/ShareGPT_Vicuna_unfiltered`**
270+
271+
```bash
272+
python3 vllm/benchmarks/benchmark_throughput.py \
273+
--model Qwen/Qwen2-VL-7B-Instruct \
274+
--backend vllm-chat \
275+
--dataset-name hf \
276+
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
277+
--hf-split train \
278+
--num-prompts 10
279+
```
280+
243281
### Benchmark with LoRA Adapters
244282

245283
``` bash
246284
# download dataset
247285
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
248-
MODEL_NAME="meta-llama/Llama-2-7b-hf"
249-
BACKEND="vllm"
250-
DATASET_NAME="sharegpt"
251-
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
252-
NUM_PROMPTS=10
253-
MAX_LORAS=2
254-
MAX_LORA_RANK=8
255-
ENABLE_LORA="--enable-lora"
256-
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
257-
258286
python3 vllm/benchmarks/benchmark_throughput.py \
259-
--model "${MODEL_NAME}" \
260-
--backend "${BACKEND}" \
261-
--dataset_path "${DATASET_PATH}" \
262-
--dataset_name "${DATASET_NAME}" \
263-
--num-prompts "${NUM_PROMPTS}" \
264-
--max-loras "${MAX_LORAS}" \
265-
--max-lora-rank "${MAX_LORA_RANK}" \
266-
${ENABLE_LORA} \
267-
--lora-path "${LORA_PATH}"
287+
--model meta-llama/Llama-2-7b-hf \
288+
--backend vllm \
289+
--dataset-path "<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json" \
290+
--dataset-name sharegpt \
291+
--num-prompts 10 \
292+
--max-loras 2 \
293+
--max-lora-rank 8 \
294+
--enable-lora \
295+
--lora-path yard1/llama-2-7b-sql-lora-test
268296
```

0 commit comments

Comments
 (0)