
Commit e294861

Merge remote-tracking branch 'upstream/main' into upstream_merge_2025_03_26
2 parents: a4dba75 + 037bcd9

138 files changed: +4953 additions, -1649 deletions


.buildkite/run-tpu-v1-test.sh

Lines changed: 4 additions & 6 deletions
@@ -28,16 +28,14 @@ docker run --privileged --net host --shm-size=16G -it \
     && echo TEST_3 \
     && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
     && echo TEST_4 \
-    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
+    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
     && echo TEST_5 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
     && echo TEST_6 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
+    && echo TEST_7 \
     && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
 
 
 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
-
-# TODO: Re-enable this after fixing recompilation in quantization.
-# && echo TEST_4 \
-# && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
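The reordered script keeps the pattern of announcing each step with a numbered `echo` before running it, so the last `TEST_N` marker printed in the build log identifies the failing step. A minimal sketch of that pattern, with an illustrative image name (the real pipeline wires up the full vLLM TPU image and all seven steps):

```bash
# Sketch of the run-tpu-v1-test.sh chaining pattern; "vllm-tpu-image" is
# an illustrative placeholder, and only two of the seven steps are shown.
docker run --privileged --net host --shm-size=16G -it vllm-tpu-image \
  bash -c "echo TEST_1 \
  && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
  && echo TEST_2 \
  && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
```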

.buildkite/test-pipeline.yaml

Lines changed: 4 additions & 3 deletions
@@ -153,8 +153,8 @@ steps:
     # TODO: create a dedicated test section for multi-GPU example tests
     # when we have multiple distributed example tests
     - pushd ../examples/offline_inference
-    - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
-    - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+    - python3 rlhf.py
+    - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
     - popd
 
 - label: Metrics, Tracing Test # 10min
@@ -443,6 +443,7 @@ steps:
     - pytest -v -s models/encoder_decoder/audio_language -m core_model
     - pytest -v -s models/encoder_decoder/language -m core_model
     - pytest -v -s models/encoder_decoder/vision_language -m core_model
+    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
 
 - label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
@@ -532,7 +533,7 @@ steps:
     - vllm/v1/engine/
   commands:
     - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-    - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
+    - pytest -v -s entrypoints/llm/test_collective_rpc.py
     - pytest -v -s ./compile/test_basic_correctness.py
     - pytest -v -s ./compile/test_wrapper.py
     - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
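For local reproduction, the updated commands can be run directly; a hedged sketch assuming a vLLM checkout with its `tests/` directory and sufficient GPUs:

```bash
# Reproduce the updated pipeline steps outside Buildkite.
# Assumes the working directory is the vllm repository root.
cd tests
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
# After this commit the VLLM_ENABLE_V1_MULTIPROCESSING=0 prefix is dropped:
pytest -v -s entrypoints/llm/test_collective_rpc.py
```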

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -235,6 +235,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
+  "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"

benchmarks/README.md

Lines changed: 134 additions & 106 deletions
@@ -41,51 +41,55 @@ become available.
         <td><code>synthetic</code></td>
     </tr>
     <tr>
-        <td><strong>HuggingFace</strong></td>
-        <td style="text-align: center;">🟡</td>
-        <td style="text-align: center;">🟡</td>
-        <td>Specify your dataset path on HuggingFace</td>
+        <td><strong>HuggingFace-VisionArena</strong></td>
+        <td style="text-align: center;">✅</td>
+        <td style="text-align: center;">✅</td>
+        <td><code>lmarena-ai/VisionArena-Chat</code></td>
+    </tr>
+    <tr>
+        <td><strong>HuggingFace-InstructCoder</strong></td>
+        <td style="text-align: center;">✅</td>
+        <td style="text-align: center;">✅</td>
+        <td><code>likaixin/InstructCoder</code></td>
     </tr>
     <tr>
-        <td><strong>VisionArena</strong></td>
+        <td><strong>HuggingFace-Other</strong></td>
         <td style="text-align: center;">✅</td>
         <td style="text-align: center;">✅</td>
-        <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
+        <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
     </tr>
     </tbody>
 </table>
 
 ✅: supported
 
-🚧: to be supported
+🟡: Partial support
 
-🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
-similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
-If you need support for other dataset formats, please consider contributing.
+🚧: to be supported
 
-**Note**: VisionArena’s `dataset-name` should be set to `hf`
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
 
 ---
 ## Example - Online Benchmark
 
 First start serving your model
 
 ```bash
-MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
-vllm serve ${MODEL_NAME} --disable-log-requests
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
 ```
 
 Then run the benchmarking script
 
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
-NUM_PROMPTS=10
-BACKEND="vllm"
-DATASET_NAME="sharegpt"
-DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --num-prompts 10
 ```
 
 If successful, you will see the following output
@@ -122,88 +126,76 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 
 ```bash
-MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
-NUM_PROMPTS=10
-BACKEND="openai-chat"
-DATASET_NAME="hf"
-DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
-DATASET_SPLIT='train'
-
 python3 vllm/benchmarks/benchmark_serving.py \
-  --backend "${BACKEND}" \
-  --model "${MODEL_NAME}" \
-  --endpoint "/v1/chat/completions" \
-  --dataset-name "${DATASET_NAME}" \
-  --dataset-path "${DATASET_PATH}" \
-  --hf-split "${DATASET_SPLIT}" \
-  --num-prompts "${NUM_PROMPTS}"
+  --backend openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat \
+  --hf-split train \
+  --num-prompts 1000
 ```
 
-### HuggingFaceDataset Examples
+### InstructCoder Benchmark with Speculative Decoding
 
-Currently, HuggingFaceDataset only supports dataset formats
-similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
-formats, please consider contributing.
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+  --speculative-model "[ngram]" \
+  --ngram_prompt_lookup_min 2 \
+  --ngram-prompt-lookup-max 5 \
+  --num_speculative_tokens 5
+```
+
+``` bash
+python3 benchmarks/benchmark_serving.py \
+  --model meta-llama/Meta-Llama-3-8B-Instruct \
+  --dataset-name hf \
+  --dataset-path likaixin/InstructCoder \
+  --num-prompts 2048
+```
+
+### Other HuggingFaceDataset Examples
 
 ```bash
-# need a model with vision capability here
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 
 **`lmms-lab/LLaVA-OneVision-Data`**
 
 ```bash
-MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
-NUM_PROMPTS=10
-BACKEND="openai-chat"
-DATASET_NAME="hf"
-DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
-DATASET_SPLIT='train'
-DATASET_SUBSET='chart2text(cauldron)'
 python3 vllm/benchmarks/benchmark_serving.py \
-  --backend "${BACKEND}" \
-  --model "${MODEL_NAME}" \
-  --endpoint "/v1/chat/completions" \
-  --dataset-name "${DATASET_NAME}" \
-  --dataset-path "${DATASET_PATH}" \
-  --hf-split "${DATASET_SPLIT}" \
-  --num-prompts "${NUM_PROMPTS}" \
-  --hf-subset "${DATASET_SUBSET}"
+  --backend openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path lmms-lab/LLaVA-OneVision-Data \
+  --hf-split train \
+  --hf-subset "chart2text(cauldron)" \
+  --num-prompts 10
 ```
 
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 
 ```bash
-MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
-NUM_PROMPTS=10
-BACKEND="openai-chat"
-DATASET_NAME="hf"
-DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
-DATASET_SPLIT='train'
 python3 vllm/benchmarks/benchmark_serving.py \
-  --backend "${BACKEND}" \
-  --model "${MODEL_NAME}" \
-  --endpoint "/v1/chat/completions" \
-  --dataset-name "${DATASET_NAME}" \
-  --dataset-path "${DATASET_PATH}" \
-  --hf-split "${DATASET_SPLIT}" \
-  --num-prompts "${NUM_PROMPTS}" \
+  --backend openai-chat \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --endpoint /v1/chat/completions \
+  --dataset-name hf \
+  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
+  --hf-split train \
+  --num-prompts 10
 ```
 
 ---
 ## Example - Offline Throughput Benchmark
 
 ```bash
-MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
-NUM_PROMPTS=10
-DATASET_NAME="sonnet"
-DATASET_PATH="vllm/benchmarks/sonnet.txt"
-
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model "${MODEL_NAME}" \
-  --dataset-name "${DATASET_NAME}" \
-  --dataset-path "${DATASET_PATH}" \
-  --num-prompts "${NUM_PROMPTS}"
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset-name sonnet \
+  --dataset-path vllm/benchmarks/sonnet.txt \
+  --num-prompts 10
 ```
 
 If successful, you will see the following output
@@ -217,19 +209,13 @@ Total num output tokens: 1500
 ### VisionArena Benchmark for Vision Language Models
 
 ``` bash
-MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
-NUM_PROMPTS=10
-DATASET_NAME="hf"
-DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
-DATASET_SPLIT="train"
-
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model "${MODEL_NAME}" \
-  --backend "vllm-chat" \
-  --dataset-name "${DATASET_NAME}" \
-  --dataset-path "${DATASET_PATH}" \
-  --num-prompts "${NUM_PROMPTS}" \
-  --hf-split "${DATASET_SPLIT}"
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path lmarena-ai/VisionArena-Chat \
+  --num-prompts 1000 \
+  --hf-split train
 ```
 
 The `num prompt tokens` now includes image token counts
@@ -240,29 +226,71 @@ Total num prompt tokens: 14527
 Total num output tokens: 1280
 ```
 
+### InstructCoder Benchmark with Speculative Decoding
+
+``` bash
+VLLM_WORKER_MULTIPROC_METHOD=spawn \
+VLLM_USE_V1=1 \
+python3 vllm/benchmarks/benchmark_throughput.py \
+  --dataset-name=hf \
+  --dataset-path=likaixin/InstructCoder \
+  --model=meta-llama/Meta-Llama-3-8B-Instruct \
+  --input-len=1000 \
+  --output-len=100 \
+  --num-prompts=2048 \
+  --async-engine \
+  --speculative-model="[ngram]" \
+  --ngram_prompt_lookup_min=2 \
+  --ngram-prompt-lookup-max=5 \
+  --num_speculative_tokens=5
+```
+
+```
+Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
+Total num prompt tokens: 261136
+Total num output tokens: 204800
+```
+
+### Other HuggingFaceDataset Examples
+
+**`lmms-lab/LLaVA-OneVision-Data`**
+
+```bash
+python3 vllm/benchmarks/benchmark_throughput.py \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path lmms-lab/LLaVA-OneVision-Data \
+  --hf-split train \
+  --hf-subset "chart2text(cauldron)" \
+  --num-prompts 10
+```
+
+**`Aeala/ShareGPT_Vicuna_unfiltered`**
+
+```bash
+python3 vllm/benchmarks/benchmark_throughput.py \
+  --model Qwen/Qwen2-VL-7B-Instruct \
+  --backend vllm-chat \
+  --dataset-name hf \
+  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
+  --hf-split train \
+  --num-prompts 10
+```
+
 ### Benchmark with LoRA Adapters
 
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-MODEL_NAME="meta-llama/Llama-2-7b-hf"
-BACKEND="vllm"
-DATASET_NAME="sharegpt"
-DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
-NUM_PROMPTS=10
-MAX_LORAS=2
-MAX_LORA_RANK=8
-ENABLE_LORA="--enable-lora"
-LORA_PATH="yard1/llama-2-7b-sql-lora-test"
-
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model "${MODEL_NAME}" \
-  --backend "${BACKEND}" \
-  --dataset_path "${DATASET_PATH}" \
-  --dataset_name "${DATASET_NAME}" \
-  --num-prompts "${NUM_PROMPTS}" \
-  --max-loras "${MAX_LORAS}" \
-  --max-lora-rank "${MAX_LORA_RANK}" \
-  ${ENABLE_LORA} \
-  --lora-path "${LORA_PATH}"
+  --model meta-llama/Llama-2-7b-hf \
+  --backend vllm \
+  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --dataset_name sharegpt \
+  --num-prompts 10 \
+  --max-loras 2 \
+  --max-lora-rank 8 \
+  --enable-lora \
+  --lora-path yard1/llama-2-7b-sql-lora-test
 ```
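As a practical companion to the README examples above, a hedged sketch that gates the serving benchmark on server readiness; port 8000 and the `/v1/models` route are the vLLM OpenAI-compatible server's defaults, so adjust them if you changed `--port`:

```bash
# Start the server, wait until it responds, then run the benchmark.
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests &
until curl -sf http://localhost:8000/v1/models > /dev/null; do
  sleep 5  # poll until the OpenAI-compatible endpoint is up
done
python3 vllm/benchmarks/benchmark_serving.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 10
```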
