diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index 68aff793ae6a..77ee313687fc 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -5,11 +5,11 @@
import sys
import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
def print_top_10_largest_files(zip_file):
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
index 7045d8810493..bbed80ebe847 100644
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -8,7 +8,8 @@
Links for vLLM
- {wheel}
+ {x86_wheel}
+ {arm_wheel}
"""
@@ -21,7 +22,25 @@
with open("index.html", "w") as f:
print(f"Generated index.html for {args.wheel}")
+ # sync the abi tag with .buildkite/scripts/upload-wheels.sh
+ if "x86_64" in filename:
+ x86_wheel = filename
+ arm_wheel = filename.replace("x86_64", "aarch64").replace(
+ "manylinux1", "manylinux2014"
+ )
+ elif "aarch64" in filename:
+ x86_wheel = filename.replace("aarch64", "x86_64").replace(
+ "manylinux2014", "manylinux1"
+ )
+ arm_wheel = filename
+ else:
+ raise ValueError(f"Unsupported wheel: {filename}")
# cloudfront requires escaping the '+' character
f.write(
- template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+ template.format(
+ x86_wheel=x86_wheel,
+ x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+ arm_wheel=arm_wheel,
+ arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+ )
)
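For reference, the x86_64/aarch64 name mapping added to `generate_index.py` behaves as in the following minimal sketch; the wheel filename is a made-up example, not one produced by this pipeline.

```python
# Sketch of the x86_64 <-> aarch64 wheel-name mapping; the filename is illustrative.
filename = "vllm-0.10.0-cp38-abi3-manylinux1_x86_64.whl"

if "x86_64" in filename:
    x86_wheel = filename
    arm_wheel = filename.replace("x86_64", "aarch64").replace(
        "manylinux1", "manylinux2014"
    )
elif "aarch64" in filename:
    x86_wheel = filename.replace("aarch64", "x86_64").replace(
        "manylinux2014", "manylinux1"
    )
    arm_wheel = filename
else:
    raise ValueError(f"Unsupported wheel: {filename}")

print(x86_wheel)  # vllm-0.10.0-cp38-abi3-manylinux1_x86_64.whl
print(arm_wheel)  # vllm-0.10.0-cp38-abi3-manylinux2014_aarch64.whl
```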
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
new file mode 100644
index 000000000000..ccb4f84201b7
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -0,0 +1,12 @@
+# For vllm-vlm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+ metrics:
+ - name: "relaxed_accuracy,none"
+ # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+ value: 0.80
+limit: 100
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
new file mode 100644
index 000000000000..46f1a9fbf6ff
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1,10 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+ metrics:
+ - name: "exact_match,custom-extract"
+ value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
index a2f235f48581..aa4fb9fa03d6 100644
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
new file mode 100644
index 000000000000..5f3c31743e75
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+ metrics:
+ - name: "relaxed_accuracy,none"
+ value: 0.855
+limit: 2500
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
new file mode 100644
index 000000000000..4fb0b84bc4d8
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
index 27a1a9a82bd3..37eeac85c933 100644
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
new file mode 100644
index 000000000000..91e22b6459c1
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
new file mode 100644
index 000000000000..1097d220245f
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
\ No newline at end of file
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
new file mode 100755
index 000000000000..c8db951381b0
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install lm-eval==0.4.9
+
+usage() {
+ echo
+ echo "Runs lm eval harness on ChartQA using multimodal vllm."
+ echo "This pathway is intended to be used to create baselines for "
+ echo "our correctness tests in vllm's CI."
+ echo
+ echo "usage: ${0} "
+ echo
+ echo " -m - huggingface stub or local directory of the model"
+ echo " -l - limit number of samples to run"
+ echo " -t - tensor parallel size to run at"
+ echo
+}
+
+while getopts "m:l:t:" OPT; do
+ case ${OPT} in
+ m )
+ MODEL="$OPTARG"
+ ;;
+ l )
+ LIMIT="$OPTARG"
+ ;;
+ t )
+ TP_SIZE="$OPTARG"
+ ;;
+ \? )
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+lm_eval --model vllm-vlm \
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+ --tasks chartqa \
+ --batch_size auto \
+ --apply_chat_template \
+ --limit $LIMIT
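The `lm_eval` CLI call above maps onto roughly the following Python call, a sketch using the same `lm_eval.simple_evaluate` arguments that `test_lm_eval_correctness.py` passes; the model name, limit and TP size are placeholder values standing in for the script's `-m`, `-l` and `-t` options.

```python
# Rough Python equivalent of the CLI invocation above (placeholder values for
# the -m/-l/-t script options).
import lm_eval

MODEL, TP_SIZE, LIMIT = "Qwen/Qwen2.5-VL-7B-Instruct", 1, 100

results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args=f"pretrained={MODEL},tensor_parallel_size={TP_SIZE}",
    tasks=["chartqa"],
    batch_size="auto",
    apply_chat_template=True,
    limit=LIMIT,
)
```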
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
old mode 100644
new mode 100755
index a67fc89d54e6..897f84d1e360
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.4
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index b98d42aa7b82..792f355c47a5 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.4
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
new file mode 100644
index 000000000000..d85a1721db9a
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLU Pro for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+ echo
+ echo "Runs lm eval harness on MMLU Pro using vllm."
+ echo "This pathway is intended to be used to create baselines for "
+ echo "our automated nm-test-accuracy workflow"
+ echo
+ echo "usage: ${0} "
+ echo
+ echo " -m - huggingface stub or local directory of the model"
+ echo " -l - limit number of samples to run"
+ echo " -f - number of fewshot samples to use"
+ echo " -t - tensor parallel size to run at"
+ echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+ case ${OPT} in
+ m )
+ MODEL="$OPTARG"
+ ;;
+ b )
+ BATCH_SIZE="$OPTARG"
+ ;;
+ l )
+ LIMIT="$OPTARG"
+ ;;
+ f )
+ FEWSHOT="$OPTARG"
+ ;;
+ t )
+ TP_SIZE="$OPTARG"
+ ;;
+ \? )
+ usage
+ exit 1
+ ;;
+ esac
+done
+
+lm_eval --model vllm \
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+ --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+ --batch_size auto
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index ceea01166b7f..f10de82b1d8e 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -19,21 +19,27 @@
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
+ batch_size = eval_config.get("batch_size", "auto")
+ backend = eval_config.get("backend", "vllm")
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code},"
- f"max_model_len={max_model_len}"
+ f"max_model_len={max_model_len},"
)
results = lm_eval.simple_evaluate(
- model="vllm",
+ model=backend,
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
- batch_size="auto",
+ # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
+ # text models. However, this regresses the measured strict-match for
+ # existing text models in CI, so only apply it for multimodal models.
+ apply_chat_template=backend == "vllm-vlm",
+ batch_size=batch_size,
)
return results
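For illustration, here is a minimal `eval_config` sketch that exercises the new optional `backend` and `batch_size` keys read by `launch_lm_eval()`; the values mirror `Qwen2.5-VL-7B-Instruct.yaml` above, and the call assumes this test module's function is in scope.

```python
# Minimal eval_config exercising the new optional keys; values mirror
# Qwen2.5-VL-7B-Instruct.yaml. backend and batch_size may be omitted, in which
# case launch_lm_eval() falls back to "vllm" and "auto".
eval_config = {
    "model_name": "Qwen/Qwen2.5-VL-7B-Instruct",
    "backend": "vllm-vlm",
    "batch_size": "auto",
    "tasks": [
        {
            "name": "chartqa",
            "metrics": [{"name": "relaxed_accuracy,none", "value": 0.855}],
        }
    ],
    "limit": 2500,
    "num_fewshot": 0,
}
results = launch_lm_eval(eval_config, tp_size=1)
```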
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index cdf6a645147e..e6f5c8b60f45 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and the [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results.
## Performance benchmark quick overview
@@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
## Trigger the benchmark
Performance benchmark will be triggered when:
+
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
@@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```
Runtime environment variables:
+
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
@@ -46,12 +48,14 @@ Runtime environment variables:
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
Nightly benchmark will be triggered when:
+
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+>
### Latency test
Here is an example of one test inside `latency-tests.json`:
@@ -74,7 +78,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example:
- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
+- The `parameters` attribute controls the command line arguments used for `vllm bench latency`. Note that you should use an underscore `_` instead of a dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
@@ -82,13 +86,13 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Throughput test
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
+The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters will be fed forward to `vllm bench throughput`.
The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.
### Serving test
-We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
+We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```json
[
@@ -100,7 +104,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
- "disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -118,8 +121,8 @@ Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
-- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
-- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
+- The `client-parameters` includes the command line arguments for `vllm bench serve`.
+- The `qps_list` controls the list of QPS values for the test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`.
The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.
@@ -135,27 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+`compare-json-results.py` compares two `benchmark_results.json` files and reports the performance ratio, e.g. for Output Tput, Median TTFT and Median TPOT.
+If only one `benchmark_results.json` is passed, `compare-json-results.py` compares the different TP and PP configurations within that file instead.
-Here is an example using the script to compare result_a and result_b without detail test name.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
+Here is an example using the script to compare result_a and result_b, keyed by Model, Dataset Name, input/output length, max concurrency and qps.
+`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
-|----|----------------------------------------|----------------------------------------|----------|
-| 0 | 142.633982 | 156.526018 | 1.097396 |
-| 1 | 241.620334 | 294.018783 | 1.216863 |
-| 2 | 218.298905 | 262.664916 | 1.203235 |
-| 3 | 242.743860 | 299.816190 | 1.235113 |
+| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
+|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
+| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
+| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
-Here is an example using the script to compare result_a and result_b with detail test name.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
-| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
-|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
-| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
-| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
-| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
-| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
-| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
+A comparison diagram will be generated below the table.
+Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3.
+
## Nightly test details
@@ -164,9 +160,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
-- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
-- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
+- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
+- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
+- Finally, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and upload them to Buildkite.
### Nightly tests
@@ -176,6 +172,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
+WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md
index ef11c040057c..466def07b6f1 100644
--- a/.buildkite/nightly-benchmarks/nightly-annotation.md
+++ b/.buildkite/nightly-benchmarks/nightly-annotation.md
@@ -1,3 +1,4 @@
+# Nightly benchmark annotation
## Description
@@ -13,15 +14,15 @@ Please download the visualization scripts in the post
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- - Download `nightly-benchmarks.zip`.
- - In the same folder, run the following code:
-
- ```bash
- export HF_TOKEN=
- apt update
- apt install -y git
- unzip nightly-benchmarks.zip
- VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
- ```
+ - Download `nightly-benchmarks.zip`.
+ - In the same folder, run the following code:
+
+ ```bash
+ export HF_TOKEN=
+ apt update
+ apt install -y git
+ unzip nightly-benchmarks.zip
+ VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+ ```
And the results will be inside `./benchmarks/results`.
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 5f003f42f07c..2ef36089b6af 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -8,30 +8,30 @@ This benchmark aims to:
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
- Docker images:
- - vLLM: `vllm/vllm-openai:v0.6.2`
- - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
+ - vLLM: `vllm/vllm-openai:v0.6.2`
+ - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
+ - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
+ - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
+ - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+ - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- - 8x Nvidia A100 GPUs
+ - 8x Nvidia A100 GPUs
- Workload:
- - Dataset
- - ShareGPT dataset
- - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- - Models: llama-3 8B, llama-3 70B.
- - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
+ - Dataset
+ - ShareGPT dataset
+ - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
+ - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
+ - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
+ - Models: llama-3 8B, llama-3 70B.
+ - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
+ - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
+ - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
+ - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
## Known issues
diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
index a1f8441ccdac..8bb16bd3cf37 100644
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@@ -1,3 +1,4 @@
+# Performance benchmarks descriptions
## Latency tests
diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
index 20c106234935..5ea5a50a258a 100644
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@@ -1,33 +1,202 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
+import json
+import os
+from importlib import util
import pandas as pd
+plotly_found = util.find_spec("plotly.express") is not None
+
def compare_data_columns(
- files, name_column, data_column, drop_column, ignore_test_name=False
+ files, name_column, data_column, info_cols, drop_column, debug=False
):
- print("\ncompare_data_column: " + data_column)
+ """
+ Align concatenation by keys derived from info_cols instead of row order.
+ - Pick one canonical key list: subset of info_cols present in ALL files.
+ - For each file: set index to those keys, aggregate duplicates
+   (mean for metric, first for names).
+ - Concat along axis=1 (indexes align), then reset_index so callers can
+   group by columns.
+ - If --debug, add a _name column per file.
+ """
+ print("\ncompare_data_column:", data_column)
+
frames = []
+ raw_data_cols = []
compare_frames = []
+
+ # 1) choose a canonical key list from info_cols that exists in ALL files
+ cols_per_file = []
+ for f in files:
+ try:
+ df_tmp = pd.read_json(f, orient="records")
+ except Exception as err:
+ raise ValueError(f"Failed to read {f}") from err
+ cols_per_file.append(set(df_tmp.columns))
+
+ key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
+ if not key_cols:
+ # soft fallback: use any info_cols present in the first file
+ key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
+ if not key_cols:
+ raise ValueError(
+ "No common key columns found from info_cols across the input files."
+ )
+
+ # 2) build a single "meta" block (keys as columns) once, aligned by the key index
+ meta_added = False
+
for file in files:
- data_df = pd.read_json(file)
- serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
- if ignore_test_name is False:
- serving_df = serving_df.rename(columns={name_column: file + "_name"})
- frames.append(serving_df[file + "_name"])
- serving_df = serving_df.rename(columns={data_column: file})
- frames.append(serving_df[file])
- compare_frames.append(serving_df[file])
+ df = pd.read_json(file, orient="records")
+
+ # Keep rows that actually have the compared metric (same as original behavior)
+ if drop_column in df.columns:
+ df = df.dropna(subset=[drop_column], ignore_index=True)
+
+ # Stabilize numeric key columns (harmless if missing)
+ for c in (
+ "Input Len",
+ "Output Len",
+ "TP Size",
+ "PP Size",
+ "# of max concurrency.",
+ "qps",
+ ):
+ if c in df.columns:
+ df[c] = pd.to_numeric(df[c], errors="coerce")
+
+ # Ensure all key columns exist
+ for c in key_cols:
+ if c not in df.columns:
+ df[c] = pd.NA
+
+ # Set index = key_cols and aggregate duplicates → unique MultiIndex
+ df_idx = df.set_index(key_cols, drop=False)
+
+ # meta (key columns), unique per key
+ meta = df_idx[key_cols]
+ if not meta.index.is_unique:
+ meta = meta.groupby(level=key_cols, dropna=False).first()
+
+ # metric series for this file, aggregated to one row per key
+ file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
+ s = df_idx[data_column]
+ if not s.index.is_unique:
+ s = s.groupby(level=key_cols, dropna=False).mean()
+ s.name = file_label # column label like original
+
+ # add meta once (from first file) so keys are the leftmost columns
+ if not meta_added:
+ frames.append(meta)
+ meta_added = True
+
+ # (NEW) debug: aligned test-name column per file
+ if debug and name_column in df_idx.columns:
+ name_s = df_idx[name_column]
+ if not name_s.index.is_unique:
+ name_s = name_s.groupby(level=key_cols, dropna=False).first()
+ name_s.name = f"{file_label}_name"
+ frames.append(name_s)
+
+ frames.append(s)
+ raw_data_cols.append(file_label)
+ compare_frames.append(s)
+
+ # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
if len(compare_frames) >= 2:
- # Compare numbers among two files
- ratio_df = compare_frames[1] / compare_frames[0]
- frames.append(ratio_df)
- compare_frames.pop(1)
+ base = compare_frames[0]
+ current = compare_frames[-1]
+ ratio = current / base
+ ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
+ ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+ frames.append(ratio)
+ # 4) concat on columns with aligned MultiIndex;
+ # then reset_index to return keys as columns
concat_df = pd.concat(frames, axis=1)
- return concat_df
+ concat_df = concat_df.reset_index(drop=True).reset_index()
+ if "index" in concat_df.columns:
+ concat_df = concat_df.drop(columns=["index"])
+
+ # Ensure key/info columns appear first (in your info_cols order)
+ front = [c for c in info_cols if c in concat_df.columns]
+ rest = [c for c in concat_df.columns if c not in front]
+ concat_df = concat_df[front + rest]
+
+ print(raw_data_cols)
+ return concat_df, raw_data_cols
+
+
+def split_json_by_tp_pp(
+ input_file: str = "benchmark_results.json", output_root: str = "."
+) -> list[str]:
+ """
+ Split a benchmark JSON into separate folders by (TP Size, PP Size).
+
+ Creates: /tp{TP}_pp{PP}/benchmark_results.json
+ Returns: list of file paths written.
+ """
+ # Load JSON data into DataFrame
+ with open(input_file, encoding="utf-8") as f:
+ data = json.load(f)
+
+ # If the JSON is a dict with a list under common keys, use that list
+ if isinstance(data, dict):
+ for key in ("results", "serving_results", "benchmarks", "data"):
+ if isinstance(data.get(key), list):
+ data = data[key]
+ break
+
+ df = pd.DataFrame(data)
+
+ # Keep only "serving" tests
+ name_col = next(
+ (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
+ )
+ if name_col:
+ df = df[
+ df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
+ ].copy()
+
+ # Handle alias column names
+ rename_map = {
+ "tp_size": "TP Size",
+ "tensor_parallel_size": "TP Size",
+ "pp_size": "PP Size",
+ "pipeline_parallel_size": "PP Size",
+ }
+ df.rename(
+ columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
+ )
+
+ # Ensure TP/PP columns exist (default to 1 if missing)
+ if "TP Size" not in df.columns:
+ df["TP Size"] = 1
+ if "PP Size" not in df.columns:
+ df["PP Size"] = 1
+
+ # make sure TP/PP are numeric ints with no NaN
+ df["TP Size"] = (
+ pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
+ )
+ df["PP Size"] = (
+ pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
+ )
+
+ # Split into separate folders
+ saved_paths: list[str] = []
+ for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
+ folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
+ os.makedirs(folder_name, exist_ok=True)
+ filepath = os.path.join(folder_name, "benchmark_results.json")
+ group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
+ print(f"Saved: {filepath}")
+ saved_paths.append(filepath)
+
+ return saved_paths
if __name__ == "__main__":
@@ -36,31 +205,103 @@ def compare_data_columns(
"-f", "--file", action="append", type=str, help="input file name"
)
parser.add_argument(
- "--ignore_test_name", action="store_true", help="ignore_test_name or not"
+ "--debug", action="store_true", help="show all information for debugging"
+ )
+ parser.add_argument(
+ "--plot",
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help="plot perf diagrams or not --no-plot --plot",
+ )
+ parser.add_argument(
+ "-x",
+ "--xaxis",
+ type=str,
+ default="# of max concurrency.",
+ help="column name to use as X Axis in comparison graph",
)
args = parser.parse_args()
- files = args.file
- print("comparing : " + ", ".join(files))
drop_column = "P99"
name_column = "Test name"
+ info_cols = [
+ "Model",
+ "Dataset Name",
+ "Input Len",
+ "Output Len",
+ "TP Size",
+ "PP Size",
+ "# of max concurrency.",
+ "qps",
+ ]
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
- ignore_test_name = args.ignore_test_name
+
+ if len(args.file) == 1:
+ files = split_json_by_tp_pp(args.file[0], output_root="splits")
+ info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
+ else:
+ files = args.file
+ print("comparing : " + ", ".join(files))
+ debug = args.debug
+ plot = args.plot
+ # For the plot feature, pick which info_cols column (set via --xaxis) to plot against
+ y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
with open("perf_comparison.html", "w") as text_file:
for i in range(len(data_cols_to_compare)):
- output_df = compare_data_columns(
+ output_df, raw_data_cols = compare_data_columns(
files,
name_column,
data_cols_to_compare[i],
+ info_cols,
drop_column,
- ignore_test_name=ignore_test_name,
+ debug=debug,
)
- print(output_df)
- html = output_df.to_html()
- text_file.write(html_msgs_for_data_cols[i])
- text_file.write(html)
+
+ # For the plot feature, prepend the chosen axis column from info_cols
+ raw_data_cols.insert(0, info_cols[y_axis_index])
+
+ filtered_info_cols = info_cols[:-2]
+ existing_group_cols = [
+ c for c in filtered_info_cols if c in output_df.columns
+ ]
+ if not existing_group_cols:
+ raise ValueError(
+ f"No valid group-by columns "
+ f"Expected subset: {filtered_info_cols}, "
+ f"but DataFrame has: {list(output_df.columns)}"
+ )
+ output_df_sorted = output_df.sort_values(by=existing_group_cols)
+ output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
+ for name, group in output_groups:
+ html = group.to_html()
+ text_file.write(html_msgs_for_data_cols[i])
+ text_file.write(html)
+
+ if plot and plotly_found:
+ import plotly.express as px
+
+ df = group[raw_data_cols]
+ df_sorted = df.sort_values(by=info_cols[y_axis_index])
+ # Melt DataFrame for plotting
+ df_melted = df_sorted.melt(
+ id_vars=info_cols[y_axis_index],
+ var_name="Configuration",
+ value_name=data_cols_to_compare[i],
+ )
+ title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+ # Create Plotly line chart
+ fig = px.line(
+ df_melted,
+ x=info_cols[y_axis_index],
+ y=data_cols_to_compare[i],
+ color="Configuration",
+ title=title,
+ markers=True,
+ )
+ # Export to HTML
+ text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
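As a usage sketch (paths are assumptions), the new single-file flow splits one `benchmark_results.json` into per-(TP, PP) folders and then feeds the resulting files through `compare_data_columns()` exactly as if several `-f` inputs had been passed:

```python
# Split one benchmark_results.json into per-(TP Size, PP Size) folders ...
files = split_json_by_tp_pp("results/benchmark_results.json", output_root="splits")
# e.g. ["splits/tp1_pp1/benchmark_results.json", "splits/tp2_pp1/benchmark_results.json"]

# ... then compare them as if multiple -f inputs had been given. TP/PP are
# dropped from the key columns, matching the single-file branch in __main__.
df, raw_cols = compare_data_columns(
    files,
    name_column="Test name",
    data_column="Output Tput (tok/s)",
    info_cols=[
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        "# of max concurrency.",
        "qps",
    ],
    drop_column="P99",
)
```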
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 724b53056ca8..a655a650cb32 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -1,17 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
import json
import os
+import shlex
from importlib import util
from pathlib import Path
+from typing import Any
import pandas as pd
import psutil
+import regex as re
from tabulate import tabulate
-results_folder = Path("results/")
-
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
@@ -42,13 +44,22 @@
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
+ "model_id": "Model",
+ "dataset_name": "Dataset Name",
+ "input_len": "Input Len",
+ "output_len": "Output Len",
+ "tp_size": "TP Size",
+ "pp_size": "PP Size",
+ "dtype": "dtype",
"gpu_type": "GPU",
"completed": "# of req.",
+ "qps": "qps",
+ "max_concurrency": "# of max concurrency.",
"request_throughput": "Tput (req/s)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
- "total_input_tokens": "Total input tokens",
- "total_output_tokens": "Total output tokens",
+ # "total_input_tokens": "Total input tokens",
+ # "total_output_tokens": "Total output tokens",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
@@ -93,15 +104,111 @@ def get_size_with_unit(bytes, suffix="B"):
bytes /= factor
+def _coerce(val: str) -> Any:
+ """Best-effort type coercion from string to Python types."""
+ low = val.lower()
+ if low == "null":
+ return None
+ if low == "true":
+ return True
+ if low == "false":
+ return False
+ # integers
+ if re.fullmatch(r"[+-]?\d+", val):
+ try:
+ return int(val)
+ except ValueError:
+ pass
+ # floats (keep 'inf'/'-inf'/'nan' as strings)
+ if re.fullmatch(r"[+-]?\d*\.\d+", val):
+ try:
+ return float(val)
+ except ValueError:
+ pass
+ return val
+
+
+def parse_client_command(cmd: str) -> dict[str, Any]:
+ """Parse the client_command shell string into {executable, script, args}."""
+ toks = shlex.split(cmd)
+ if len(toks) < 2:
+ raise ValueError("client_command must include an executable and a script")
+ executable, script = toks[0], toks[1]
+ args: dict[str, Any] = {}
+
+ i = 2
+ while i < len(toks):
+ t = toks[i]
+ if t.startswith("--"):
+ # --key=value or --key (value) or boolean flag
+ if "=" in t:
+ key, val = t.split("=", 1)
+ if key == "--metadata":
+ md = {}
+ if val:
+ if "=" in val:
+ k, v = val.split("=", 1)
+ md[k] = _coerce(v)
+ else:
+ md[val] = True
+ args[key] = md
+ else:
+ args[key] = _coerce(val)
+ i += 1
+ continue
+
+ key = t
+
+ # Special: consume metadata k=v pairs until next --flag
+ if key == "--metadata":
+ i += 1
+ md = {}
+ while i < len(toks) and not toks[i].startswith("--"):
+ pair = toks[i]
+ if "=" in pair:
+ k, v = pair.split("=", 1)
+ md[k] = _coerce(v)
+ else:
+ md[pair] = True
+ i += 1
+ args[key] = md
+ continue
+
+ # Standard: check if next token is a value (not a flag)
+ if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
+ args[key] = _coerce(toks[i + 1])
+ i += 2
+ else:
+ # lone flag -> True
+ args[key] = True
+ i += 1
+ else:
+ # unexpected positional; skip
+ i += 1
+
+ return {"executable": executable, "script": script, "args": args}
+
+
if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "-r",
+ "--result",
+ type=str,
+ default="results",
+ help="Folder name for benchmark output results.",
+ )
+ args = parser.parse_args()
+ results_folder = Path(args.result)
+ if not results_folder.exists():
+ raise FileNotFoundError(f"results folder does not exist: {results_folder}")
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
- # this result is generated via `benchmark_serving.py`
-
+ # this result is generated via `vllm bench serve` command
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
@@ -109,18 +216,50 @@ def get_size_with_unit(bytes, suffix="B"):
except OSError as e:
print(e)
continue
-
+ # Parse Server Command Arg
+ out: dict[str, Any] = {
+ "server_command": parse_client_command(command["server_command"])
+ }
+ parse_args = [
+ "--tensor-parallel-size",
+ "--pipeline-parallel-size",
+ "--dtype",
+ ]
+ col_mapping = ["tp_size", "pp_size", "dtype"]
+ for index, arg in enumerate(parse_args):
+ if arg in out["server_command"]["args"]:
+ raw_result.update(
+ {col_mapping[index]: out["server_command"]["args"][arg]}
+ )
+
+ # Parse Client Command Arg
+ out: dict[str, Any] = {
+ "client_command": parse_client_command(command["client_command"])
+ }
+ parse_args = [
+ "--dataset-name",
+ "--random-input-len",
+ "--random-output-len",
+ "--request-rate",
+ ]
+ col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
+
+ for index, arg in enumerate(parse_args):
+ if arg in out["client_command"]["args"]:
+ raw_result.update(
+ {col_mapping[index]: out["client_command"]["args"][arg]}
+ )
+ # Add Server, Client command
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
-
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
- # this result is generated via `benchmark_latency.py`
+ # this result is generated via `vllm bench latency` command
# attach the benchmarking command to raw_result
try:
@@ -148,7 +287,7 @@ def get_size_with_unit(bytes, suffix="B"):
continue
elif "throughput" in f.name:
- # this result is generated via `benchmark_throughput.py`
+ # this result is generated via `vllm bench throughput` command
# attach the benchmarking command to raw_result
try:
@@ -204,7 +343,10 @@ def get_size_with_unit(bytes, suffix="B"):
columns=latency_column_mapping
)
if not serving_results.empty:
- serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+ valid_columns = [
+ col for col in serving_column_mapping if col in serving_results.columns
+ ]
+ serving_results = serving_results[valid_columns].rename(
columns=serving_column_mapping
)
if not throughput_results.empty:
@@ -226,7 +368,7 @@ def get_size_with_unit(bytes, suffix="B"):
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
- lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+ lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
)
# get markdown tables
@@ -244,7 +386,9 @@ def get_size_with_unit(bytes, suffix="B"):
)
# document the result
- with open(results_folder / "benchmark_results.md", "w") as f:
+ md_file = "benchmark_results.md"
+ json_file = "benchmark_results.json"
+ with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/"
+ "performance-benchmarks-descriptions.md"
@@ -259,7 +403,7 @@ def get_size_with_unit(bytes, suffix="B"):
f.write(results)
# document benchmarking results in json
- with open(results_folder / "benchmark_results.json", "w") as f:
+ with open(results_folder / json_file, "w") as f:
results = (
latency_results.to_dict(orient="records")
+ throughput_results.to_dict(orient="records")
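To show what the new command parsing yields, here is a hypothetical client command (flags chosen for illustration) run through the `parse_client_command()` helper added above:

```python
# Hypothetical client command of the kind recorded in the *.commands files.
cmd = (
    "vllm bench serve --backend vllm --model meta-llama/Llama-3.1-8B-Instruct "
    "--dataset-name random --random-input-len 128 --random-output-len 128 "
    "--request-rate 1 --metadata tensor_parallel_size=2"
)
parsed = parse_client_command(cmd)
# parsed["executable"] == "vllm", parsed["script"] == "bench"
# parsed["args"]["--dataset-name"] == "random"
# parsed["args"]["--random-input-len"] == 128   (coerced to int by _coerce)
# parsed["args"]["--metadata"] == {"tensor_parallel_size": 2}
```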
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
index fb5063db8694..ebacdcbd6821 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -181,18 +181,14 @@ launch_vllm_server() {
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
- server_command="python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="vllm serve $model \
-tp $tp \
- --model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
- server_command="python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="vllm serve $model \
-tp $tp \
- --model $model \
--port $port \
$server_args"
fi
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index 4d01a314adc4..a00de940cbbb 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
-
+
return
fi
}
@@ -95,12 +95,14 @@ json2args() {
}
kill_gpu_processes() {
- pkill -f python
- pkill -f python3
- pkill -f tritonserver
- pkill -f pt_main_thread
- pkill -f text-generation
- pkill -f lmdeploy
+ pkill -f '[p]ython'
+ pkill -f '[p]ython3'
+ pkill -f '[t]ritonserver'
+ pkill -f '[p]t_main_thread'
+ pkill -f '[t]ext-generation'
+ pkill -f '[l]mdeploy'
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pkill -f '[V]LLM'
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
@@ -125,7 +127,7 @@ ensure_installed() {
}
run_serving_tests() {
- # run serving tests using `benchmark_serving.py`
+ # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@@ -225,7 +227,7 @@ run_serving_tests() {
if [[ "$dataset_name" = "sharegpt" ]]; then
- client_command="python3 benchmark_serving.py \
+ client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -246,7 +248,7 @@ run_serving_tests() {
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
- client_command="python3 benchmark_serving.py \
+ client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -265,13 +267,13 @@ run_serving_tests() {
$client_args"
else
-
+
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
-
+
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -302,7 +304,7 @@ run_serving_tests() {
}
run_genai_perf_tests() {
- # run genai-perf tests
+ # run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
@@ -311,14 +313,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
- test_name=$(echo "$params" | jq -r '.test_name')
-
+ test_name=$(echo "$params" | jq -r '.test_name')
+
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
-
+
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
@@ -369,10 +371,10 @@ run_genai_perf_tests() {
qps=$num_prompts
echo "now qps is $qps"
fi
-
+
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
-
+
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
@@ -380,7 +382,7 @@ run_genai_perf_tests() {
client_command="genai-perf profile \
-m $model \
--service-kind openai \
- --backend vllm \
+ --backend "$backend" \
--endpoint-type chat \
--streaming \
--url localhost:$port \
@@ -413,7 +415,7 @@ prepare_dataset() {
do
cat sonnet.txt >> sonnet_4x.txt
done
-
+
}
main() {
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index f05040618981..c64e5638029e 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -33,7 +33,7 @@ check_gpus() {
check_cpus() {
# check the number of CPUs and NUMA Node and GPU type.
- declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
+ declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo $numa_count
@@ -126,7 +126,8 @@ kill_gpu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9
-
+ # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
+ pgrep VLLM | xargs -r kill -9
# wait until GPU memory usage smaller than 1GB
if command -v nvidia-smi; then
@@ -164,7 +165,7 @@ upload_to_buildkite() {
}
run_latency_tests() {
- # run latency tests using `benchmark_latency.py`
+ # run latency tests using `vllm bench latency` command
# $1: a json file specifying latency test cases
local latency_test_file
@@ -193,9 +194,11 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -205,7 +208,7 @@ run_latency_tests() {
fi
fi
- latency_command=" $latency_envs python3 benchmark_latency.py \
+ latency_command=" $latency_envs vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
@@ -231,7 +234,7 @@ run_latency_tests() {
}
run_throughput_tests() {
- # run throughput tests using `benchmark_throughput.py`
+ # run throughput tests using `vllm bench throughput`
# $1: a json file specifying throughput test cases
local throughput_test_file
@@ -260,9 +263,11 @@ run_throughput_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -272,7 +277,7 @@ run_throughput_tests() {
fi
fi
- throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+ throughput_command=" $throughput_envs vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
@@ -297,7 +302,7 @@ run_throughput_tests() {
}
run_serving_tests() {
- # run serving tests using `benchmark_serving.py`
+ # run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
@@ -328,12 +333,21 @@ run_serving_tests() {
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
+ max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
+ if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
+ num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+ max_concurrency_list="[$num_prompts]"
+ fi
+ max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
+ echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
- if [ "$ON_CPU" == "1" ];then
- if [[ $numa_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+ if [ "$ON_CPU" == "1" ]; then
+ pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+ world_size=$(($tp*$pp))
+ if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+ echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@@ -351,8 +365,7 @@ run_serving_tests() {
continue
fi
- server_command="$server_envs python3 \
- -m vllm.entrypoints.openai.api_server \
+ server_command="$server_envs vllm serve \
$server_args"
# run the server
@@ -389,35 +402,39 @@ run_serving_tests() {
echo "now qps is $qps"
fi
- new_test_name=$test_name"_qps_"$qps
-
- # pass the tensor parallel size to the client so that it can be displayed
- # on the benchmark dashboard
- client_command="python3 benchmark_serving.py \
- --save-result \
- --result-dir $RESULTS_FOLDER \
- --result-filename ${new_test_name}.json \
- --request-rate $qps \
- --metadata "tensor_parallel_size=$tp" \
- $client_args $client_remote_args "
-
- echo "Running test case $test_name with qps $qps"
- echo "Client command: $client_command"
-
- bash -c "$client_command"
-
- # record the benchmarking commands
- jq_output=$(jq -n \
- --arg server "$server_command" \
- --arg client "$client_command" \
- --arg gpu "$gpu_type" \
- '{
- server_command: $server,
- client_command: $client,
- gpu_type: $gpu
- }')
- echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
-
+ # iterate over different max_concurrency
+ for max_concurrency in $max_concurrency_list; do
+ new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
+ echo "New test name: $new_test_name"
+ # pass the tensor parallel size to the client so that it can be displayed
+ # on the benchmark dashboard
+ client_command="vllm bench serve \
+ --save-result \
+ --result-dir $RESULTS_FOLDER \
+ --result-filename ${new_test_name}.json \
+ --request-rate $qps \
+ --max-concurrency $max_concurrency \
+ --metadata "tensor_parallel_size=$tp" \
+ $client_args $client_remote_args "
+
+ echo "Running test case $test_name with qps $qps"
+ echo "Client command: $client_command"
+
+ bash -c "$client_command"
+
+ # record the benchmarking commands
+ jq_output=$(jq -n \
+ --arg server "$server_command" \
+ --arg client "$client_command" \
+ --arg gpu "$gpu_type" \
+ '{
+ server_command: $server,
+ client_command: $client,
+ gpu_type: $gpu
+ }')
+ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+ done
done
# clean up
@@ -437,17 +454,12 @@ main() {
fi
check_hf_token
- # Set to v1 to run v1 benchmark
- if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
- export VLLM_USE_V1=1
- fi
-
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
- # get the current IP address, required by benchmark_serving.py
+ # get the current IP address, required by the `vllm bench serve` command
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn off the reporting of the status of each request, to clean up the terminal output
export VLLM_LOGGING_LEVEL="WARNING"
diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
index edbe9f2df0ce..afb844880f9f 100644
--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -11,9 +11,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
- "num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
index da93fdd1dbac..569117aae852 100644
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
@@ -20,7 +20,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index fda1a7a3ec53..423a3bfe1267 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -35,9 +35,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
- "num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -90,9 +88,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
- "num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -145,9 +141,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
- "num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -197,9 +191,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
- "num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -251,9 +243,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
- "num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@@ -305,9 +295,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
- "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
- "num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
new file mode 100644
index 000000000000..f758097e098e
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@@ -0,0 +1,610 @@
+[
+ {
+ "test_name": "serving_llama8B_bf16_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp4_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 4,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ }
+]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
new file mode 100644
index 000000000000..ce396d6e54f2
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@@ -0,0 +1,820 @@
+[
+ {
+ "test_name": "serving_llama8B_bf16_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 200
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp1_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 1,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ },
+ {
+ "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
+ "qps_list": ["inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+ "server_environment_variables": {
+ "VLLM_RPC_TIMEOUT": 100000,
+ "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+ "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
+ "VLLM_CPU_KVCACHE_SPACE": 40
+ },
+ "server_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "quantization": "awq",
+ "tensor_parallel_size": 2,
+ "pipeline_parallel_size": 3,
+ "dtype": "bfloat16",
+ "distributed_executor_backend": "mp",
+ "block_size": 128,
+ "trust_remote_code": "",
+ "enable_chunked_prefill": "",
+ "disable_log_stats": "",
+ "enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
+ "load_format": "dummy"
+ },
+ "client_parameters": {
+ "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+ "backend": "vllm",
+ "dataset_name": "random",
+ "random-input-len": 128,
+ "random-output-len": 128,
+ "ignore-eos": "",
+ "num_prompts": 1000
+ }
+ }
+]
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
index 22f71c993ff3..e21c8df0a9fe 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@@ -2,104 +2,112 @@
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
- "disable_log_requests": "",
"enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
- "disable_log_requests": "",
"enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
- "disable_log_requests": "",
"enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
- "max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -107,32 +115,34 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
- "disable_log_requests": "",
"enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 100,
"num_prompts": 100
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+ "VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@@ -140,18 +150,18 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
- "disable_log_requests": "",
"enforce_eager": "",
+ "max_num_batched_tokens": 2048,
+ "max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
- "max_concurrency": 100,
"num_prompts": 100
}
}
diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json
index 13fd5aa8db97..a6d4141d5c2d 100644
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@@ -7,7 +7,6 @@
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
- "disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -26,7 +25,6 @@
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
- "disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -45,7 +43,6 @@
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
- "disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -60,8 +57,7 @@
"test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2],
"server_parameters": {
- "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
- "disable_log_requests": "",
+ "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {
diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
index f159c30637d3..48c015aa8403 100644
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -21,7 +21,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
- "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+ "model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml
deleted file mode 100644
index d5cad1c73c6f..000000000000
--- a/.buildkite/pyproject.toml
+++ /dev/null
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
- # pycodestyle
- "E",
- # Pyflakes
- "F",
- # pyupgrade
- "UP",
- # flake8-bugbear
- "B",
- # flake8-simplify
- "SIM",
- # isort
- "I",
- # flake8-logging-format
- "G",
-]
-ignore = [
- # star imports
- "F405", "F403",
- # lambda expression assignment
- "E731",
- # Loop control variable not used within loop body
- "B007",
- # f-string format
- "UP032",
- # Can remove once 3.10+ is the minimum Python version
- "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 6314afd65234..5bc59c151565 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,5 +1,36 @@
steps:
+ # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build arm64 wheel - CUDA 12.9"
+ depends_on: ~
+ id: build-wheel-arm64-cuda-12-9
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ # NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+ # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh"
+ env:
+ DOCKER_BUILDKIT: "1"
+
+ # aarch64 CPU build.
+ - label: "Build arm64 CPU wheel"
+ depends_on: ~
+ id: build-wheel-arm64-cpu
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+ - "mkdir artifacts"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ - "bash .buildkite/scripts/upload-wheels.sh"
+ env:
+ DOCKER_BUILDKIT: "1"
+
- label: "Build wheel - CUDA 12.8"
+ depends_on: ~
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
@@ -12,6 +43,7 @@ steps:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
+ depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
@@ -23,44 +55,61 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
- # However, this block can be uncommented to save some compute hours.
- # - block: "Build CUDA 11.8 wheel"
- # key: block-build-cu118-wheel
-
- - label: "Build wheel - CUDA 11.8"
- # depends_on: block-build-cu118-wheel
- id: build-wheel-cuda-11-8
+ # x86 + CUDA builds
+ - label: "Build wheel - CUDA 12.9"
+ depends_on: ~
+ id: build-wheel-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- - block: "Build release image"
+ - label: "Build release image (x86)"
depends_on: ~
- key: block-release-image-build
-
- - label: "Build release image"
- depends_on: block-release-image-build
- id: build-release-image
+ id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+ # re-tag to default image tag and push, just in case arm64 build fails
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+ # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+ - label: "Build release image (arm64)"
+ depends_on: ~
+ id: build-release-image-arm64
+ agents:
+ queue: arm64_cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+ # Add job to create multi-arch manifest
+ - label: "Create multi-arch manifest"
+ depends_on:
+ - build-release-image-x86
+ - build-release-image-arm64
+ id: create-multi-arch-manifest
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+ - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+
- label: "Annotate release workflow"
depends_on:
- - build-release-image
+ - create-multi-arch-manifest
- build-wheel-cuda-12-8
- - build-wheel-cuda-12-6
- - build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
@@ -107,18 +156,46 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- - block: "Build Neuron release image"
- key: block-neuron-release-image-build
+ - block: "Build arm64 CPU release image"
+ key: block-arm64-cpu-release-image-build
depends_on: ~
- - label: "Build and publish Neuron release image"
- depends_on: block-neuron-release-image-build
+ - label: "Build and publish arm64 CPU release image"
+ depends_on: block-arm64-cpu-release-image-build
agents:
- queue: neuron-postmerge
+ queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
- - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+ env:
+ DOCKER_BUILDKIT: "1"
+
+ - label: "Build and publish nightly multi-arch image to DockerHub"
+ depends_on:
+ - create-multi-arch-manifest
+ if: build.env("NIGHTLY") == "1"
+ agents:
+ queue: cpu_queue_postmerge
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+ - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+ - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+ - "docker push vllm/vllm-openai:nightly-x86_64"
+ - "docker push vllm/vllm-openai:nightly-aarch64"
+ - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+ - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+ - "docker manifest push vllm/vllm-openai:nightly"
+ - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+ # Clean up old nightly builds (keep only last 14)
+ - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+ plugins:
+ - docker-login#v3.0.0:
+ username: vllmbot
+ password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
+ DOCKERHUB_USERNAME: "vllmbot"
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 94e0ac2398f3..fde48603ad3c 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
\ No newline at end of file
diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh
new file mode 100755
index 000000000000..f02a128c6772
--- /dev/null
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+set -ex
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
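+# It relies only on curl (for the DockerHub REST calls) and jq (for parsing the JSON responses).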
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# Get DockerHub credentials from environment
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+ echo "Error: DOCKERHUB_TOKEN environment variable is not set"
+ exit 1
+fi
+
+if [ -z "$DOCKERHUB_USERNAME" ]; then
+ echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+ exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+ -H "Content-Type: application/json" \
+ -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+ "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+ echo "Error: Failed to get DockerHub bearer token"
+ exit 1
+fi
+
+# Function to get all tags from DockerHub
+get_all_tags() {
+ local page=1
+ local all_tags=""
+
+ while true; do
+ set +x
+ local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
+ "$REPO_API_URL?page=$page&page_size=100")
+ set -x
+
+ # Get both last_updated timestamp and tag name, separated by |
+ local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+
+ if [ -z "$tags" ]; then
+ break
+ fi
+
+ all_tags="$all_tags$tags"$'\n'
+ page=$((page + 1))
+ done
+
+ # Sort by timestamp (newest first) and extract just the tag names
+ echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+delete_tag() {
+ local tag_name="$1"
+ echo "Deleting tag: $tag_name"
+
+ local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
+ set +x
+ local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+ set -x
+
+ if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
+ echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
+ else
+ echo "Successfully deleted tag: $tag_name"
+ fi
+}
+
+# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
+echo "Fetching all tags from DockerHub..."
+all_tags=$(get_all_tags)
+
+if [ -z "$all_tags" ]; then
+ echo "No tags found to clean up"
+ exit 0
+fi
+
+# Count total tags
+total_tags=$(echo "$all_tags" | wc -l)
+echo "Found $total_tags tags"
+
+# Keep only the last 14 builds (including the current one)
+tags_to_keep=14
+tags_to_delete=$((total_tags - tags_to_keep))
+
+if [ $tags_to_delete -le 0 ]; then
+ echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
+ exit 0
+fi
+
+echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
+
+# Get tags to delete (skip the first $tags_to_keep tags)
+tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
+
+if [ -z "$tags_to_delete_list" ]; then
+ echo "No tags to delete"
+ exit 0
+fi
+
+# Delete old tags
+echo "Deleting old tags..."
+while IFS= read -r tag; do
+ if [ -n "$tag" ]; then
+ delete_tag "$tag"
+ # Add a small delay to avoid rate limiting
+ sleep 1
+ fi
+done <<< "$tags_to_delete_list"
+
+echo "Cleanup completed successfully"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 5e5a532cb57d..aa4cc7b35a54 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
- commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
@@ -121,7 +117,6 @@ fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
- --ignore=kernels/quantization/test_aqlm.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \
@@ -165,16 +160,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_chat.py \
--ignore=entrypoints/llm/test_accuracy.py \
--ignore=entrypoints/llm/test_init.py \
- --ignore=entrypoints/llm/test_generate_multiple_loras.py \
--ignore=entrypoints/llm/test_prompt_validation.py "}
fi
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 36bcb015d308..39ea18017308 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,25 +25,28 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
- set -e
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ set -xve
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
- set -e
+ set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
- pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+
+      # Note: disable Bart until it supports V1
+ # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
- pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: The test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for the time being.
+ # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 90cc9c844622..7927aef19e4e 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
-remove_docker_container() {
- set -e;
- docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+remove_docker_container() {
+ set -e;
+ docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
@@ -46,57 +46,74 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+ # Run kernel tests
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -x -v -s tests/kernels/test_onednn.py"
+
# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
- # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
- # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-
- # Note: disable Bart until supports V1
- pytest -v -s tests/models/language/generation -m cpu_model \
- --ignore=tests/models/language/generation/test_bart.py
- VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
- --ignore=tests/models/language/generation/test_bart.py
-
- pytest -v -s tests/models/language/pooling -m cpu_model
- pytest -v -s tests/models/multimodal/generation \
- --ignore=tests/models/multimodal/generation/test_mllama.py \
+ # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+ # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+ pytest -x -v -s tests/models/language/generation -m cpu_model
+ VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
+
+ pytest -x -v -s tests/models/language/pooling -m cpu_model
+ pytest -x -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
- pytest -s -v \
- tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+ pytest -x -s -v \
+ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
- # VLLM_USE_V1=0 pytest -s -v \
+ # VLLM_USE_V1=0 pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"
- # online serving
+ # Run multi-lora tests
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -x -s -v \
+ tests/lora/test_qwen2vl.py"
+
+ # online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+ server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
- python3 benchmarks/benchmark_serving.py \
+ vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
- --endpoint /v1/completions'
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
- # Run multi-lora tests
- docker exec cpu-test-"$NUMA_NODE" bash -c "
+ # online serving: tp+dp
+ docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
- pytest -s -v \
- tests/lora/test_qwen2vl.py"
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+ server_pid=$!
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+ vllm bench serve \
+ --backend vllm \
+ --dataset-name random \
+ --model meta-llama/Llama-3.2-3B-Instruct \
+ --num-prompts 20 \
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
}
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
diff --git a/.buildkite/scripts/hardware_ci/run-gh200-test.sh b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
index 8c64e14606d3..f69e4b06680f 100644
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
- --build-arg torch_cuda_arch_list="9.0+PTX" \
- --build-arg vllm_fa_cmake_gpu_arches="90-real"
+ --build-arg torch_cuda_arch_list="9.0+PTX"
# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
deleted file mode 100644
index a397457c8326..000000000000
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-# This script build the Neuron docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -e
-set -v
-
-image_name="neuron/vllm-ci"
-container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
-
-NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
-mkdir -p "${NEURON_COMPILE_CACHE_URL}"
-NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
-
-# Try building the docker image
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
-
-# prune old image and containers to save disk space, and only once a day
-# by using a timestamp file in tmp.
-if [ -f /tmp/neuron-docker-build-timestamp ]; then
- last_build=$(cat /tmp/neuron-docker-build-timestamp)
- current_time=$(date +%s)
- if [ $((current_time - last_build)) -gt 86400 ]; then
- # Remove dangling images (those that are not tagged and not used by any container)
- docker image prune -f
- # Remove unused volumes / force the system prune for old images as well.
- docker volume prune -f && docker system prune -f
- echo "$current_time" > /tmp/neuron-docker-build-timestamp
- fi
-else
- date "+%s" > /tmp/neuron-docker-build-timestamp
-fi
-
-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
-
-# Setup cleanup
-remove_docker_container() {
- docker image rm -f "${image_name}" || true;
-}
-trap remove_docker_container EXIT
-
-# Run the image
-docker run --rm -it --device=/dev/neuron0 --network bridge \
- -v "${HF_CACHE}:${HF_MOUNT}" \
- -e "HF_HOME=${HF_MOUNT}" \
- -e "HF_TOKEN=${HF_TOKEN}" \
- -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
- -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
- --name "${container_name}" \
- ${image_name} \
- /bin/bash -c "
- set -e; # Exit on first error
- python3 /workspace/vllm/examples/offline_inference/neuron.py;
- python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
- for f in /workspace/vllm/tests/neuron/2_core/*.py; do
- echo \"Running test file: \$f\";
- python3 -m pytest \$f -v --capture=tee-sys;
- done
- "
\ No newline at end of file
diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh
new file mode 100644
index 000000000000..29c8f5ed5a91
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+# This script builds the Ascend NPU docker image and runs offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Base Ubuntu image with basic Ascend development libraries and Python installed
+VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
+CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
+TEST_RUN_CONFIG_FILE="vllm_test.cfg"
+VLLM_ASCEND_TMP_DIR=
+# Get the test run configuration file from the vllm-ascend repository
+fetch_vllm_test_cfg() {
+ VLLM_ASCEND_TMP_DIR=$(mktemp -d)
+ # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
+ cleanup() {
+ rm -rf "${VLLM_ASCEND_TMP_DIR}"
+ }
+ trap cleanup EXIT
+
+ GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
+ if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
+        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the vllm-ascend repository" >&2
+ exit 1
+ fi
+
+ # If the file already exists locally, just overwrite it
+ cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
+ echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"
+
+    # The EXIT trap set above will be replaced later in this script, so the temporary
+    # directory is removed manually here and the trap is cleared.
+ rm -rf "${VLLM_ASCEND_TMP_DIR}"
+ trap - EXIT
+}
+
+# Loads the test run configuration file (fetched above) into the current script environment.
+get_config() {
+ if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
+        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist locally" >&2
+ exit 1
+ fi
+ source "${TEST_RUN_CONFIG_FILE}"
+    echo "Base docker image name read from configuration: ${BASE_IMAGE_NAME}"
+ return 0
+}
+
+# Get the test run configuration.
+fetch_vllm_test_cfg
+get_config
+# Check if the function call was successful. If not, exit the script.
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
+container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
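+# e.g. BUILDKITE_AGENT_NAME=atlas-a2-001-1-2cards yields agent_idx=1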
+agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
+echo "agent_idx: ${agent_idx}"
+builder_name="cachebuilder${agent_idx}"
+builder_cache_dir="/mnt/docker-cache${agent_idx}"
+mkdir -p ${builder_cache_dir}
+
+# Try building the docker image
+cat <<EOF | DOCKER_BUILDKIT=1 docker build -t "${image_name}" -f - .
+FROM ${BASE_IMAGE_NAME}
+
+# Install pytest and modelscope so this docker build cache layer stays valid
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install "pytest>=6.0" modelscope
+
+WORKDIR /workspace/vllm
+
+# Install vLLM dependencies in advance: as long as common.txt remains unchanged, this docker cache layer stays valid.
+COPY requirements/common.txt /workspace/vllm/requirements/common.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -r requirements/common.txt
+
+COPY . .
+
+# Install vLLM
+RUN --mount=type=cache,target=/root/.cache/pip \
+ VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+ python3 -m pip uninstall -y triton
+
+# Install vllm-ascend
+WORKDIR /workspace
+ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
+ARG VLLM_ASCEND_TAG=main
+RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
+ git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend
+
+# Install vllm-ascend dependencies in advance: as long as requirements.txt remains unchanged, this docker cache layer stays valid.
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -r /workspace/vllm-ascend/requirements.txt
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+ source /usr/local/Ascend/nnal/atb/set_env.sh && \
+ export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
+ python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ENV VLLM_USE_MODELSCOPE=True
+
+WORKDIR /workspace/vllm-ascend
+
+CMD ["/bin/bash"]
+
+EOF
+
+# Setup cleanup
+remove_docker_container() {
+ docker rm -f "${container_name}" || true;
+ docker image rm -f "${image_name}" || true;
+ docker system prune -f || true;
+}
+trap remove_docker_container EXIT
+
+# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
+# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
+# e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards.
+# returns --device /dev/davinci0 --device /dev/davinci1
+parse_and_gen_devices() {
+ local input="$1"
+ local index cards_num
+ if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
+ index="${BASH_REMATCH[1]}"
+ cards_num="${BASH_REMATCH[2]}"
+ else
+ echo "parse error" >&2
+ return 1
+ fi
+
+ local devices=""
+ local i=0
+ while (( i < cards_num )); do
+ local dev_idx=$(((index - 1)*cards_num + i ))
+ devices="$devices --device /dev/davinci${dev_idx}"
+ ((i++))
+ done
+
+ # trim leading space
+ devices="${devices#"${devices%%[![:space:]]*}"}"
+    # Print the devices string so the caller can capture it
+ printf '%s' "$devices"
+}
+
+devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1
+
+# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
+# This test checks whether the OOT platform interface is functioning properly in conjunction with
+# the hardware plugin vllm-ascend.
+model_cache_dir=/mnt/modelscope${agent_idx}
+mkdir -p ${model_cache_dir}
+docker run \
+ ${devices} \
+ --device /dev/davinci_manager \
+ --device /dev/devmm_svm \
+ --device /dev/hisi_hdc \
+ -v /usr/local/dcmi:/usr/local/dcmi \
+ -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+ -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+ -v /etc/ascend_install.info:/etc/ascend_install.info \
+ -v ${model_cache_dir}:/root/.cache/modelscope \
+ --entrypoint="" \
+ --name "${container_name}" \
+ "${image_name}" \
+ bash -c '
+ set -e
+ pytest -v -s tests/e2e/vllm_interface/
+'
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
new file mode 100755
index 000000000000..cbb2527a4ff0
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+
+set -xu
+
+
+remove_docker_container() {
+ docker rm -f tpu-test || true;
+}
+
+trap remove_docker_container EXIT
+
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+cleanup_docker() {
+ # Get Docker's root directory
+ docker_root=$(docker info -f '{{.DockerRootDir}}')
+ if [ -z "$docker_root" ]; then
+ echo "Failed to determine Docker root directory."
+ exit 1
+ fi
+ echo "Docker root directory: $docker_root"
+ # Check disk usage of the filesystem where Docker's root directory is located
+ disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+ # Define the threshold
+ threshold=70
+ if [ "$disk_usage" -gt "$threshold" ]; then
+ echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+ # Remove dangling images (those that are not tagged and not used by any container)
+ docker image prune -f
+ # Remove unused volumes / force the system prune for old images as well.
+ docker volume prune -f && docker system prune --force --filter "until=72h" --all
+ echo "Docker images and volumes cleanup completed."
+ else
+ echo "Disk usage is below $threshold%. No cleanup needed."
+ fi
+}
+cleanup_docker
+
+# For HF_TOKEN.
+source /etc/environment
+
+docker run --privileged --net host --shm-size=16G -it \
+ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+ vllm-tpu /bin/bash -c '
+set -e # Exit immediately if a command exits with a non-zero status.
+set -u # Treat unset variables as an error.
+
+echo "--- Starting script inside Docker container ---"
+
+# Create results directory
+RESULTS_DIR=$(mktemp -d)
+# If mktemp fails, set -e will cause the script to exit.
+echo "Results will be stored in: $RESULTS_DIR"
+
+# Install dependencies
+echo "--- Installing Python dependencies ---"
+python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
+ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
+echo "--- Python dependencies installed ---"
+
+export VLLM_XLA_CHECK_RECOMPILATION=1
+export VLLM_XLA_CACHE_PATH=
+
+echo "--- Hardware Information ---"
+# tpu-info
+echo "--- Starting Tests ---"
+set +e
+overall_script_exit_code=0
+
+# --- Test Definitions ---
+# If a test fails, this function will print logs and will not cause the main script to exit.
+run_test() {
+ local test_num=$1
+ local test_name=$2
+ local test_command=$3
+ local log_file="$RESULTS_DIR/test_${test_num}.log"
+ local actual_exit_code
+
+ echo "--- TEST_$test_num: Running $test_name ---"
+
+ # Execute the test command.
+ eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
+ actual_exit_code=$?
+
+ echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
+ echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
+
+ if [ "$actual_exit_code" -ne 0 ]; then
+ echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
+ echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
+ if [ -f "$log_file" ]; then
+ cat "$log_file" >&2
+ else
+ echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
+ fi
+ echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
+ return "$actual_exit_code" # Return the failure code
+ else
+ echo "TEST_$test_num ($test_name) PASSED."
+ return 0 # Return success
+ fi
+}
+
+# Helper function to call run_test and update the overall script exit code
+run_and_track_test() {
+ local test_num_arg="$1"
+ local test_name_arg="$2"
+ local test_command_arg="$3"
+
+ # Run the test
+ run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
+ local test_specific_exit_code=$?
+
+ # If the test failed, set the overall script exit code to 1
+ if [ "$test_specific_exit_code" -ne 0 ]; then
+ # No need for extra echo here, run_test already logged the failure.
+ overall_script_exit_code=1
+ fi
+}
+
+# --- Actual Test Execution ---
+run_and_track_test 1 "test_struct_output_generate.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+run_and_track_test 2 "test_moe_pallas.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
+run_and_track_test 3 "test_lora.py" \
+ "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 4 "test_tpu_qkv_linear.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 5 "test_spmd_model_weight_loading.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 6 "test_kv_cache_update_kernel.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
+run_and_track_test 7 "test_tpu_int8.py" \
+ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
+
+# After all tests have been attempted, exit with the overall status.
+if [ "$overall_script_exit_code" -ne 0 ]; then
+ echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
+else
+ echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
+fi
+exit "$overall_script_exit_code"
+' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
+
+# Capture the exit code of the docker run command
+DOCKER_RUN_EXIT_CODE=$?
+
+# The trap will run for cleanup.
+# Exit the main script with the Docker run command's exit code.
+if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
+ echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
+ exit "$DOCKER_RUN_EXIT_CODE"
+else
+ echo "Docker run command completed successfully."
+ exit 0
+fi
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
index 60f0d174bd6c..f022fa3672ee 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -5,7 +5,6 @@ set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
- docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
@@ -62,12 +61,12 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
- && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
+ && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+ && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
export VLLM_XLA_CHECK_RECOMPILATION=1
export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"
echo "--- Hardware Information ---"
# tpu-info
@@ -149,18 +148,6 @@ run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
-run_and_track_test 11 "test_struct_output_generate.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
-run_and_track_test 12 "test_moe_pallas.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
-run_and_track_test 13 "test_lora.py" \
- "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
-run_and_track_test 14 "test_tpu_qkv_linear.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
-run_and_track_test 15 "test_spmd_model_weight_loading.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
-run_and_track_test 16 "test_kv_cache_update_kernel.py" \
- "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 7589b48b584d..250a64fdd071 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -23,12 +23,26 @@ docker run \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
+ -e "HF_TOKEN=${HF_TOKEN}" \
+ -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
--name "${container_name}" \
"${image_name}" \
- sh -c '
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
- VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ bash -c '
+ set -e
+ echo $ZE_AFFINITY_MASK
+ pip install tblib==3.1.0
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+ VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
+ pytest -v -s v1/engine
+ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
+ pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
+ pytest -v -s v1/structured_output
+ pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+ pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/scripts/run-benchmarks.sh b/.buildkite/scripts/run-benchmarks.sh
index 195a8063fd74..51536b36b808 100644
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@@ -11,20 +11,20 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh
new file mode 100755
index 000000000000..5b25c358fc4a
--- /dev/null
+++ b/.buildkite/scripts/run-prime-rl-test.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Setup script for Prime-RL integration tests
+# This script prepares the environment for running Prime-RL tests with nightly vLLM
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
+PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
+
+echo "Setting up Prime-RL integration test environment..."
+
+# Clean up any existing Prime-RL directory
+if [ -d "${PRIME_RL_DIR}" ]; then
+ echo "Removing existing Prime-RL directory..."
+ rm -rf "${PRIME_RL_DIR}"
+fi
+
+# Install UV if not available
+if ! command -v uv &> /dev/null; then
+ echo "Installing UV package manager..."
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ source $HOME/.local/bin/env
+fi
+
+# Clone Prime-RL repository at specific branch for reproducible tests
+PRIME_RL_BRANCH="integ-vllm-main"
+echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
+git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
+cd "${PRIME_RL_DIR}"
+
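+# UV_PROJECT_ENVIRONMENT points `uv sync` at /usr/local instead of a project virtualenv,
+# so the vLLM installation already present in the CI image is reused by Prime-RL.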
+echo "Setting up UV project environment..."
+export UV_PROJECT_ENVIRONMENT=/usr/local
+ln -s /usr/bin/python3 /usr/local/bin/python
+
+# Remove vllm pin from pyproject.toml
+echo "Removing vllm pin from pyproject.toml..."
+sed -i '/vllm==/d' pyproject.toml
+
+# Sync Prime-RL dependencies
+echo "Installing Prime-RL dependencies..."
+uv sync --inexact && uv sync --inexact --all-extras
+
+# Verify installation
+echo "Verifying installations..."
+uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
+
+echo "Prime-RL integration test environment setup complete!"
+
+echo "Running Prime-RL integration tests..."
+export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
+uv run pytest -vs tests/integration/test_rl.py -m gpu
+
+echo "Prime-RL integration tests completed!"
diff --git a/.buildkite/scripts/tpu/cleanup_docker.sh b/.buildkite/scripts/tpu/cleanup_docker.sh
index 209d9c4341cd..740d81fb39bb 100755
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
- docker volume prune -f && docker system prune --force --filter "until=72h" --all
+ docker volume prune -f && docker system prune --force --filter "until=24h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
diff --git a/.buildkite/scripts/tpu/config_v6e_1.env b/.buildkite/scripts/tpu/config_v6e_1.env
index 03ec116f698d..c9e3c26571e7 100644
--- a/.buildkite/scripts/tpu/config_v6e_1.env
+++ b/.buildkite/scripts/tpu/config_v6e_1.env
@@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8b
-CONTAINER_NAME=vllm-tpu
+CONTAINER_NAME=tpu-test
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
diff --git a/.buildkite/scripts/tpu/docker_run_bm.sh b/.buildkite/scripts/tpu/docker_run_bm.sh
index 8959877a3c05..08e36611809d 100755
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@@ -12,8 +12,6 @@ source /etc/environment
source $ENV_FILE
remove_docker_container() {
- docker rm -f tpu-test || true;
- docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}
diff --git a/.buildkite/scripts/tpu/quantized_v6e_1.env b/.buildkite/scripts/tpu/quantized_v6e_1.env
index bab34b3be3b9..ecb98d4516bd 100644
--- a/.buildkite/scripts/tpu/quantized_v6e_1.env
+++ b/.buildkite/scripts/tpu/quantized_v6e_1.env
@@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8bw8a8
-CONTAINER_NAME=vllm-tpu
+CONTAINER_NAME=tpu-test
# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
INPUT_LEN=1800
OUTPUT_LEN=128
diff --git a/.buildkite/scripts/tpu/run_bm.sh b/.buildkite/scripts/tpu/run_bm.sh
index 877669cd956a..3364fce8e1fd 100755
--- a/.buildkite/scripts/tpu/run_bm.sh
+++ b/.buildkite/scripts/tpu/run_bm.sh
@@ -42,9 +42,8 @@ echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
--seed 42 \
- --disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
@@ -77,7 +76,7 @@ done
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 037897e53dbe..43aa8c47be29 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -14,8 +14,19 @@ fi
# Get the single wheel file
wheel="${wheel_files[0]}"
-# Rename 'linux' to 'manylinux1' in the wheel filename
-new_wheel="${wheel/linux/manylinux1}"
+# Detect architecture and rename 'linux' to appropriate manylinux version
+arch=$(uname -m)
+if [[ $arch == "x86_64" ]]; then
+ manylinux_version="manylinux1"
+elif [[ $arch == "aarch64" ]]; then
+ manylinux_version="manylinux2014"
+else
+ echo "Warning: Unknown architecture $arch, using manylinux1 as default"
+ manylinux_version="manylinux1"
+fi
+
+# Rename 'linux' to the appropriate manylinux version in the wheel filename
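+# e.g. vllm-<version>-cp38-abi3-linux_aarch64.whl -> vllm-<version>-cp38-abi3-manylinux2014_aarch64.whl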
+new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
@@ -47,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -63,14 +75,15 @@ fi
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu118"* ]]; then
- # if $normal_wheel matches cu118, do not upload the index.html
- echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+ # if $normal_wheel matches cu128, do not upload the index.html
+ echo "Skipping index files for cu128 wheels"
else
- # only upload index.html for cu128 wheels (default wheels)
+ # only upload index.html for cu129 wheels (default wheels) as it
+ # is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
new file mode 100644
index 000000000000..50b2b61124af
--- /dev/null
+++ b/.buildkite/test-amd.yaml
@@ -0,0 +1,1267 @@
+# In this file, you can add more tests to run either by adding a new step or
+# adding a new command to an existing step. See different options here for examples.
+
+# This script will be fed into the Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
+# to generate the final pipeline yaml file.
+
+# Documentation
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
+# command(str): the single command to run for tests. incompatible with commands.
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host;
+# in this case, commands must be specified. the first command runs on the first host, the second
+# command runs on the second host.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
+
+# When adding a test
+# - If the test belongs to an existing group, add it there
+# - If the test is short, add to any existing step
+# - If the test takes more than 10min, then it is okay to create a new step.
+# Note that all steps execute in parallel.
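+
+# For reference, a minimal step using the fields documented above could look like
+# the commented sketch below (illustrative only; the label, test path, and
+# agent_pool value are hypothetical, not an actual step in this pipeline):
+#
+# - label: Example Smoke Test # 5min
+#   timeout_in_minutes: 10
+#   mirror_hardwares: [amdexperimental]
+#   agent_pool: mi325_1
+#   num_gpus: 1
+#   working_dir: "/vllm-workspace/tests"
+#   source_file_dependencies:
+#   - vllm/
+#   - tests/example_smoke
+#   commands:
+#   - pytest -v -s example_smoke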
+
+steps:
+##### fast check tests #####
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+ # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to the whitelist
+ # in /vllm/tools/generate_nightly_torch_test.py
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ soft_fail: true
+ source_file_dependencies:
+ - requirements/nightly_torch_test.txt
+ commands:
+ - bash standalone_tests/pytorch_nightly_dependency.sh
+
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/multimodal
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+ timeout_in_minutes: 10
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
+ - tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
+ no_gpu: true
+ commands:
+ - python3 standalone_tests/lazy_imports.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s test_outputs.py
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s transformers_utils
+
+- label: Python-only Installation Test # 10min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - tests/standalone_tests/python_only_compile.sh
+ - setup.py
+ commands:
+ - bash standalone_tests/python_only_compile.sh
+
+- label: Basic Correctness Test # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/basic_correctness/test_basic_correctness
+ - tests/basic_correctness/test_cpu_offload
+ - tests/basic_correctness/test_cumem.py
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s basic_correctness/test_cumem.py
+ - pytest -v -s basic_correctness/test_basic_correctness.py
+ - pytest -v -s basic_correctness/test_cpu_offload.py
+
+- label: Entrypoints Unit Tests # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ source_file_dependencies:
+ - vllm/entrypoints
+ - tests/entrypoints/
+ commands:
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
+
+- label: Entrypoints Integration Test (LLM) # 30min
+ timeout_in_minutes: 40
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/offline_mode
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Integration Test (API Server) # 100min
+ timeout_in_minutes: 130
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+ - pytest -v -s entrypoints/test_chat_utils.py
+
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
+ - tests/distributed/test_events
+ - tests/compile/test_basic_correctness
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/distributed
+ - tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
+ commands:
+ # test with torchrun tp=2 and external_dp=2
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and pp=2
+ - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+ - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+ - pytest -v -s distributed/test_utils.py
+ - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
+ - pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - pushd ../examples/offline_inference
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+ - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+ - popd
+
+- label: EPLB Algorithm Test # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_algo.py
+ commands:
+ - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution Test # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 15
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/eplb
+ - tests/distributed/test_eplb_execute.py
+ commands:
+ - pytest -v -s distributed/test_eplb_execute.py
+
+- label: Metrics, Tracing Test # 12min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_2
+ # grade: Blocking
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/tracing
+ commands:
+ - "pip install \
+ 'opentelemetry-sdk>=1.26.0' \
+ 'opentelemetry-api>=1.26.0' \
+ 'opentelemetry-exporter-otlp>=1.26.0' \
+ 'opentelemetry-semantic-conventions-ai>=0.4.1'"
+ - pytest -v -s v1/tracing
+
+##### fast check tests #####
+##### 1 GPU test #####
+
+- label: Regression Test # 7min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/test_regression
+ commands:
+ - pip install modelscope
+ - pytest -v -s test_regression.py
+ working_dir: "/vllm-workspace/tests" # optional
+
+- label: Engine Test # 25min
+ timeout_in_minutes: 40
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ #grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/engine
+ - tests/tokenization
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
+ - tests/test_vllm_port
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
+ # OOM in the CI unless we run this separately
+ - pytest -v -s tokenization
+
+- label: V1 Test e2e + engine # 30min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+  # TODO: accuracy does not match on H100, regardless of whether
+  # VLLM_USE_FLASHINFER_SAMPLER is set.
+ - pytest -v -s v1/e2e
+ - pytest -v -s v1/engine
+
+- label: V1 Test entrypoints # 35min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ - pytest -v -s v1/entrypoints
+
+- label: V1 Test others # 42min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/spec_decode
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
+ - pytest -v -s v1/test_oracle.py
+ - pytest -v -s v1/test_request.py
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: V1 Test others (CPU) # 5 mins
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+
+- label: Examples Test # 30min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ working_dir: "/vllm-workspace/examples"
+ source_file_dependencies:
+ - vllm/entrypoints
+ - examples/
+ commands:
+ - pip install tensorizer # for tensorizer test
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+ - python3 offline_inference/basic/chat.py
+ - python3 offline_inference/prefix_caching.py
+ - python3 offline_inference/llm_engine_example.py
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_pooling.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+
+- label: Platform Tests (CUDA) # 4min
+ timeout_in_minutes: 15
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/cuda
+ commands:
+ - pytest -v -s cuda/test_cuda_context.py
+
+- label: Samplers Test # 56min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/model_executor/layers
+ - vllm/sampling_metadata.py
+ - tests/samplers
+ - tests/conftest.py
+ commands:
+ - pytest -v -s samplers
+ - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+
+- label: LoRA Test %N # 20min each
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+ - pytest -v -s lora \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --ignore=lora/test_chatglm3_tp.py \
+ --ignore=lora/test_llama_tp.py \
+ --ignore=lora/test_llm_with_multi_loras.py
+ parallelism: 4
+
+- label: PyTorch Compilation Unit Tests # 15min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_pass_manager.py
+ - pytest -v -s compile/test_fusion.py
+ - pytest -v -s compile/test_fusion_attn.py
+ - pytest -v -s compile/test_functionalization.py
+ - pytest -v -s compile/test_silu_mul_quant_fusion.py
+ - pytest -v -s compile/test_sequence_parallelism.py
+ - pytest -v -s compile/test_async_tp.py
+ - pytest -v -s compile/test_fusion_all_reduce.py
+ - pytest -v -s compile/test_decorator.py
+ - pytest -v -s compile/test_noop_elimination.py
+ - pytest -v -s compile/test_aot_compile.py
+
+- label: PyTorch Fullgraph Smoke Test # 15min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/piecewise/
+
+- label: PyTorch Fullgraph Test # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/compile
+ commands:
+ - pytest -v -s compile/test_full_graph.py
+
+- label: Kernels Core Operation Test # 48min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - tests/kernels/core
+ commands:
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
+
+- label: Kernels Attention Test %N # 23min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/attention/
+ - vllm/attention
+ - vllm/v1/attention
+ - tests/kernels/attention
+ commands:
+ - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Quantization Test %N # 64min
+ timeout_in_minutes: 90
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/quantization/
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization
+ commands:
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels MoE Test %N # 40min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
+ - csrc/moe/
+ - tests/kernels/moe
+ - vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
+ commands:
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
+
+- label: Kernels Mamba Test # 31min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/mamba/
+ - tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
+ commands:
+ - pytest -v -s kernels/mamba
+
+- label: Model Executor Test # 23min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/model_executor
+ - tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
+ commands:
+ - apt-get update && apt-get install -y curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+
+- label: Benchmarks # 11min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_8
+ # grade: Blocking
+ working_dir: "/vllm-workspace/.buildkite"
+ source_file_dependencies:
+ - benchmarks/
+ commands:
+ - bash scripts/run-benchmarks.sh
+
+- label: Benchmarks CLI Test # 7min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_8
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/benchmarks/
+ commands:
+ - pytest -v -s benchmarks/
+
+- label: Quantization Test # 70min
+ timeout_in_minutes: 90
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ - tests/quantization
+ commands:
+  # temporary install here since we need nightly; this will move to requirements/test.in
+  # after the torchao 0.12 release, with a working torchao nightly version pinned here
+
+  # since the torchao nightly is currently only compatible with the torch nightly
+  # (https://github.com/pytorch/ao/issues/2919), we have to skip new torchao tests for now
+  # and can only upgrade after that issue is resolved
+  # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+
+- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+
+- label: OpenAI API correctness # 22min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/
+
+- label: OpenAI-Compatible Tool Use # 23 min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ fast_check: false
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ commands:
+ - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/tool_use
+ no_gpu: true
+ commands:
+ - pytest -v -s -m 'cpu_test' tool_use
+
+##### models test #####
+
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_8
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
+ commands:
+  # Only run when vLLM model source is modified: test initialization of a large
+  # subset of supported models (the complement of the small subset in the above
+  # test). Also run if the model initialization test file is modified
+ - pytest -v -s models/test_initialization.py \
+ -k 'not test_can_initialize_small_subset' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 10
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language
+ commands:
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+  # Shard the slow subset of standard language model tests. Only run when model
+  # source is modified, or when the specified test files are modified
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and slow_test' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation \
+ -m hybrid_model \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
+
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation
+ commands:
+ # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+ - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+ - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
+- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling
+ commands:
+ - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing
+
+- label: Multi-Modal Models Test (Standard) # 60min
+ timeout_in_minutes: 80
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Models Test (Extended) 1
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+
+- label: Multi-Modal Models Test (Extended) 2
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models Test (Extended) 3
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+- label: Quantized Models Test # 45 min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ source_file_dependencies:
+ - vllm/model_executor/layers/quantization
+ - tests/models/quantization
+ commands:
+ - pytest -v -s models/quantization
+
+# This test is used only in the PR development phase to test individual models and should never run on main
+- label: Custom Models Test
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ # grade: Blocking
+ optional: true
+ commands:
+ - echo 'Testing custom models...'
+ # PR authors can temporarily add commands below to test individual models
+ # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+ # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
+
+- label: Transformers Nightly Models Test
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ working_dir: "/vllm-workspace/"
+ optional: true
+ commands:
+ - pip install --upgrade git+https://github.com/huggingface/transformers
+ - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
+ - python3 examples/offline_inference/basic/chat.py
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+
+- label: Blackwell Test # 38 min
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/fusion.py
+ - vllm/compilation/fusion_attn.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ # Fusion
+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+
+- label: Blackwell GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
+
+##### 1 GPU test #####
+##### multi gpus test #####
+
+- label: Distributed Comm Ops Test # 7min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/distributed
+ - tests/distributed
+ commands:
+ - pytest -v -s distributed/test_comm_ops.py
+ - pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
+
+- label: 2 Node Tests (4 GPUs in total) # 16min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ - tests/examples/offline_inference/data_parallel.py
+ commands:
+ - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+ - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+ - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+ - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
+ - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+ - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+ - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+ - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+
+- label: Distributed Tests (2 GPUs) # 68min
+ timeout_in_minutes: 90
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/compilation/
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/worker/worker_base.py
+ - vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
+ commands:
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
+ - pytest -v -s ./compile/test_basic_correctness.py
+ - pytest -v -s ./compile/test_wrapper.py
+ - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
+ - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
+ # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
+
+- label: Plugin Tests (2 GPUs) # 40min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+ # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
+ - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s distributed/test_distributed_oot.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process
+ - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+
+- label: Pipeline + Context Parallelism Test # 45min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/
+ - vllm/engine/
+ - vllm/executor/
+ - vllm/model_executor/models/
+ - tests/distributed/
+ commands:
+ - pytest -v -s distributed/test_pp_cudagraph.py
+ - pytest -v -s distributed/test_pipeline_parallel.py
+
+- label: LoRA TP Test (Distributed) # 17 min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/lora
+ - tests/lora
+ commands:
+  # FIXIT: find out which code initializes cuda before running the test
+ # before the fix, we need to use spawn to test it
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ # There is some Tensor Parallelism related processing logic in LoRA that
+ # requires multi-GPU testing for validation.
+ - pytest -v -s -x lora/test_chatglm3_tp.py
+ - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+
+
+- label: Weight Loading Multiple GPU Test # 33min
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ gpu: a100
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/weight_loading
+ commands:
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+
+##### multi gpus test #####
+##### A100 test #####
+
+- label: Distributed Tests (A100) # optional
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/
+ commands:
+ # NOTE: don't test llama model here, it seems hf implementation is buggy
+ # see https://github.com/vllm-project/vllm/pull/5689 for details
+ - pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+ - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - pytest -v -s -x lora/test_mixtral.py
+
+- label: LM Eval Large Models # optional
+ gpu: a100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c476f71c6637..e037f88f0d31 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -6,24 +6,28 @@
# to generate the final pipeline yaml file.
# Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-# in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host;
+# in this case, commands must be specified. the first command runs on the first host, the second
# command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
# When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
@@ -31,16 +35,6 @@
steps:
##### fast check tests #####
-- label: Documentation Build # 2min
- mirror_hardwares: [amdexperimental]
- working_dir: "/vllm-workspace/test_docs"
- fast_check: true
- no_gpu: True
- commands:
- - pip install -r ../requirements/docs.txt
- # TODO: add `--strict` once warnings in docstrings are fixed
- - mkdocs build
-
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
@@ -51,29 +45,36 @@ steps:
commands:
- bash standalone_tests/pytorch_nightly_dependency.sh
-- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- - tests/mq_llm_engine
- - tests/async_engine
- - tests/test_inputs
- tests/multimodal
- - tests/test_utils
- - tests/worker
+ - tests/utils_
+ commands:
+ - pytest -v -s -m 'not cpu_test' multimodal
+ - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+ timeout_in_minutes: 10
+ source_file_dependencies:
+ - vllm/
+ - tests/test_inputs.py
+ - tests/test_outputs.py
+ - tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/transformers_utils
+ no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- - pytest -v -s mq_llm_engine # MQLLMEngine
- - pytest -v -s async_engine # AsyncLLMEngine
- - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- - pytest -v -s multimodal
- - pytest -v -s test_utils.py # Utils
- - pytest -v -s worker # Worker
+ - pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s transformers_utils
-- label: Python-only Installation Test
+- label: Python-only Installation Test # 10min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
@@ -81,43 +82,35 @@ steps:
commands:
- bash standalone_tests/python_only_compile.sh
-- label: Basic Correctness Test # 30min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Basic Correctness Test # 20min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- - tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
-- label: Chunked Prefill Test
- mirror_hardwares: [amdexperimental, amdproduction]
- source_file_dependencies:
- - vllm/
- - tests/basic_correctness/test_chunked_prefill
- commands:
- - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-- label: Core Test # 10min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Entrypoints Unit Tests # 5min
+ timeout_in_minutes: 10
+ working_dir: "/vllm-workspace/tests"
fast_check: true
source_file_dependencies:
- - vllm/core
- - vllm/distributed
- - tests/core
+ - vllm/entrypoints
+ - tests/entrypoints/
commands:
- - pytest -v -s core
+ - pytest -v -s entrypoints/openai/tool_parsers
+ - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
-- label: Entrypoints Test (LLM) # 40min
+- label: Entrypoints Integration Test (LLM) # 30min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -128,14 +121,12 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+ - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
-- label: Entrypoints Test (API Server) # 40min
+- label: Entrypoints Integration Test (API Server) # 100min
+ timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
@@ -146,16 +137,30 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
+ - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
-- label: Distributed Tests (4 GPUs) # 10min
+- label: Entrypoints Integration Test (Pooling)
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ fast_check: true
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/entrypoints/pooling
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s entrypoints/pooling
+
+- label: Distributed Tests (4 GPUs) # 35min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- - vllm/core/
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
@@ -163,24 +168,34 @@ steps:
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- - tests/v1/test_async_llm_dp.py
- - tests/v1/test_external_lb_dp.py
+ - tests/v1/distributed
- tests/v1/engine/test_engine_core_client.py
+ - tests/distributed/test_symm_mem_allreduce.py
commands:
- # test with tp=2 and external_dp=2
- - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
- # test with tp=2 and pp=2
+ # test with torchrun tp=2 and pp=2
- PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with torchrun tp=4 and dp=1
+ - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2, pp=2 and dp=1
+ - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=1 and dp=4 with ep
+ - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+ # test with torchrun tp=2 and dp=2 with ep
+ - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
+ - pytest -v -s distributed/test_symm_mem_allreduce.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- pushd ../examples/offline_inference
@@ -188,7 +203,8 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
-- label: EPLB Algorithm Test
+- label: EPLB Algorithm Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/distributed/eplb
@@ -197,6 +213,7 @@ steps:
- pytest -v -s distributed/test_eplb_algo.py
- label: EPLB Execution Test # 5min
+ timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -205,26 +222,26 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_execute.py
-- label: Metrics, Tracing Test # 10min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Metrics, Tracing Test # 12min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental]
num_gpus: 2
source_file_dependencies:
- vllm/
- - tests/metrics
- - tests/tracing
+ - tests/v1/tracing
commands:
- - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- - pytest -v -s tracing
+ - pytest -v -s v1/tracing
##### fast check tests #####
##### 1 GPU test #####
-- label: Regression Test # 5min
+- label: Regression Test # 7min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -234,7 +251,8 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 10min
+- label: Engine Test # 25min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -249,34 +267,66 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization
-- label: V1 Test
+- label: V1 Test e2e + engine # 30min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/v1
commands:
- # split the test to avoid interference
- - pytest -v -s v1/core
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
- pytest -v -s v1/engine
+
+- label: V1 Test entrypoints # 35min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
- pytest -v -s v1/entrypoints
+
+- label: V1 Test others # 42min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'not cpu_test' v1/core
+ - pytest -v -s v1/executor
+ - pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
+ - pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- - pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- - pytest -v -s v1/kv_connector/unit
- - pytest -v -s v1/metrics
- - pytest -v -s v1/test_serial_utils.py
- - pytest -v -s v1/test_utils.py
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- - pytest -v -s v1/test_metrics_reader.py
- # TODO: accuracy does not match, whether setting
- # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- - pytest -v -s v1/e2e
+ - pytest -v -s v1/test_request.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
-- label: Examples Test # 25min
+- label: V1 Test others (CPU) # 5 mins
+ source_file_dependencies:
+ - vllm/
+ - tests/v1
+ no_gpu: true
+ commands:
+ # split the test to avoid interference
+ - pytest -v -s -m 'cpu_test' v1/core
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_serial_utils.py
+ - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+ - pytest -v -s -m 'cpu_test' v1/metrics
+
+
+- label: Examples Test # 30min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
@@ -293,24 +343,16 @@ steps:
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference/encoder_decoder.py
+ - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-
-- label: Prefix Caching Test # 9min
- mirror_hardwares: [amdexperimental, amdproduction]
- source_file_dependencies:
- - vllm/
- - tests/prefix_caching
- commands:
- - pytest -v -s prefix_caching
-
+ - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+ - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-- label: Platform Tests (CUDA)
+- label: Platform Tests (CUDA) # 4min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
@@ -318,7 +360,8 @@ steps:
commands:
- pytest -v -s cuda/test_cuda_context.py
-- label: Samplers Test # 36min
+- label: Samplers Test # 56min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers
@@ -329,15 +372,28 @@ steps:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-- label: LoRA Test %N # 15min each
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: LoRA Test %N # 20min each
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+ commands:
+ - pytest -v -s lora \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --ignore=lora/test_chatglm3_tp.py \
+ --ignore=lora/test_llama_tp.py \
+ --ignore=lora/test_llm_with_multi_loras.py \
+ --ignore=lora/test_olmoe_tp.py \
+ --ignore=lora/test_deepseekv2_tp.py \
+ --ignore=lora/test_gptoss.py \
+ --ignore=lora/test_qwen3moe_tp.py
+
parallelism: 4
-- label: PyTorch Compilation Unit Tests
+- label: PyTorch Compilation Unit Tests # 15min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -347,42 +403,48 @@ steps:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
+ - pytest -v -s compile/test_functionalization.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- - pytest -v -s compile/test_sequence_parallelism.py
- - pytest -v -s compile/test_async_tp.py
+ - pytest -v -s compile/test_fusion_all_reduce.py
+ - pytest -v -s compile/test_decorator.py
+ - pytest -v -s compile/test_noop_elimination.py
+ - pytest -v -s compile/test_aot_compile.py
-- label: PyTorch Fullgraph Smoke Test # 9min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: PyTorch Fullgraph Smoke Test # 15min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- # these tests need to be separated, cannot combine
- - pytest -v -s compile/piecewise/test_simple.py
- - pytest -v -s compile/piecewise/test_toy_llama.py
- - pytest -v -s compile/piecewise/test_full_cudagraph.py
+ - pytest -v -s compile/piecewise/
-- label: PyTorch Fullgraph Test # 18min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: PyTorch Fullgraph Test # 22min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
+ - pytest -v -s compile/test_fusions_e2e.py
-- label: Kernels Core Operation Test
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Kernels Core Operation Test # 48min
+ timeout_in_minutes: 75
+ mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
- tests/kernels/core
+ - tests/kernels/test_top_k_per_row.py
commands:
- - pytest -v -s kernels/core
+ - pytest -v -s kernels/core kernels/test_top_k_per_row.py
-- label: Kernels Attention Test %N
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Kernels Attention Test %N # 23min
+ timeout_in_minutes: 35
+ mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/attention/
- vllm/attention
@@ -392,74 +454,73 @@ steps:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels Quantization Test %N
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Kernels Quantization Test %N # 64min
+ timeout_in_minutes: 90
+ mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
-- label: Kernels MoE Test
+- label: Kernels MoE Test %N # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
source_file_dependencies:
+ - csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
+ - vllm/distributed/device_communicators/
commands:
- - pytest -v -s kernels/moe
+ - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ parallelism: 2
-- label: Kernels Mamba Test
+- label: Kernels Mamba Test # 31min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
+ - vllm/model_executor/layers/mamba/ops
commands:
- pytest -v -s kernels/mamba
-- label: Tensorizer Test # 11min
+- label: Model Executor Test # 23min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
- soft_fail: true
- source_file_dependencies:
- - vllm/model_executor/model_loader
- - tests/tensorizer_loader
- - tests/entrypoints/openai/test_tensorizer_entrypoint.py
- commands:
- - apt-get update && apt-get install -y curl libsodium23
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s tensorizer_loader
- - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
-- label: Model Executor Test
- mirror_hardwares: [amdexperimental, amdproduction]
- soft_fail: true
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
+ - tests/entrypoints/openai/test_tensorizer_entrypoint.py
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s model_executor
+ - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-- label: Benchmarks # 9min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Benchmarks # 11min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
- benchmarks/
commands:
- bash scripts/run-benchmarks.sh
-- label: Benchmarks CLI Test # 10min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Benchmarks CLI Test # 7min
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/benchmarks/
commands:
- pytest -v -s benchmarks/
-- label: Quantization Test
+- label: Quantization Test # 70min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -467,21 +528,26 @@ steps:
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
- # after torchao 0.12 release
- - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+ # after torchao 0.12 release, and pin a working version of torchao nightly here
+
+ # since torchao nightly is only compatible with torch nightly currently
+ # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
+ # we can only upgrade after this is resolved
+ # TODO(jerryzh168): resolve the above comment
+ - uv pip install --system torchao==0.13.0
+ - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
- working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-- label: OpenAI API correctness
+- label: OpenAI API correctness # 22min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
@@ -490,64 +556,128 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
-- label: Encoder Decoder tests # 5min
+- label: OpenAI-Compatible Tool Use # 23 min
+ timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
+ fast_check: false
source_file_dependencies:
- - vllm/
- - tests/encoder_decoder
+ - vllm/
+ - tests/tool_use
commands:
- - pytest -v -s encoder_decoder
+ - pytest -v -s -m 'not cpu_test' tool_use
-- label: OpenAI-Compatible Tool Use # 20 min
- mirror_hardwares: [amdexperimental]
- fast_check: false
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+ timeout_in_minutes: 10
source_file_dependencies:
- vllm/
- tests/tool_use
- - tests/mistral_tool_use
+ no_gpu: true
commands:
- - pytest -v -s tool_use
- - pytest -v -s mistral_tool_use
+ - pytest -v -s -m 'cpu_test' tool_use
##### models test #####
-- label: Basic Models Test # 24min
+- label: Basic Models Tests (Initialization)
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- - tests/models
+ - tests/models/test_initialization.py
+ commands:
+ # Run a subset of model initialization tests
+ - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
+
+- label: Basic Models Tests (Extra Initialization) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/test_initialization.py
commands:
- - pytest -v -s models/test_transformers.py
- - pytest -v -s models/test_registry.py
- - pytest -v -s models/test_utils.py
- - pytest -v -s models/test_vision.py
- - pytest -v -s models/test_initialization.py
+ # Only when vLLM model source is modified - test initialization of a large
+ # subset of supported models (the complement of the small subset in the above
+ # test.) Also run if model initialization test file is modified
+ - pytest -v -s models/test_initialization.py \
+ -k 'not test_can_initialize_small_subset' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
-- label: Language Models Test (Standard)
+- label: Basic Models Tests (Other)
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_transformers.py
+ - tests/models/test_registry.py
+ commands:
+ - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+ timeout_in_minutes: 10
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/test_utils.py
+ - tests/models/test_vision.py
+ no_gpu: true
+ commands:
+ - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Language Models Tests (Standard)
+ timeout_in_minutes: 25
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
commands:
- # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+ # Test standard language models, excluding a subset of slow tests
+ - pip freeze | grep -E 'torch'
+ - pytest -v -s models/language -m 'core_model and (not slow_test)'
+
+- label: Language Models Tests (Extra Standard) %N
+ timeout_in_minutes: 45
+ mirror_hardwares: [amdexperimental]
+ torch_nightly: true
+ source_file_dependencies:
+ - vllm/model_executor/models/
+ - tests/models/language/pooling/test_embedding.py
+ - tests/models/language/generation/test_common.py
+ - tests/models/language/pooling/test_classification.py
+ commands:
+ # Shard slow subset of standard language models tests. Only run when model
+ # source is modified, or when specified test files are modified
- pip freeze | grep -E 'torch'
- - pytest -v -s models/language -m core_model
+ - pytest -v -s models/language -m 'core_model and slow_test' \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Hybrid) %N
+ timeout_in_minutes: 75
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
commands:
- # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- - pytest -v -s models/language/generation -m hybrid_model
+ # Install fast path packages for testing against transformers
+ # Note: also needed to run plamo2 model in vLLM
+ - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
+ - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+ # Shard hybrid language model tests
+ - pytest -v -s models/language/generation \
+ -m hybrid_model \
+ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+ --shard-id=$$BUILDKITE_PARALLEL_JOB
+ parallelism: 2
-- label: Language Models Test (Extended Generation) # 1hr20min
+- label: Language Models Test (Extended Generation) # 80min
+ timeout_in_minutes: 110
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -558,7 +688,18 @@ steps:
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
+- label: Language Models Test (PPL)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/generation_ppl_test
+ commands:
+ - pytest -v -s models/language/generation_ppl_test
+
- label: Language Models Test (Extended Pooling) # 36min
+ timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
optional: true
source_file_dependencies:
@@ -567,7 +708,27 @@ steps:
commands:
- pytest -v -s models/language/pooling -m 'not core_model'
-- label: Multi-Modal Models Test (Standard)
+- label: Language Models Test (MTEB)
+ timeout_in_minutes: 110
+ mirror_hardwares: [amdexperimental]
+ optional: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/language/pooling_mteb_test
+ commands:
+ - pytest -v -s models/language/pooling_mteb_test
+
+- label: Multi-Modal Processor Test # 44min
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing
+
+- label: Multi-Modal Models Test (Standard) # 60min
+ timeout_in_minutes: 80
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@@ -576,9 +737,18 @@ steps:
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- - pytest -v -s models/multimodal/processing
- - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+ - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+ - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+ timeout_in_minutes: 70
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - vllm/multimodal/
+ - vllm/inputs/
+ - vllm/v1/core/
+ commands:
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
@@ -588,7 +758,7 @@ steps:
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+ - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models Test (Extended) 2
mirror_hardwares: [amdexperimental]
@@ -610,8 +780,9 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-- label: Quantized Models Test
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Quantized Models Test # 45 min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/layers/quantization
- tests/models/quantization
@@ -620,7 +791,7 @@ steps:
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental]
optional: true
commands:
- echo 'Testing custom models...'
@@ -634,17 +805,118 @@ steps:
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- - python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # Whisper needs spawn method to avoid deadlock
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+
+- label: Blackwell Test # 21 min
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ # optional: true
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - csrc/attention/mla/
+ - csrc/quantization/cutlass_w8a8/moe/
+ - vllm/model_executor/layers/fused_moe/cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+ - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - nvidia-smi
+ - python3 examples/offline_inference/basic/chat.py
+ # Attention
+ # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
+ - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
+ - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+ - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
+ # Quantization
+ - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+ - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+ - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+ - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
+ - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
+ - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ commands:
+ - nvidia-smi
+ - pytest -v -s tests/compile/test_fusion_attn.py
+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+ # this runner has 2 GPUs available even though num_gpus=2 is not set
+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/test_fusions_e2e.py
+
+- label: Blackwell GPT-OSS Eval
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - tests/evals/gpt_oss
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - uv pip install --system 'gpt-oss[eval]==0.0.5'
+ - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+ timeout_in_minutes: 60
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ source_file_dependencies:
+ - tests/quantization/test_blackwell_moe.py
+ - vllm/model_executor/models/deepseek_v2.py
+ - vllm/model_executor/models/gpt_oss.py
+ - vllm/model_executor/models/llama4.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization/compressed_tensors
+ - vllm/model_executor/layers/quantization/modelopt.py
+ - vllm/model_executor/layers/quantization/mxfp4.py
+ - vllm/v1/attention/backends/flashinfer.py
+ commands:
+ - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+ timeout_in_minutes: 120
+ gpu: b200
+ optional: true # run on nightlies
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
- mirror_hardwares: [amdexperimental, amdproduction]
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@@ -653,8 +925,11 @@ steps:
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
+ - pytest -v -s distributed/test_shm_buffer.py
+ - pytest -v -s distributed/test_shm_storage.py
- label: 2 Node Tests (4 GPUs in total) # 16min
+ timeout_in_minutes: 30
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -678,47 +953,61 @@ steps:
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
-- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 68min
+ timeout_in_minutes: 90
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
+ - vllm/compilation/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- - vllm/model_executor/models/
- - tests/distributed/
- - vllm/compilation
- vllm/worker/worker_base.py
- - vllm/worker/worker.py
- - vllm/worker/model_runner.py
- - entrypoints/llm/test_collective_rpc.py
- - tests/v1/test_async_llm_dp.py
- - tests/v1/test_external_lb_dp.py
- - tests/v1/entrypoints/openai/test_multi_api_servers.py
- vllm/v1/engine/
+ - vllm/v1/worker/
+ - tests/compile/test_basic_correctness.py
+ - tests/compile/test_wrapper.py
+ - tests/distributed/
+ - tests/entrypoints/llm/test_collective_rpc.py
+ - tests/v1/distributed
+ - tests/v1/entrypoints/openai/test_multi_api_servers.py
+ - tests/v1/shutdown
+ - tests/v1/worker/test_worker_memory_snapshot.py
commands:
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+ - pytest -v -s distributed/test_sequence_parallel.py
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+ timeout_in_minutes: 50
+ mirror_hardwares: [amdexperimental]
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/model_executor/model_loader/sharded_state_loader.py
+ - vllm/model_executor/models/
+ - tests/basic_correctness/
+ - tests/model_executor/model_loader/test_sharded_state_loader.py
+ - tests/models/
+ commands:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+ - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
- - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
- # test sequence parallel
- - pytest -v -s distributed/test_sequence_parallel.py
- # this test fails consistently.
- # TODO: investigate and fix
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+ - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+ - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
- label: Plugin Tests (2 GPUs) # 40min
+ timeout_in_minutes: 60
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -731,6 +1020,16 @@ steps:
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
+ - pip install -e ./plugins/prithvi_io_processor_plugin
+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
+ - pip uninstall prithvi_io_processor_plugin -y
+ # end io_processor plugins test
+ # begin stat_logger plugins test
+ - pip install -e ./plugins/vllm_add_dummy_stat_logger
+ - pytest -v -s plugins_tests/test_stats_logger_plugins.py
+ - pip uninstall dummy_stat_logger -y
+ # end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@@ -739,29 +1038,9 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
-- label: Multi-step Tests (4 GPUs) # 36min
- mirror_hardwares: [amdexperimental, amdproduction]
- working_dir: "/vllm-workspace/tests"
- num_gpus: 4
- source_file_dependencies:
- - vllm/model_executor/layers/sampler.py
- - vllm/sequence.py
- - vllm/worker/worker_base.py
- - vllm/worker/worker.py
- - vllm/worker/multi_step_worker.py
- - vllm/worker/model_runner_base.py
- - vllm/worker/model_runner.py
- - vllm/worker/multi_step_model_runner.py
- - vllm/engine
- - tests/multi_step
- commands:
- # this test is quite flaky
- # TODO: investigate and fix.
- # - pytest -v -s multi_step/test_correctness_async_llm.py
- - pytest -v -s multi_step/test_correctness_llm.py
-
-- label: Pipeline Parallelism Test # 45min
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: Pipeline + Context Parallelism Test # 45min
+ timeout_in_minutes: 60
+ mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -774,8 +1053,9 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
-- label: LoRA TP Test (Distributed)
- mirror_hardwares: [amdexperimental, amdproduction]
+- label: LoRA TP Test (Distributed) # 17 min
+ timeout_in_minutes: 30
+ mirror_hardwares: [amdexperimental]
num_gpus: 4
source_file_dependencies:
- vllm/lora
@@ -788,12 +1068,16 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
+ - pytest -v -s -x lora/test_olmoe_tp.py
- label: Weight Loading Multiple GPU Test # 33min
+ timeout_in_minutes: 45
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
+ optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
@@ -811,6 +1095,17 @@ steps:
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+- label: NixlConnector PD accuracy tests (Distributed) # 30min
+ timeout_in_minutes: 30
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ source_file_dependencies:
+ - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+ - tests/v1/kv_connector/nixl_integration/
+ commands:
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+ - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test #####
@@ -841,3 +1136,39 @@ steps:
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+
+##### H200 test #####
+- label: Distributed Tests (H200) # optional
+ gpu: h200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/compile/test_async_tp.py
+ - pytest -v -s tests/compile/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+
+##### B200 test #####
+- label: Distributed Tests (B200) # optional
+ gpu: b200
+ optional: true
+ working_dir: "/vllm-workspace/"
+ num_gpus: 2
+ commands:
+ - pytest -v -s tests/distributed/test_context_parallel.py
+ - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+
+##### RL Integration Tests #####
+- label: Prime-RL Integration Test # 15min
+ timeout_in_minutes: 30
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ source_file_dependencies:
+ - vllm/
+ - .buildkite/scripts/run-prime-rl-test.sh
+ commands:
+ - bash .buildkite/scripts/run-prime-rl-test.sh
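Note on the Distributed Tests (4 GPUs) step above: the torchrun lines for distributed/test_torchrun_example_moe.py sweep TP_SIZE/PP_SIZE/DP_SIZE/ENABLE_EP combinations whose product must fill the 4-GPU world size. A minimal illustrative sketch of that invariant, assuming the example reads its layout from these environment variables (the helper below is not vLLM code):

    import os

    def parallel_layout(world_size: int = 4) -> tuple[int, int, int, bool]:
        # The CI lines above encode the layout in environment variables; unset means 1.
        tp = int(os.getenv("TP_SIZE", "1"))
        pp = int(os.getenv("PP_SIZE", "1"))
        dp = int(os.getenv("DP_SIZE", "1"))
        enable_ep = os.getenv("ENABLE_EP", "0") == "1"
        # Every combination exercised above satisfies tp * pp * dp == nproc-per-node.
        assert tp * pp * dp == world_size, (
            f"TP({tp}) x PP({pp}) x DP({dp}) != world size ({world_size})"
        )
        return tp, pp, dp, enable_ep

    if __name__ == "__main__":
        # Example invocation mirroring one CI line:
        #   TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 python check_layout.py
        print(parallel_layout())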
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 000000000000..b7a9fdb4e05a
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,47 @@
+[run]
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+ vllm
+ */dist-packages/vllm
+ */site-packages/vllm
+omit =
+ */tests/*
+ */test_*
+ */__pycache__/*
+ */build/*
+ */dist/*
+ */vllm.egg-info/*
+ */third_party/*
+ */examples/*
+ */benchmarks/*
+ */docs/*
+
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+ vllm
+ /vllm-workspace/src/vllm
+ /vllm-workspace/vllm
+ */site-packages/vllm
+ */dist-packages/vllm
+
+[report]
+exclude_lines =
+ pragma: no cover
+ def __repr__
+ if self.debug:
+ if settings.DEBUG
+ raise AssertionError
+ raise NotImplementedError
+ if 0:
+ if __name__ == .__main__.:
+ class .*\bProtocol\):
+ @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
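The [paths] mapping above only takes effect when per-run coverage data files are merged. A minimal sketch of that merge with the coverage.py API; the .coverage.shard* file names are illustrative:

    import coverage

    # Combine per-shard data files so that installed-package paths
    # (*/site-packages/vllm, */dist-packages/vllm, ...) are remapped onto the
    # canonical "vllm" source tree defined in [paths] above.
    cov = coverage.Coverage(config_file=".coveragerc")
    cov.combine([".coverage.shard0", ".coverage.shard1"])
    cov.save()
    cov.xml_report(outfile="coverage.xml")    # matches the [xml] output above
    cov.html_report(directory="htmlcov")      # matches the [html] directory above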
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 000000000000..5a601d00cef8
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
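For the ignore list above to take effect in local git blame output, each clone has to opt in once via git config. A small sketch wrapping that one-time call:

    import subprocess

    def enable_blame_ignore_revs(repo_path: str = ".") -> None:
        # Equivalent to running once inside the checkout:
        #   git config blame.ignoreRevsFile .git-blame-ignore-revs
        subprocess.run(
            ["git", "config", "blame.ignoreRevsFile", ".git-blame-ignore-revs"],
            cwd=repo_path,
            check=True,
        )

    if __name__ == "__main__":
        enable_blame_ignore_revs()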
diff --git a/.github/.bc-linter.yml b/.github/.bc-linter.yml
new file mode 100644
index 000000000000..443dfa45af22
--- /dev/null
+++ b/.github/.bc-linter.yml
@@ -0,0 +1,24 @@
+# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
+version: 1
+paths:
+# We temporarily disable globally, and will only enable with `annotations.include`
+# include:
+#   - "vllm/v1/attention/*.py"
+# - "vllm/v1/core/*.py"
+exclude:
+ - "**/*.py"
+
+scan:
+ functions: true # check free functions and methods
+ classes: true # check classes/dataclasses
+ public_only: true # ignore names starting with "_" at any level
+
+annotations:
+ include: # decorators that force‑include a symbol
+ - name: "bc_linter_include" # matched by simple name or dotted suffix
+ propagate_to_members: false # for classes, include methods/inner classes
+ exclude: # decorators that force‑exclude a symbol
+ - name: "bc_linter_skip" # matched by simple name or dotted suffix
+ propagate_to_members: true # for classes, exclude methods/inner classes
+
+excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
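The annotations block above matches decorators purely by simple name or dotted suffix. A hypothetical sketch of how such markers could be defined and applied; the decorator bodies and decorated names below are illustrative, not vLLM's actual API:

    def bc_linter_include(obj):
        # Hypothetical no-op marker: force BC checks on `obj` even though
        # "**/*.py" is globally excluded above.
        return obj

    def bc_linter_skip(obj):
        # Hypothetical no-op marker: exclude `obj`; with propagate_to_members: true,
        # a decorated class's methods are skipped as well.
        return obj

    @bc_linter_include
    def get_kv_cache_layout(num_blocks: int, block_size: int) -> tuple[int, int]:
        return num_blocks, block_size

    @bc_linter_skip
    class ExperimentalSchedulerState:
        def reset(self) -> None:
            ...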
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 8c68bc8f02b6..14301fe8d847 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,53 +2,127 @@
# for more info about CODEOWNERS file
 # This list covers the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/mamba @tdoublep
+/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig can have a large user-facing impact,
# so spam a lot of people
-/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
+/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
-/tests/kernels @tlrmchlsmth @WoosukKwon
-/tests/model_executor/test_guided_processors.py @mgoin @russellb
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
-/tests/multi_step @alexm-redhat @comaniac
-/tests/multimodal @DarkLight1337 @ywang96
-/tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/weight_loading @mgoin @youkaichao
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
+/tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
+
+# Transformers backend
+/vllm/model_executor/models/transformers @hmellor
+/tests/models/test_transformers.py @hmellor
# Docs
-/docs @hmellor
+/docs/mkdocs @hmellor
+/docs/**/*.yml @hmellor
+/requirements/docs.txt @hmellor
+.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor
+
+# Linting
+.markdownlint.yaml @hmellor
+.pre-commit-config.yaml @hmellor
+/tools/pre_commit @hmellor
+
+# CPU
+/vllm/v1/worker/cpu* @bigPYJ1151
+/csrc/cpu @bigPYJ1151
+/vllm/platforms/cpu.py @bigPYJ1151
+/cmake/cpu_extension.cmake @bigPYJ1151
+/docker/Dockerfile.cpu @bigPYJ1151
+
+# Intel GPU
+/vllm/v1/worker/xpu* @jikunshang
+/vllm/platforms/xpu.py @jikunshang
+/docker/Dockerfile.xpu @jikunshang
+
+# Qwen-specific files
+/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
+/vllm/model_executor/models/qwen* @sighingnow
+
+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong
+
+# Mistral-specific files
+/vllm/model_executor/models/mistral*.py @patrickvonplaten
+/vllm/model_executor/models/mixtral*.py @patrickvonplaten
+/vllm/model_executor/models/voxtral*.py @patrickvonplaten
+/vllm/model_executor/models/pixtral*.py @patrickvonplaten
+/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
+/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
+
+# Kernels
+/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/attention/ops/triton_unified_attention.py @tdoublep
+
+# ROCm related: specify owner with write access to notify AMD folks for careful code review
+/docker/Dockerfile.rocm* @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+
+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml
index 7ee57c42895c..c0e009855964 100644
--- a/.github/ISSUE_TEMPLATE/750-RFC.yml
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -43,10 +43,6 @@ body:
Any other things you would like to mention.
validations:
required: false
-- type: markdown
- attributes:
- value: >
- Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 017ec7ca82da..8043df65d558 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,10 +1,5 @@
-## Essential Elements of an Effective PR Description Checklist
-- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
-- [ ] The test plan, such as providing test command.
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results
-- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
-
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
+
+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
## Purpose
@@ -12,7 +7,15 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
## Test Result
-## (Optional) Documentation Update
+---
+
+ Essential Elements of an Effective PR Description Checklist
+
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
+
-
**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions)
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 5c878ac02069..de1a8314a4ec 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -2,6 +2,7 @@ pull_request_rules:
- name: label-documentation
description: Automatically apply documentation label
conditions:
+ - label != stale
- or:
- files~=^[^/]+\.md$
- files~=^docs/
@@ -10,10 +11,13 @@ pull_request_rules:
label:
add:
- documentation
+ comment:
+ message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
- name: label-ci-build
description: Automatically apply ci/build label
conditions:
+ - label != stale
- or:
- files~=^\.github/
- files~=\.buildkite/
@@ -30,6 +34,7 @@ pull_request_rules:
- name: label-deepseek
description: Automatically apply deepseek label
conditions:
+ - label != stale
- or:
- files~=^examples/.*deepseek.*\.py
- files~=^tests/.*deepseek.*\.py
@@ -46,6 +51,7 @@ pull_request_rules:
- name: label-frontend
description: Automatically apply frontend label
conditions:
+ - label != stale
- files~=^vllm/entrypoints/
actions:
label:
@@ -55,6 +61,7 @@ pull_request_rules:
- name: label-llama
description: Automatically apply llama label
conditions:
+ - label != stale
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
@@ -70,6 +77,7 @@ pull_request_rules:
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
+ - label != stale
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
@@ -83,6 +91,7 @@ pull_request_rules:
- name: label-new-model
description: Automatically apply new-model label
conditions:
+ - label != stale
- and:
- files~=^vllm/model_executor/models/
- files=vllm/model_executor/models/registry.py
@@ -94,6 +103,7 @@ pull_request_rules:
- name: label-performance
description: Automatically apply performance label
conditions:
+ - label != stale
- or:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
@@ -107,6 +117,7 @@ pull_request_rules:
- name: label-qwen
description: Automatically apply qwen label
conditions:
+ - label != stale
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
@@ -118,9 +129,32 @@ pull_request_rules:
add:
- qwen
+- name: label-gpt-oss
+ description: Automatically apply gpt-oss label
+ conditions:
+ - label != stale
+ - or:
+ - files~=^examples/.*gpt[-_]?oss.*\.py
+ - files~=^tests/.*gpt[-_]?oss.*\.py
+ - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+ - files~=^tests/entrypoints/test_context.py
+ - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
+ - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+ - files~=^vllm/entrypoints/harmony_utils.py
+ - files~=^vllm/entrypoints/tool_server.py
+ - files~=^vllm/entrypoints/tool.py
+ - files~=^vllm/entrypoints/context.py
+ - title~=(?i)gpt[-_]?oss
+ - title~=(?i)harmony
+ actions:
+ label:
+ add:
+ - gpt-oss
+
- name: label-rocm
description: Automatically apply rocm label
conditions:
+ - label != stale
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
@@ -141,6 +175,7 @@ pull_request_rules:
- name: label-structured-output
description: Automatically apply structured-output label
conditions:
+ - label != stale
- or:
- files~=^benchmarks/structured_schemas/
- files=benchmarks/benchmark_serving_structured_output.py
@@ -149,11 +184,8 @@ pull_request_rules:
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- - files~=^vllm/model_executor/guided_decoding/
- - files=tests/model_executor/test_guided_processors.py
- - files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- - files=tests/v1/entrypoints/llm/test_guided_generate.py
+ - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/
actions:
label:
@@ -163,6 +195,7 @@ pull_request_rules:
- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
+ - label != stale
- or:
- files~=^vllm/v1/spec_decode/
- files~=^tests/v1/spec_decode/
@@ -178,6 +211,7 @@ pull_request_rules:
- name: label-v1
description: Automatically apply v1 label
conditions:
+ - label != stale
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
@@ -190,6 +224,7 @@ pull_request_rules:
description: Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions:
+ - label != stale
- or:
- files~=tpu.py
- files~=_tpu
@@ -205,6 +240,7 @@ pull_request_rules:
description: Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions:
+ - label != stale
- and:
- -files~=tpu.py
- -files~=_tpu
@@ -219,9 +255,9 @@ pull_request_rules:
- name: label-tool-calling
description: Automatically add tool-calling label
conditions:
+ - label != stale
- or:
- files~=^tests/tool_use/
- - files~=^tests/mistral_tool_use/
- files~=^tests/entrypoints/openai/tool_parsers/
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
- files~=^vllm/entrypoints/openai/tool_parsers/
@@ -238,8 +274,9 @@ pull_request_rules:
- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- - conflict
- - -closed
+ - label != stale
+ - conflict
+ - -closed
actions:
label:
add:
@@ -253,20 +290,55 @@ pull_request_rules:
- name: assign reviewer for tensorizer changes
conditions:
+ - label != stale
+ - or:
- files~=^vllm/model_executor/model_loader/tensorizer.py
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
- - files~=^tests/tensorizer_loader/
+ - files~=^tests/model_executor/model_loader/tensorizer_loader/
actions:
assign:
users:
- "sangstar"
+- name: assign reviewer for modelopt changes
+ conditions:
+ - label != stale
+ - or:
+ - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+ - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+ - files~=^tests/models/quantization/test_modelopt\.py$
+ - files~=^tests/quantization/test_modelopt\.py$
+ - files~=^tests/models/quantization/test_nvfp4\.py$
+ - files~=^docs/features/quantization/modelopt\.md$
+ actions:
+ assign:
+ users:
+ - "Edwardf0t1"
+
- name: remove 'needs-rebase' label when conflict is resolved
conditions:
- - -conflict
- - -closed
+ - -conflict
+ - -closed
actions:
label:
remove:
- needs-rebase
+
+- name: label-kv-connector
+ description: Automatically apply kv-connector label
+ conditions:
+ - label != stale
+ - or:
+ - files~=^examples/online_serving/disaggregated[^/]*/.*
+ - files~=^examples/offline_inference/disaggregated[^/]*/.*
+ - files~=^examples/others/lmcache/
+ - files~=^tests/v1/kv_connector/
+ - files~=^vllm/distributed/kv_transfer/
+ - title~=(?i)\bP/?D\b
+ - title~=(?i)NIXL
+ - title~=(?i)LMCache
+ actions:
+ label:
+ add:
+ - kv-connector
\ No newline at end of file
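
Mergify evaluates the `files~=` and `title~=` conditions above as regular expressions against each changed file path and the PR title. A minimal Python sketch of how the kv-connector patterns behave, assuming `re.search` approximates Mergify's `~=` operator (the sample paths and titles are hypothetical):

import re

# Patterns copied from the label-kv-connector rule above.
file_patterns = [
    r"^examples/online_serving/disaggregated[^/]*/.*",
    r"^tests/v1/kv_connector/",
    r"^vllm/distributed/kv_transfer/",
]
title_patterns = [r"(?i)\bP/?D\b", r"(?i)NIXL", r"(?i)LMCache"]

def matches_kv_connector_rule(changed_files, title):
    # The rule fires if any file pattern or any title pattern matches.
    file_hit = any(re.search(p, f) for p in file_patterns for f in changed_files)
    title_hit = any(re.search(p, title) for p in title_patterns)
    return file_hit or title_hit

print(matches_kv_connector_rule(["examples/online_serving/disaggregated_serving/run.sh"], ""))  # True
print(matches_kv_connector_rule([], "Fix NIXL handshake race"))  # True
print(matches_kv_connector_rule(["vllm/attention/layer.py"], "Refactor attention"))  # False
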
diff --git a/.github/scale-config.yml b/.github/scale-config.yml
new file mode 100644
index 000000000000..c41a3ee3eb19
--- /dev/null
+++ b/.github/scale-config.yml
@@ -0,0 +1,21 @@
+# scale-config.yml:
+# Powers which instance types are available for GHA auto-scaled
+# runners. Runners listed here will be available as self-hosted
+# runners; the configuration is pulled directly from the main branch.
+# runner_types:
+# runner_label:
+# instance_type: m4.large
+# os: linux
+# # min_available defaults to the global cfg in the ALI Terraform
+# min_available: undefined
+# # when max_available value is not defined, no maximum number of runners is enforced
+# max_available: undefined
+# disk_size: 50
+# is_ephemeral: true
+
+runner_types:
+ linux.2xlarge:
+ disk_size: 150
+ instance_type: c5.2xlarge
+ is_ephemeral: true
+ os: linux
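
The single runner entry follows the schema sketched in the comment block above. A minimal, hypothetical validation sketch (assuming PyYAML is installed and the repository is checked out) that loads the file and checks the documented fields:

import yaml

REQUIRED_FIELDS = {"instance_type", "os", "disk_size", "is_ephemeral"}

with open(".github/scale-config.yml") as f:
    config = yaml.safe_load(f)

for name, spec in config["runner_types"].items():
    missing = REQUIRED_FIELDS - spec.keys()
    assert not missing, f"runner {name!r} is missing fields: {missing}"
    print(name, spec["instance_type"], "ephemeral" if spec["is_ephemeral"] else "persistent")
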
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
index 8d65936fba1d..25af344aab2b 100755
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
-# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
-sed -i '/FIX #xxxx.*$/d' "${NEW}"
+# Remove markdown comments (like the <!-- ... --> at the start)
+sed -i '/<!--.*-->$/d' "${NEW}"
-# Remove "FILL IN THE PR DESCRIPTION HERE"
-sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
+sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
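
Taken together, the sed commands in this script drop comment and template lines and truncate everything from the checklist marker onwards. A minimal Python sketch of the same filtering (the HTML-comment pattern follows the reconstructed sed expression above; this is an illustration, not the script itself):

import re

def cleanup_pr_body(body: str) -> str:
    kept = []
    for line in body.splitlines():
        # sed '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d': drop this line and everything after it.
        if re.search(r"\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*", line):
            break
        # sed '/<!--.*-->$/d': drop markdown (HTML) comment lines.
        if re.search(r"<!--.*-->$", line):
            continue
        # sed '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d': drop the template prompt.
        if "PLEASE FILL IN THE PR DESCRIPTION HERE" in line:
            continue
        kept.append(line)
    return "\n".join(kept)

print(cleanup_pr_body("<!-- markdown comment -->\nActual description\n**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST**\n- [ ] item"))
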
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
index 315042fbf5cf..d8bbedef3174 100644
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Add label
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
github.rest.issues.addLabels({
diff --git a/.github/workflows/bc-lint.yml b/.github/workflows/bc-lint.yml
new file mode 100644
index 000000000000..823695a92132
--- /dev/null
+++ b/.github/workflows/bc-lint.yml
@@ -0,0 +1,29 @@
+name: BC Lint
+
+on:
+ pull_request:
+ types:
+ - opened
+ - synchronize
+ - reopened
+ - labeled
+ - unlabeled
+
+jobs:
+ bc_lint:
+ if: github.repository_owner == 'vllm-project'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Run BC Lint Action
+ uses: pytorch/test-infra/.github/actions/bc-lint@main
+ with:
+ repo: ${{ github.event.pull_request.head.repo.full_name }}
+ base_sha: ${{ github.event.pull_request.base.sha }}
+ head_sha: ${{ github.event.pull_request.head.sha }}
+ suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
+ docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
+ config_dir: .github
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
+ cancel-in-progress: true
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index d5c6b8d43a6e..c3e132a536a4 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
new file mode 100644
index 000000000000..7d565ef9f2e4
--- /dev/null
+++ b/.github/workflows/issue_autolabel.yml
@@ -0,0 +1,361 @@
+name: Label issues based on keywords
+on:
+ issues:
+ types: [opened, edited, reopened]
+permissions:
+ issues: write # needed so the workflow can add labels
+ contents: read
+concurrency:
+ group: issue-labeler-${{ github.event.issue.number }}
+ cancel-in-progress: true
+jobs:
+ add-labels:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Label issues based on keywords
+ id: label-step
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+ with:
+ script: |
+ // Configuration: Add new labels and keywords here
+ const labelConfig = {
+ rocm: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "composable kernel",
+ searchIn: "both"
+ },
+ {
+ term: "rccl",
+ searchIn: "body" // only search in body
+ },
+ {
+ term: "migraphx",
+ searchIn: "title" // only search in title
+ },
+ {
+ term: "hipgraph",
+ searchIn: "both"
+ },
+ {
+ term: "ROCm System Management Interface",
+ searchIn: "body"
+ },
+ ],
+ // Substring search - matches anywhere in text (partial matches)
+ substrings: [
+ {
+ term: "VLLM_ROCM_",
+ searchIn: "both"
+ },
+ {
+ term: "aiter",
+ searchIn: "title"
+ },
+ {
+ term: "rocm",
+ searchIn: "title"
+ },
+ {
+ term: "amd",
+ searchIn: "title"
+ },
+ {
+ term: "hip-",
+ searchIn: "both"
+ },
+ {
+ term: "gfx",
+ searchIn: "both"
+ },
+ {
+ term: "cdna",
+ searchIn: "both"
+ },
+ {
+ term: "rdna",
+ searchIn: "both"
+ },
+ {
+ term: "torch_hip",
+ searchIn: "body" // only in body
+ },
+ {
+ term: "_hip",
+ searchIn: "both"
+ },
+ {
+ term: "hip_",
+ searchIn: "both"
+ },
+ // ROCm tools and libraries
+ {
+ term: "hipify",
+ searchIn: "both"
+ },
+ ],
+ // Regex patterns - for complex pattern matching
+ regexPatterns: [
+ {
+ pattern: "\\bmi\\d{3}[a-z]*\\b",
+ description: "AMD GPU names (mi + 3 digits + optional letters)",
+ flags: "gi",
+ searchIn: "both" // "title", "body", or "both"
+ }
+ ],
+ },
+ // Add more label configurations here as needed
+ // example: {
+ // keywords: [...],
+ // substrings: [...],
+ // regexPatterns: [...]
+ // },
+ };
+ // Helper function to create regex based on search type
+ function createSearchRegex(term, type) {
+ // Escape special regex characters in the term
+ const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+ switch (type) {
+ case 'keyword':
+ // Word boundary search - matches whole words only
+ return new RegExp(`\\b${escapedTerm}\\b`, "gi");
+ case 'substring':
+ // Substring search - matches anywhere in the text
+ return new RegExp(escapedTerm, "gi");
+ default:
+ throw new Error(`Unknown search type: ${type}`);
+ }
+ }
+ // Helper function to find matching terms in text with line information
+ function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
+ const matches = [];
+ const lines = text.split('\n');
+ for (const termConfig of searchTerms) {
+ let regex;
+ let term, searchIn, pattern, description, flags;
+ // Handle different input formats (string or object)
+ if (typeof termConfig === 'string') {
+ term = termConfig;
+ searchIn = 'both'; // default
+ } else {
+ term = termConfig.term;
+ searchIn = termConfig.searchIn || 'both';
+ pattern = termConfig.pattern;
+ description = termConfig.description;
+ flags = termConfig.flags;
+ }
+ // Skip if this term shouldn't be searched in the current location
+ if (searchIn !== 'both' && searchIn !== searchLocation) {
+ continue;
+ }
+ // Create appropriate regex
+ if (searchType === 'regex') {
+ regex = new RegExp(pattern, flags || "gi");
+ } else {
+ regex = createSearchRegex(term, searchType);
+ }
+ const termMatches = [];
+ // Check each line for matches
+ lines.forEach((line, lineIndex) => {
+ const lineMatches = line.match(regex);
+ if (lineMatches) {
+ lineMatches.forEach(match => {
+ termMatches.push({
+ match: match,
+ lineNumber: lineIndex + 1,
+ lineContent: line.trim(),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ originalTerm: term || pattern,
+ description: description,
+ // Show context around the match in the line
+ context: line.length > 100 ?
+ line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
+ line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
+ : line.trim()
+ });
+ });
+ }
+ });
+ if (termMatches.length > 0) {
+ matches.push({
+ term: term || (description || pattern),
+ searchType: searchType,
+ searchLocation: searchLocation,
+ searchIn: searchIn,
+ pattern: pattern,
+ matches: termMatches,
+ count: termMatches.length
+ });
+ }
+ }
+ return matches;
+ }
+ // Helper function to check if label should be added
+ async function processLabel(labelName, config) {
+ const body = context.payload.issue.body || "";
+ const title = context.payload.issue.title || "";
+ core.notice(`Processing label: ${labelName}`);
+ core.notice(`Issue Title: "${title}"`);
+ core.notice(`Issue Body length: ${body.length} characters`);
+ let shouldAddLabel = false;
+ let allMatches = [];
+ let reason = '';
+ const keywords = config.keywords || [];
+ const substrings = config.substrings || [];
+ const regexPatterns = config.regexPatterns || [];
+ core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
+ // Search in title
+ if (title.trim()) {
+ core.notice(`Searching in title: "${title}"`);
+ const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
+ const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
+ const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
+ allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
+ }
+ // Search in body
+ if (body.trim()) {
+ core.notice(`Searching in body (${body.length} characters)`);
+ const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
+ const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
+ const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
+ allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
+ }
+ if (allMatches.length > 0) {
+ core.notice(`Found ${allMatches.length} matching term(s):`);
+ for (const termMatch of allMatches) {
+ const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
+ const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
+ if (termMatch.searchType === 'regex') {
+ core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ } else {
+ core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
+ }
+ // Show details for each match
+ termMatch.matches.forEach((match, index) => {
+ core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
+ if (match.description) {
+ core.notice(` Description: ${match.description}`);
+ }
+ core.notice(` Context: ${match.context}`);
+ if (match.lineContent !== match.context) {
+ core.notice(` Full line: ${match.lineContent}`);
+ }
+ });
+ }
+ shouldAddLabel = true;
+ const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
+ const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
+ const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
+ const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
+ const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
+ const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
+ reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
+ }
+ core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
+ core.notice(`Reason: ${reason || 'No matching terms found'}`);
+ if (shouldAddLabel) {
+ const existingLabels = context.payload.issue.labels.map(l => l.name);
+ if (!existingLabels.includes(labelName)) {
+ await github.rest.issues.addLabels({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ labels: [labelName],
+ });
+ core.notice(`Label "${labelName}" added. ${reason}`);
+ return true;
+ }
+ core.notice(`Label "${labelName}" already present.`);
+ return false;
+ }
+ core.notice(`No matching terms found for label "${labelName}".`);
+ return false;
+ }
+ // Process all configured labels
+ const labelsAddedResults = await Promise.all(
+ Object.entries(labelConfig).map(([labelName, config]) =>
+ processLabel(labelName, config).then(added => ({ labelName, added }))
+ )
+ );
+
+ const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+ core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+
+ // Return which labels were added for the next step
+ const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+ core.setOutput('labels_added', JSON.stringify(addedLabels));
+ return addedLabels;
+
+ - name: CC users for labeled issues
+ if: steps.label-step.outputs.labels_added != '[]'
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+ with:
+ script: |
+ // Configuration: Map labels to GitHub users to CC
+ // You can add multiple users per label, and multiple label configurations
+ const ccConfig = {
+ rocm: {
+ users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
+ message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
+ },
+ // Add more label -> user mappings here
+ // Example:
+ // cuda: {
+ // users: ['user1', 'user2'],
+ // message: 'CC {users} for CUDA-related issue'
+ // },
+ // performance: {
+ // users: ['perfexpert'],
+ // message: 'CC {users} for performance issue'
+ // },
+ };
+
+ const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
+ core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+
+ // Get existing comments to check for already mentioned users
+ const comments = await github.rest.issues.listComments({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ });
+
+ const issueBody = context.payload.issue.body || '';
+ const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
+
+ // Process each label that was added
+ for (const label of labelsAdded) {
+ if (ccConfig[label]) {
+ const config = ccConfig[label];
+ const usersToMention = [];
+
+ // Check which users haven't been mentioned yet
+ for (const user of config.users) {
+ const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+ if (!mentionPattern.test(allExistingText)) {
+ usersToMention.push(user);
+ } else {
+ core.notice(`@${user} already mentioned for label "${label}", skipping`);
+ }
+ }
+
+ // Post comment if there are users to mention
+ if (usersToMention.length > 0) {
+ const mentions = usersToMention.map(u => `@${u}`).join(' ');
+ const message = config.message.replace('{users}', mentions);
+
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: message
+ });
+
+ core.notice(`CC comment added for label "${label}": ${mentions}`);
+ } else {
+ core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+ }
+ }
+ }
\ No newline at end of file
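
The workflow distinguishes three search modes: keywords match whole words only, substrings match anywhere, and regex patterns are used as-is. A minimal Python sketch of the same distinction against a hypothetical issue title (the JavaScript above builds the equivalent RegExp objects):

import re

title = "[Bug] crash on MI300X when rocm build uses hipgraph"

# Keyword search: whole-word match, mirroring new RegExp(`\\b${term}\\b`, "gi").
print(bool(re.search(r"\bhipgraph\b", title, re.IGNORECASE)))    # True
# Substring search: matches anywhere in the text.
print(bool(re.search(re.escape("rocm"), title, re.IGNORECASE)))  # True
# Regex pattern from the config: AMD GPU names (mi + 3 digits + optional letters).
print(re.findall(r"\bmi\d{3}[a-z]*\b", title, re.IGNORECASE))    # ['MI300X']
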
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
deleted file mode 100644
index 74a7a3a3530f..000000000000
--- a/.github/workflows/lint-and-deploy.yaml
+++ /dev/null
@@ -1,85 +0,0 @@
-name: Lint and Deploy Charts
-
-on: pull_request
-
-permissions:
- contents: read
-
-jobs:
- lint-and-deploy:
- runs-on: ubuntu-latest
- steps:
- - name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- with:
- fetch-depth: 0
-
- - name: Set up Helm
- uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
- with:
- version: v3.14.4
-
- #Python is required because ct lint runs Yamale and yamllint which require Python.
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
- with:
- python-version: '3.13'
-
- - name: Set up chart-testing
- uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
- with:
- version: v3.10.1
-
- - name: Run chart-testing (lint)
- run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
-
- - name: Setup minio
- run: |
- docker network create vllm-net
- docker run -d -p 9000:9000 --name minio --net vllm-net \
- -e "MINIO_ACCESS_KEY=minioadmin" \
- -e "MINIO_SECRET_KEY=minioadmin" \
- -v /tmp/data:/data \
- -v /tmp/config:/root/.minio \
- minio/minio server /data
- export AWS_ACCESS_KEY_ID=minioadmin
- export AWS_SECRET_ACCESS_KEY=minioadmin
- export AWS_EC2_METADATA_DISABLED=true
- mkdir opt-125m
- cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
- aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
- aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
-
- - name: Create kind cluster
- uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
-
- - name: Build the Docker image vllm cpu
- run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
-
- - name: Configuration of docker images, network and namespace for the kind cluster
- run: |
- docker pull amazon/aws-cli:2.6.4
- kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
- kind load docker-image vllm-cpu-env:latest --name chart-testing
- docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
- kubectl create ns ns-vllm
-
- - name: Run chart-testing (install)
- run: |
- export AWS_ACCESS_KEY_ID=minioadmin
- export AWS_SECRET_ACCESS_KEY=minioadmin
- sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
- helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
-
- - name: curl test
- run: |
- kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
- sleep 10
- CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
- --header "Content-Type: application/json" \
- --data '{
- "model": "opt-125m",
- "prompt": "San Francisco is a",
- "max_tokens": 7,
- "temperature": 0
- }'):$CODE"
- echo "$CODE"
diff --git a/.github/workflows/matchers/markdownlint.json b/.github/workflows/matchers/markdownlint.json
new file mode 100644
index 000000000000..fe094a9badb2
--- /dev/null
+++ b/.github/workflows/matchers/markdownlint.json
@@ -0,0 +1,17 @@
+{
+ "problemMatcher": [
+ {
+ "owner": "markdownlint",
+ "pattern": [
+ {
+ "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
+ "file": 1,
+ "line": 2,
+ "column": 3,
+ "code": 4,
+ "message": 5
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 8e694d18134e..e21d13b8161f 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -5,6 +5,10 @@ on:
push:
branches: [main]
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
permissions:
contents: read
@@ -13,10 +17,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+ - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with:
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
deleted file mode 100644
index bfd02879965e..000000000000
--- a/.github/workflows/publish.yml
+++ /dev/null
@@ -1,111 +0,0 @@
-# This workflow will upload a Python Package to Release asset
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Create Release
-
-on:
- push:
- tags:
- - v*
-
-# Needed to create release and upload assets
-permissions:
- contents: write
-
-jobs:
- release:
- # Retrieve tag and create release
- name: Create Release
- runs-on: ubuntu-latest
- outputs:
- upload_url: ${{ steps.create_release.outputs.upload_url }}
- steps:
- - name: Checkout
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
- - name: Extract branch info
- shell: bash
- run: |
- echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
-
- - name: Create Release
- id: create_release
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
- env:
- RELEASE_TAG: ${{ env.release_tag }}
- with:
- github-token: "${{ secrets.GITHUB_TOKEN }}"
- script: |
- const script = require('.github/workflows/scripts/create_release.js')
- await script(github, context, core)
-
- # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
- # wheel:
- # name: Build Wheel
- # runs-on: ${{ matrix.os }}
- # needs: release
-
- # strategy:
- # fail-fast: false
- # matrix:
- # os: ['ubuntu-20.04']
- # python-version: ['3.9', '3.10', '3.11', '3.12']
- # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
- # cuda-version: ['11.8', '12.1']
-
- # steps:
- # - name: Checkout
- # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
- # - name: Setup ccache
- # uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
- # with:
- # create-symlink: true
- # key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
-
- # - name: Set up Linux Env
- # if: ${{ runner.os == 'Linux' }}
- # run: |
- # bash -x .github/workflows/scripts/env.sh
-
- # - name: Set up Python
- # uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- # with:
- # python-version: ${{ matrix.python-version }}
-
- # - name: Install CUDA ${{ matrix.cuda-version }}
- # run: |
- # bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
- # - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
- # run: |
- # bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
- # - name: Build wheel
- # shell: bash
- # env:
- # CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
- # run: |
- # bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
- # wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
- # asset_name=${wheel_name//"linux"/"manylinux1"}
- # echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
- # echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
-
- # - name: Upload Release Asset
- # uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
- # env:
- # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- # with:
- # upload_url: ${{ needs.release.outputs.upload_url }}
- # asset_path: ./dist/${{ env.wheel_name }}
- # asset_name: ${{ env.asset_name }}
- # asset_content_type: application/*
-
- # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
- # - name: Publish package
- # uses: pypa/gh-action-pypi-publish@release/v1.8
- # with:
- # repository-url: https://test.pypi.org/legacy/
- # password: ${{ secrets.PYPI_API_TOKEN }}
- # skip-existing: true
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
index 16ae1aadb96b..8884359fa0ce 100644
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@@ -9,19 +9,46 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
- github.rest.issues.createComment({
- owner: context.repo.owner,
- repo: context.repo.repo,
- issue_number: context.issue.number,
- body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
- '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
- 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
- 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
- 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
- '🚀'
- })
+ try {
+ // Get the PR author
+ const prAuthor = context.payload.pull_request.user.login;
+
+ // Check if this is the author's first PR in this repository
+ // Use GitHub's search API to find all PRs by this author
+ const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
+ q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
+ per_page: 100
+ });
+
+ const authorPRCount = searchResults.total_count;
+
+ console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
+
+ // Only post comment if this is the first PR (only one PR by this author)
+ if (authorPRCount === 1) {
+ console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+ '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+ 'Just a reminder: PRs will not trigger a full CI run by default. Instead, only the `fastcheck` CI runs, covering a small and essential subset of CI tests to quickly catch errors. \n\n' +
+ 'You can ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
+ 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+ 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+ 'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
+ '🚀'
+ });
+ } else {
+ console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
+ }
+ } catch (error) {
+ console.error('Error checking PR history or posting comment:', error);
+ // Don't fail the workflow, just log the error
+ }
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
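
The welcome comment is now gated on the GitHub search API: if the query `repo:<owner>/<repo> type:pr author:<login>` returns exactly one PR, the PR that triggered the event is the author's first. A minimal Python sketch of the same check (hypothetical owner/repo/login values; requires a `GITHUB_TOKEN` in the environment):

import os
import requests

def is_first_pr(owner: str, repo: str, author: str) -> bool:
    resp = requests.get(
        "https://api.github.com/search/issues",
        params={"q": f"repo:{owner}/{repo} type:pr author:{author}", "per_page": 1},
        headers={"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"},
        timeout=10,
    )
    resp.raise_for_status()
    # total_count includes the PR that triggered the event, so 1 means "first PR".
    return resp.json()["total_count"] == 1

# Example (hypothetical): only greet first-time contributors.
# if is_first_pr("vllm-project", "vllm", "some-user"): post_welcome_comment()
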
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index 0f010832b465..c69ebbb42da5 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
bash tools/check_repo.sh
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 656f3d3fa7bc..dca3089f496c 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+ - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
diff --git a/.gitignore b/.gitignore
index 96b97a552c54..b1df673e83ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
+# triton jit
+.triton
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@@ -147,7 +150,8 @@ venv.bak/
# mkdocs documentation
/site
docs/argparse
-docs/examples
+docs/examples/*
+!docs/examples/README.md
# mypy
.mypy_cache/
@@ -173,6 +177,14 @@ cython_debug/
# VSCode
.vscode/
+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
# DS Store
.DS_Store
@@ -203,3 +215,6 @@ shellcheck*/
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
+
+# Ignore ep_kernels_workspace folder
+ep_kernels_workspace/
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
new file mode 100644
index 000000000000..cd9df57cd980
--- /dev/null
+++ b/.markdownlint.yaml
@@ -0,0 +1,12 @@
+MD007:
+ indent: 4
+MD013: false
+MD024:
+ siblings_only: true
+MD033: false
+MD045: false
+MD046: false
+MD051: false
+MD052: false
+MD053: false
+MD059: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5197820fb402..121bdb750de5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,50 +6,39 @@ default_stages:
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
-- repo: https://github.com/google/yapf
- rev: v0.43.0
- hooks:
- - id: yapf
- args: [--in-place, --verbose]
- # Keep the same list from yapfignore here to avoid yapf failing without any inputs
- exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.7
+ rev: v0.14.0
hooks:
- - id: ruff
+ - id: ruff-check
args: [--output-format, github, --fix]
- id: ruff-format
- files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
- rev: v1.34.0
+ rev: v1.38.1
hooks:
- id: typos
-- repo: https://github.com/PyCQA/isort
- rev: 6.0.1
- hooks:
- - id: isort
+ args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v20.1.3
+ rev: v21.1.2
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
-- repo: https://github.com/jackdewinter/pymarkdown
- rev: v0.9.29
+- repo: https://github.com/igorshubovych/markdownlint-cli
+ rev: v0.45.0
hooks:
- - id: pymarkdown
+ - id: markdownlint
exclude: '.*\.inc\.md'
- args: [fix]
+ stages: [manual] # Only run in CI
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
- rev: 0.6.17
+ rev: 0.9.1
hooks:
- id: pip-compile
- args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+ args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
@@ -60,38 +49,32 @@ repos:
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
- entry: tools/mypy.sh 0 "local"
- language: python
- types: [python]
- additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+ entry: python tools/pre_commit/mypy.py 0 "local"
stages: [pre-commit] # Don't run in CI
- - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
- name: Run mypy for Python 3.9
- entry: tools/mypy.sh 1 "3.9"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
- stages: [manual] # Only run in CI
+ <<: &mypy_common
+ language: python
+ types_or: [python, pyi]
+ require_serial: true
+ additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
- entry: tools/mypy.sh 1 "3.10"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.10"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
- entry: tools/mypy.sh 1 "3.11"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.11"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
- entry: tools/mypy.sh 1 "3.12"
- language: python
- types: [python]
- additional_dependencies: *mypy_deps
+ entry: python tools/pre_commit/mypy.py 1 "3.12"
+ <<: *mypy_common
+ stages: [manual] # Only run in CI
+ - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+ name: Run mypy for Python 3.13
+ entry: python tools/pre_commit/mypy.py 1 "3.13"
+ <<: *mypy_common
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
@@ -155,18 +138,15 @@ repos:
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
- entry: python tools/check_pickle_imports.py
+ entry: python tools/pre_commit/check_pickle_imports.py
language: python
types: [python]
- pass_filenames: false
- additional_dependencies: [pathspec, regex]
+ additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
language: python
- types: [python]
- pass_filenames: true
- files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+ additional_dependencies: [regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion
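
The mypy hooks now share their common settings through a YAML merge key: `<<: &mypy_common` both anchors the nested block and merges it into the first hook, and each later hook pulls it in with `<<: *mypy_common`. A minimal sketch of how such a merge expands when loaded (assuming PyYAML, which resolves `<<` in `safe_load`):

import yaml

doc = """
hooks:
- id: mypy-local
  entry: python tools/pre_commit/mypy.py 0 "local"
  <<: &mypy_common
    language: python
    types_or: [python, pyi]
    require_serial: true
- id: mypy-3.12
  entry: python tools/pre_commit/mypy.py 1 "3.12"
  <<: *mypy_common
"""
hooks = yaml.safe_load(doc)["hooks"]
# Both hooks end up with the shared language/types_or/require_serial keys,
# while the settings are declared only once.
print(hooks[1]["language"], hooks[1]["types_or"], hooks[1]["require_serial"])
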
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 98c3be25f7e7..d83d6df35ed9 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -7,9 +7,13 @@ build:
os: ubuntu-22.04
tools:
python: "3.12"
+ jobs:
+ post_checkout:
+ - git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yaml
+ fail_on_warning: true
# Optionally declare the Python requirements required to build your docs
python:
diff --git a/.yapfignore b/.yapfignore
index 2d6dcf8380ca..38158259032a 100644
--- a/.yapfignore
+++ b/.yapfignore
@@ -1 +1,2 @@
collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index edc64f87730a..46630af89f09 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
# cmake --install . --component _C
project(vllm_extensions LANGUAGES CXX)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
# Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
#
# Supported/expected torch versions for CUDA/ROCm.
@@ -45,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
#
# Try to find python package with an executable that exactly matches
@@ -82,6 +86,9 @@ find_package(Torch REQUIRED)
# Supported NVIDIA architectures.
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+ set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
else()
@@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+ DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+ CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+ list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+ if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+ endif()
+endif()
+
#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -243,13 +269,12 @@ set(VLLM_EXT_SRC
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
- "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
- "csrc/quantization/fp8/common.cu"
+ "csrc/quantization/w8a8/int8/scaled_quant.cu"
+ "csrc/quantization/w8a8/fp8/common.cu"
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu"
- "csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp")
@@ -257,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
- set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+ set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -287,16 +312,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
- "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
- "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
- "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/cutlass_extensions/common.cpp"
- "csrc/attention/mla/cutlass_mla_entry.cu")
+ "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+ "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_EXT_SRC}"
@@ -350,20 +374,27 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
set(MARLIN_SRCS
- "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
- "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
@@ -393,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -421,11 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -450,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
- "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
- "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+ "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -486,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -528,11 +568,40 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
+ # The nvfp4_scaled_mm_sm120 kernels for GeForce Blackwell SM120 require
+ # CUDA 12.8 or later
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+ endif()
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
+ set(SRCS
+ "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
+ "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${FP4_ARCHS}")
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+ message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+ else()
+ message(STATUS "Not building NVFP4 as no compatible archs were found.")
+ # clear FP4_ARCHS
+ set(FP4_ARCHS)
+ endif()
+
# FP4 Archs and flags
- cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+ "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@@ -540,7 +609,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
- list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
@@ -550,10 +619,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# CUTLASS MLA Archs and flags
- cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
set(SRCS
- "csrc/attention/mla/cutlass_mla_kernels.cu"
"csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@@ -577,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -595,10 +667,38 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ endif()
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${SCALED_MM_ARCHS}")
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
+ message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+ else()
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
+ message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
+ "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
+ "if you intend on running FP8 quantized MoE models on Blackwell.")
+ else()
+ message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
+ "in CUDA target architectures.")
+ endif()
+ endif()
+
# moe_data.cu is used by all CUTLASS MoE kernels.
- cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -614,10 +714,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures.")
endif()
endif()
-
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ else()
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
- set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+ set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -704,6 +808,44 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"found in CUDA target architectures")
endif()
endif()
+
+ # Only build W4A8 kernels if we are building for something compatible with sm90a
+ cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
+ set(SRCS
+ "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
+
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${W4A8_ARCHS}")
+
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+
+ message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
+ else()
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
+ AND W4A8_ARCHS)
+ message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
+ "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
+ "later if you intend on running w4a16 quantized models on "
+ "Hopper.")
+ else()
+ message(STATUS "Not building W4A8 kernels as no compatible archs "
+ "found in CUDA target architectures")
+ endif()
+ endif()
+
+ # Hadacore kernels
+ cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+ if(HADACORE_ARCHS)
+ set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${SRCS}"
+ CUDA_ARCHS "${HADACORE_ARCHS}")
+ list(APPEND VLLM_EXT_SRC "${SRCS}")
+ message(STATUS "Building hadacore")
+ endif()
+
# if CUDA endif
endif()
@@ -741,10 +883,21 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
+ "csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
- list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
+ list(APPEND VLLM_MOE_EXT_SRC
+ "csrc/moe/moe_wna16.cu"
+ "csrc/moe/grouped_topk_kernels.cu")
+endif()
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+ set(MOE_PERMUTE_SRC
+ "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+ "csrc/moe/moe_permute_unpermute_op.cu")
+
+ list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
set_gencode_flags_for_srcs(
@@ -805,6 +958,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
@@ -815,17 +972,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
-if(VLLM_GPU_LANG STREQUAL "CUDA")
- set(MOE_PERMUTE_SRC
- "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
- "csrc/moe/moe_permute_unpermute_op.cu")
-
- set_gencode_flags_for_srcs(
- SRCS "${MARLIN_PERMUTE_SRC}"
- CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
-
- list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
-endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
_moe_C
@@ -862,6 +1008,7 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
+ include(cmake/external_projects/qutlass.cmake)
# vllm-flash-attn should be last as it overwrites some CMake functions
include(cmake/external_projects/vllm_flash_attn.cmake)
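
A recurring pattern in the CMake changes above is the CUDA 13 gate: on nvcc >= 13.0 the family-suffixed arch specs (`10.0f`, `11.0f`, `12.0f`) are requested, otherwise the architecture-specific `a` variants, and the result is intersected with the user's `CUDA_ARCHS`. A rough Python sketch of that selection logic (illustrative only; the real work is done by `cuda_archs_loose_intersection` in the build system):

def select_scaled_mm_archs(nvcc_version, cuda_archs):
    # CUDA 13+ builds target arch families ("f" suffix); older toolkits use
    # architecture-specific "a" variants, mirroring the branches added above.
    if nvcc_version >= (13, 0):
        wanted = {"10.0f", "11.0f", "12.0f"}
    else:
        wanted = {"10.0a", "10.1a", "10.3a", "12.0a", "12.1a"}
    # Approximates cuda_archs_loose_intersection with a plain set intersection.
    return sorted(wanted & set(cuda_archs))

print(select_scaled_mm_archs((12, 8), {"9.0a", "10.0a", "12.0a"}))  # ['10.0a', '12.0a']
print(select_scaled_mm_archs((13, 0), {"10.0f", "12.0f"}))          # ['10.0f', '12.0f']
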
diff --git a/MANIFEST.in b/MANIFEST.in
index 82fd22b845f0..fb3cccbb4a9c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,6 @@ include LICENSE
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
-include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt
diff --git a/README.md b/README.md
index dc2f0afbe353..3dcdd7dc0094 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
+
---
*Latest News* 🔥
-- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
+
+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
+- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
+- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
-- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
Previous News
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
+- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
@@ -46,6 +57,7 @@ Easy, fast, and cheap LLM serving for everyone
---
+
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
@@ -70,11 +82,12 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPUs, as well as hardware plugins such as Intel Gaudi, IBM Spyre, and Huawei Ascend
- Prefix caching support
- Multi-LoRA support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral)
@@ -91,6 +104,7 @@ pip install vllm
```
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
+
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
@@ -107,6 +121,7 @@ vLLM is a community project. Our compute resources for development and testing a
Cash Donations:
+
- a16z
- Dropbox
- Sequoia Capital
@@ -114,6 +129,8 @@ Cash Donations:
- ZhenFund
Compute Resources:
+
+- Alibaba Cloud
- AMD
- Anyscale
- AWS
@@ -132,6 +149,7 @@ Compute Resources:
- Trainy
- UC Berkeley
- UC San Diego
+- Volcengine
Slack Sponsor: Anyscale
@@ -153,7 +171,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
-- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
+- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
diff --git a/RELEASE.md b/RELEASE.md
index 9352e7ef706c..db0d51afc7be 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -60,9 +60,10 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons
Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
**Current Coverage:**
+
* Models: Llama3, Llama4, and Mixtral
* Hardware: NVIDIA H100 and AMD MI300x
-* *Note: Coverage may change based on new model releases and hardware availability*
+* _Note: Coverage may change based on new model releases and hardware availability_
**Performance Validation Process:**
@@ -71,11 +72,13 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github
**Step 2: Review Benchmark Setup**
Familiarize yourself with the benchmark configurations:
+
* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
**Step 3: Run the Benchmark**
Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
+
* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
* **vLLM commit**: Set to the RC commit hash
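
Aside (not part of the patch): the dispatch above can also be triggered programmatically through GitHub's `workflow_dispatch` REST endpoint. A hedged sketch follows; the input names `vllm_branch` and `vllm_commit` are assumptions made for illustration and should be checked against the inputs actually declared in `vllm-benchmark.yml`.

```python
# Hypothetical helper for kicking off the release benchmark run.
# The input names ("vllm_branch", "vllm_commit") are assumed, not verified
# against the workflow definition -- adjust them before using this.
import os

import requests

OWNER_REPO = "pytorch/pytorch-integration-testing"
WORKFLOW = "vllm-benchmark.yml"

resp = requests.post(
    f"https://api.github.com/repos/{OWNER_REPO}/actions/workflows/{WORKFLOW}/dispatches",
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    json={
        "ref": "main",  # branch of the workflow repo to run from
        "inputs": {
            "vllm_branch": "releases/v0.9.2",   # release branch under test
            "vllm_commit": "<rc-commit-hash>",  # RC commit hash
        },
    },
    timeout=30,
)
resp.raise_for_status()  # GitHub returns 204 No Content on success
```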
diff --git a/SECURITY.md b/SECURITY.md
index 6053cfb41f35..d6319cdb1ac2 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,13 +1,50 @@
# Security Policy
-## Reporting a Vulnerability
+## Reporting security issues
-If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
+## Issue triage
----
+Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
+
+## Threat model
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
+
+## Issue severity
+
+We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
+
+### CRITICAL Severity
+
+Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges required. Examples include remote code execution over the network and deserialization issues that enable exploit chains. These issues are generally rated CVSS ≥ 9.0.
+
+### HIGH Severity
+
+Serious security flaws with elevated impact, such as RCE in specific, limited contexts or significant data loss, that require advanced conditions or some degree of trust. Examples include RCE in advanced deployment modes (e.g., multi-node) or high-impact issues that require some form of privileged network access. These issues typically have CVSS scores between 7.0 and 8.9.
+
+### MODERATE Severity
+
+Vulnerabilities that cause denial of service or partial disruption but do not allow arbitrary code execution or a data breach and have limited impact. These issues typically have a CVSS rating between 4.0 and 6.9.
+
+### LOW Severity
+
+Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and have negligible impact. Examples include side-channel attacks and hash collisions. These issues typically have CVSS scores below 4.0.
+
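
Aside (not part of the patch): the four categories above partition the CVSS range, so a first-pass triage can be expressed as a small lookup. The sketch below is illustrative only; as the policy notes, the final rating also weighs affected versions, common defaults, and use cases.

```python
def severity_from_cvss(score: float) -> str:
    """Map a CVSS base score to the severity buckets described above.

    Illustrative only: actual triage also considers affected versions,
    common defaults, and use cases, so the final rating may differ.
    """
    if not 0.0 <= score <= 10.0:
        raise ValueError(f"CVSS scores range from 0.0 to 10.0, got {score}")
    if score >= 9.0:
        return "CRITICAL"
    if score >= 7.0:
        return "HIGH"
    if score >= 4.0:
        return "MODERATE"
    return "LOW"


assert severity_from_cvss(9.8) == "CRITICAL"
assert severity_from_cvss(5.3) == "MODERATE"
```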
+## Prenotification policy
+
+For security issues of CRITICAL, HIGH, or MODERATE severity, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow a coordinated release of fixes for severe issues.
+
+* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
+
+* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
+
+* Organizations and vendors that either ship or use vLLM are eligible to join the prenotification group if they meet at least one of the following qualifications:
+ * Substantial internal deployment leveraging the upstream vLLM project.
+ * Established internal security teams and comprehensive compliance measures.
+ * Active and consistent contributions to the upstream vLLM project.
+
+* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change as we refine the policy for who may be included.
diff --git a/benchmarks/README.md b/benchmarks/README.md
index fb8690d42db9..269a4d51ec2e 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,605 +1,20 @@
-# Benchmarking vLLM
+# Benchmarks
-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
+This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
-**Dataset Overview**
+## Contents
-