6 changes: 4 additions & 2 deletions .buildkite/nightly-benchmarks/README.md
@@ -11,7 +11,8 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (support for the FP8 benchmark on H100 is coming!), Intel® Xeon® Processors and Arm® Neoverse®, with different models.

**Benchmarking Duration**: about 1 hour.

@@ -41,6 +42,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `ON_ARM64_CPU`: set the value to '1' on Arm® Neoverse® Processors. Default value is 0 (see the example invocation below).
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
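
For example, the suite might be launched on an Arm® Neoverse® machine as follows (a sketch; it assumes the vLLM repository root as the working directory and a valid `HF_TOKEN`, which the script checks before running):

```bash
# Run the nightly performance benchmarks on an Arm® Neoverse® CPU host.
export HF_TOKEN=<your Hugging Face token>
ON_ARM64_CPU=1 bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```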
@@ -54,7 +56,7 @@ Nightly benchmark will be triggered when:
## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. For Arm® Neoverse®, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
>
### Latency test

42 changes: 42 additions & 0 deletions .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -45,6 +45,20 @@ check_cpus() {
echo "GPU type is $gpu_type"
}

check_arm64_cpus() {
  # Check the number of CPU cores; the platform label "arm64-cpu" is stored
  # in gpu_type, mirroring check_cpus.
  declare -g cpu_count=$(nproc)
  if [[ $cpu_count -gt 0 ]]; then
    echo "$cpu_count CPUs found."
  else
    echo "Need at least 1 CPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="arm64-cpu"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
@@ -201,6 +215,14 @@ run_latency_tests() {
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [ "$ON_ARM64_CPU" == "1" ]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -270,6 +292,14 @@ run_throughput_tests() {
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [ "$ON_ARM64_CPU" == "1" ]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -350,6 +380,14 @@ run_serving_tests() {
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
if [ "$ON_ARM64_CPU" == "1" ]; then
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
continue
fi
else
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -449,6 +487,10 @@ main() {
if [ "$ON_CPU" == "1" ];then
check_cpus
ARCH='-cpu'
else
if [ "$ON_ARM64_CPU" == "1" ];then
check_arm64_cpus
ARCH='-arm64-cpu'
else
check_gpus
fi
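
The `ARCH` suffix presumably selects the platform-specific test files named in the README; a sketch of the assumed lookup (variable name illustrative):

```bash
# Assumed downstream use of ARCH: append it to the default test-file stems,
# e.g. tests/latency-tests-arm64-cpu.json when ON_ARM64_CPU=1.
latency_test_file="tests/latency-tests$ARCH.json"
```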
30 changes: 30 additions & 0 deletions .buildkite/nightly-benchmarks/tests/latency-tests-arm64-cpu.json
@@ -0,0 +1,30 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
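
Each `parameters` map above is expanded by the harness into hyphenated CLI flags (its `json2args` jq helper does the underscore-to-hyphen conversion), so the tp1 entry runs roughly as follows; a sketch assuming the `benchmarks/benchmark_latency.py` entry point:

```bash
# Sketch: approximate command generated for latency_llama8B_tp1.
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 VLLM_CPU_KVCACHE_SPACE=40 \
python3 benchmarks/benchmark_latency.py \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 \
  --load-format dummy \
  --num-iters-warmup 5 \
  --num-iters 15
```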
121 changes: 121 additions & 0 deletions .buildkite/nightly-benchmarks/tests/serving-tests-arm64-cpu.json
@@ -0,0 +1,121 @@
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "device": "cpu",
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 16,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "device": "cpu",
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 16,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "device": "cpu",
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 16,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "device": "cpu",
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 16,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 1024,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 100
        }
    }
]
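
Each serving case is a server launch plus a client run; a sketch of roughly what the tp1 ShareGPT entry does, assuming vLLM's OpenAI-compatible server and `benchmarks/benchmark_serving.py` (each JSON key becomes the hyphenated flag of the same name):

```bash
# Sketch: server side of serving_llama8B_tp1_sharegpt.
VLLM_RPC_TIMEOUT=100000 VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
VLLM_ENGINE_ITERATION_TIMEOUT_S=120 VLLM_CPU_KVCACHE_SPACE=40 \
python3 -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 --device cpu --dtype bfloat16 \
  --distributed-executor-backend mp --block-size 16 \
  --trust-remote-code --disable-log-stats --disable-log-requests \
  --load-format dummy &

# Sketch: client side, once the server is ready; each qps value in qps_list
# is swept via --request-rate.
python3 benchmarks/benchmark_serving.py \
  --backend vllm \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200
```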
32 changes: 32 additions & 0 deletions .buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json
@@ -0,0 +1,32 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
]
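
The throughput entries expand the same way, presumably into `benchmarks/benchmark_throughput.py` runs with `--dataset`, `--num-prompts`, and `--backend` derived from the keys above, under the same two environment variables.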