
Commit b66b1d3

Enable aarch64 CPU performance benchmarks
1 parent (e246ad6), commit b66b1d3

File tree (5 files changed: +229, -2 lines)

- .buildkite/nightly-benchmarks/README.md
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
- .buildkite/nightly-benchmarks/tests/latency-tests-arm64-cpu.json
- .buildkite/nightly-benchmarks/tests/serving-tests-arm64-cpu.json
- .buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json

.buildkite/nightly-benchmarks/README.md (4 additions, 2 deletions)
```diff
@@ -11,7 +11,8 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (support for the FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (support for the FP8 benchmark on H100 is coming!),
+Intel® Xeon® Processors and ARM® Neoverse®, with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -41,6 +42,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 Runtime environment variables:
 
 - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_ARM64_CPU`: set the value to '1' on ARM® Neoverse®. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
@@ -54,7 +56,7 @@ Nightly benchmark will be triggered when:
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
-> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. For ARM® Neoverse®, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
 >
 ### Latency test
 
```
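For reference, a minimal invocation on an aarch64 host might look like the sketch below. The script path comes from the README above; the `HF_TOKEN` requirement comes from the script's `check_hf_token` step, and the token value is a placeholder.

```bash
# Minimal sketch: run the nightly performance benchmarks on an aarch64 CPU host.
export HF_TOKEN=...        # valid Hugging Face token (checked by check_hf_token)
export ON_ARM64_CPU=1      # select the Arm CPU code path instead of GPU
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```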

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh (42 additions, 0 deletions)
```diff
@@ -45,6 +45,20 @@ check_cpus() {
   echo "GPU type is $gpu_type"
 }
 
+check_arm64_cpus() {
+  # check the number of CPUs and GPU type.
+  declare -g cpu_count=$(nproc)
+  if [[ $cpu_count -gt 0 ]]; then
+    echo "CPU found."
+    echo $cpu_count
+  else
+    echo "Need at least 1 CPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type="arm64-cpu"
+  echo "GPU type is $gpu_type"
+}
+
 check_hf_token() {
   # check if HF_TOKEN is available and valid
   if [[ -z "$HF_TOKEN" ]]; then
```
```diff
@@ -201,6 +215,14 @@ run_latency_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    else
+      if [ "$ON_ARM64_CPU" == "1" ]; then
+        pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+        world_size=$(($tp*$pp))
+        if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+          echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
+          continue
+        fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
```
```diff
@@ -270,6 +292,14 @@ run_throughput_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    else
+      if [ "$ON_ARM64_CPU" == "1" ]; then
+        pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+        world_size=$(($tp*$pp))
+        if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+          echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
+          continue
+        fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
```
```diff
@@ -350,6 +380,14 @@ run_serving_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    else
+      if [ "$ON_ARM64_CPU" == "1" ]; then
+        pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+        world_size=$(($tp*$pp))
+        if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+          echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
+          continue
+        fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
```
```diff
@@ -449,6 +487,10 @@ main() {
   if [ "$ON_CPU" == "1" ];then
     check_cpus
     ARCH='-cpu'
+  else
+    if [ "$ON_ARM64_CPU" == "1" ];then
+      check_arm64_cpus
+      ARCH='-arm64-cpu'
   else
     check_gpus
   fi
```
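`main()` only sets an `ARCH` suffix; the per-suite runners then pick up the matching test files. The exact wiring is outside this diff, but assuming the naming convention from the README note, the selection plausibly reduces to:

```bash
# Sketch (wiring not shown in this diff): the ARCH suffix selects the test files,
# following the file-naming convention documented in the README.
ARCH='-arm64-cpu'
echo "tests/latency-tests${ARCH}.json"     # tests/latency-tests-arm64-cpu.json
echo "tests/throughput-tests${ARCH}.json"  # tests/throughput-tests-arm64-cpu.json
echo "tests/serving-tests${ARCH}.json"     # tests/serving-tests-arm64-cpu.json
```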
.buildkite/nightly-benchmarks/tests/latency-tests-arm64-cpu.json (new file, 30 additions)
```json
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
```
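This file is the default for `ON_ARM64_CPU=1` runs; per the README's runtime environment variables, it can be swapped out without editing the repo (the path below is a hypothetical example):

```bash
# Point the latency suite at a custom test file instead of the default above.
export ON_ARM64_CPU=1
export LATENCY_JSON=/path/to/custom-latency-tests.json   # hypothetical path
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```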
.buildkite/nightly-benchmarks/tests/serving-tests-arm64-cpu.json (new file, 121 additions)
```json
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_random_1024_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "enable_chunked_prefill": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "random",
      "random-input-len": 1024,
      "random-output-len": 128,
      "ignore-eos": "",
      "num_prompts": 100
    }
  }
]
```
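Keys in `server_parameters` and `client_parameters` are converted into CLI flags, which is why empty-string values (such as `"trust_remote_code": ""`) act as boolean switches and why hyphenated keys (`random-input-len`) work alongside underscored ones. The repo's actual helper is not part of this diff; a minimal jq sketch of the idea:

```bash
# Minimal sketch of JSON-to-flags conversion (the benchmark runner uses a
# similar helper; this exact function is an illustration, not the repo's code).
json2args() {
  echo "$1" | jq -r '
    to_entries
    | map("--" + (.key | gsub("_"; "-"))
          + (if .value == "" then "" else " " + (.value | tostring) end))
    | join(" ")'
}

params='{"tensor_parallel_size": 2, "dtype": "bfloat16", "trust_remote_code": ""}'
json2args "$params"
# prints: --tensor-parallel-size 2 --dtype bfloat16 --trust-remote-code
```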
.buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json (new file, 32 additions)
```json
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
```
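Both the throughput and serving configs expect `./ShareGPT_V3_unfiltered_cleaned_split.json` in the working directory. The commit does not include a download step; the source below is the mirror commonly used in vLLM docs, and is an assumption worth verifying:

```bash
# Fetch the ShareGPT dataset the throughput/serving tests reference.
# URL is the mirror commonly used in vLLM docs; verify availability before use.
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```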
