Skip to content

Commit ac51392

Browse files
committed
Enable aarch64 CPU performance benchmarks
Signed-off-by: Ioana Ghiban <ioana.ghiban@arm.com>
1 parent 938772a commit ac51392

File tree

5 files changed

+228
-1
lines changed

5 files changed

+228
-1
lines changed

.buildkite/performance-benchmarks/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http
77

88
## Performance benchmark quick overview
99

10-
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
10+
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse® with different models.
1111

1212
**Benchmarking Duration**: about 1hr.
1313

@@ -24,6 +24,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
2424
Runtime environment variables:
2525

2626
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
27+
- `ON_ARM64_CPU`: set the value to '1' on Arm® Neoverse®. Default value is 0.
2728
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
2829
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
2930
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
@@ -35,6 +36,7 @@ Runtime environment variables:
3536
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
3637
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
3738
For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
39+
For Arm® Neoverse®, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
3840
>
3941
### Latency test
4042

.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,20 @@ check_cpus() {
5353
echo "GPU type is $gpu_type"
5454
}
5555

56+
check_arm64_cpus() {
  # Detect the number of CPUs on an ARM64 (Arm Neoverse) host and set the
  # pseudo "GPU type" label used by the benchmark result-naming scheme.
  # Globals:  cpu_count (written), gpu_type (written)
  # Outputs:  status lines to stdout
  # Exits:    1 if no CPU is detected
  declare -g cpu_count
  # Separate declaration from assignment so a failing `nproc` is not masked.
  cpu_count=$(nproc)
  if [[ "$cpu_count" -gt 0 ]]; then
    echo "CPU found."
    echo "$cpu_count"
  else
    echo "Need at least 1 CPU to run benchmarking."
    exit 1
  fi
  # The harness keys result files off $gpu_type even on CPU-only runs.
  declare -g gpu_type="arm64-cpu"
  echo "GPU type is $gpu_type"
}
69+
5670
check_hf_token() {
5771
# check if HF_TOKEN is available and valid
5872
if [[ -z "$HF_TOKEN" ]]; then
@@ -213,6 +227,14 @@ run_latency_tests() {
213227
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
214228
continue
215229
fi
230+
else
231+
if [ "$ON_ARM64_CPU" == "1" ]; then
232+
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
233+
world_size=$(($tp*$pp))
234+
if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
235+
echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
236+
continue
237+
fi
216238
else
217239
if [[ $gpu_count -lt $tp ]]; then
218240
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -282,6 +304,14 @@ run_throughput_tests() {
282304
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
283305
continue
284306
fi
307+
else
308+
if [ "$ON_ARM64_CPU" == "1" ]; then
309+
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
310+
world_size=$(($tp*$pp))
311+
if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
312+
echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
313+
continue
314+
fi
285315
else
286316
if [[ $gpu_count -lt $tp ]]; then
287317
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -362,6 +392,14 @@ run_serving_tests() {
362392
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
363393
continue
364394
fi
395+
else
396+
if [ "$ON_ARM64_CPU" == "1" ]; then
397+
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
398+
world_size=$(($tp*$pp))
399+
if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
400+
echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
401+
continue
402+
fi
365403
else
366404
if [[ $gpu_count -lt $tp ]]; then
367405
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -461,6 +499,10 @@ main() {
461499
if [ "$ON_CPU" == "1" ];then
462500
check_cpus
463501
ARCH='-cpu'
502+
else
503+
if [ "$ON_ARM64_CPU" == "1" ];then
504+
check_arm64_cpus
505+
ARCH='-arm64-cpu'
464506
else
465507
check_gpus
466508
ARCH="$arch_suffix"
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
[
2+
{
3+
"test_name": "latency_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"num_iters_warmup": 5,
13+
"num_iters": 15
14+
}
15+
},
16+
{
17+
"test_name": "latency_llama8B_tp4",
18+
"environment_variables": {
19+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
20+
"VLLM_CPU_KVCACHE_SPACE": 40
21+
},
22+
"parameters": {
23+
"model": "meta-llama/Llama-3.1-8B-Instruct",
24+
"tensor_parallel_size": 4,
25+
"load_format": "dummy",
26+
"num_iters_warmup": 5,
27+
"num_iters": 15
28+
}
29+
}
30+
]
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
[
2+
{
3+
"test_name": "serving_llama8B_tp1_sharegpt",
4+
"qps_list": [1, 4, 16, "inf"],
5+
"server_environment_variables": {
6+
"VLLM_RPC_TIMEOUT": 100000,
7+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
8+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
9+
"VLLM_CPU_KVCACHE_SPACE": 40
10+
},
11+
"server_parameters": {
12+
"model": "meta-llama/Llama-3.1-8B-Instruct",
13+
"tensor_parallel_size": 1,
14+
"device": "cpu",
15+
"dtype": "bfloat16",
16+
"distributed_executor_backend": "mp",
17+
"block_size": 16,
18+
"trust_remote_code": "",
19+
"disable_log_stats": "",
20+
"disable_log_requests": "",
21+
"load_format": "dummy"
22+
},
23+
"client_parameters": {
24+
"model": "meta-llama/Llama-3.1-8B-Instruct",
25+
"backend": "vllm",
26+
"dataset_name": "sharegpt",
27+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200
29+
}
30+
},
31+
{
32+
"test_name": "serving_llama8B_tp2_sharegpt",
33+
"qps_list": [1, 4, 16, "inf"],
34+
"server_environment_variables": {
35+
"VLLM_RPC_TIMEOUT": 100000,
36+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
37+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
38+
"VLLM_CPU_KVCACHE_SPACE": 40
39+
},
40+
"server_parameters": {
41+
"model": "meta-llama/Llama-3.1-8B-Instruct",
42+
"tensor_parallel_size": 2,
43+
"device": "cpu",
44+
"dtype": "bfloat16",
45+
"distributed_executor_backend": "mp",
46+
"block_size": 16,
47+
"trust_remote_code": "",
48+
"disable_log_stats": "",
49+
"disable_log_requests": "",
50+
"load_format": "dummy"
51+
},
52+
"client_parameters": {
53+
"model": "meta-llama/Llama-3.1-8B-Instruct",
54+
"backend": "vllm",
55+
"dataset_name": "sharegpt",
56+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
57+
"num_prompts": 200
58+
}
59+
},
60+
{
61+
"test_name": "serving_llama8B_tp4_sharegpt",
62+
"qps_list": [1, 4, 16, "inf"],
63+
"server_environment_variables": {
64+
"VLLM_RPC_TIMEOUT": 100000,
65+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
66+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
67+
"VLLM_CPU_KVCACHE_SPACE": 40
68+
},
69+
"server_parameters": {
70+
"model": "meta-llama/Llama-3.1-8B-Instruct",
71+
"tensor_parallel_size": 4,
72+
"device": "cpu",
73+
"dtype": "bfloat16",
74+
"distributed_executor_backend": "mp",
75+
"block_size": 16,
76+
"trust_remote_code": "",
77+
"disable_log_stats": "",
78+
"disable_log_requests": "",
79+
"load_format": "dummy"
80+
},
81+
"client_parameters": {
82+
"model": "meta-llama/Llama-3.1-8B-Instruct",
83+
"backend": "vllm",
84+
"dataset_name": "sharegpt",
85+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
86+
"num_prompts": 200
87+
}
88+
},
89+
{
90+
"test_name": "serving_llama8B_tp4_random_1024_128",
91+
"qps_list": [1, 4, 16, "inf"],
92+
"server_environment_variables": {
93+
"VLLM_RPC_TIMEOUT": 100000,
94+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
95+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
96+
"VLLM_CPU_KVCACHE_SPACE": 40
97+
},
98+
"server_parameters": {
99+
"model": "meta-llama/Llama-3.1-8B-Instruct",
100+
"tensor_parallel_size": 4,
101+
"device": "cpu",
102+
"dtype": "bfloat16",
103+
"distributed_executor_backend": "mp",
104+
"block_size": 16,
105+
"trust_remote_code": "",
106+
"enable_chunked_prefill": "",
107+
"disable_log_stats": "",
108+
"disable_log_requests": "",
109+
"load_format": "dummy"
110+
},
111+
"client_parameters": {
112+
"model": "meta-llama/Llama-3.1-8B-Instruct",
113+
"backend": "vllm",
114+
"dataset_name": "random",
115+
"random-input-len": 1024,
116+
"random-output-len": 128,
117+
"ignore-eos": "",
118+
"num_prompts": 100
119+
}
120+
}
121+
]
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[
2+
{
3+
"test_name": "throughput_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
13+
"num_prompts": 200,
14+
"backend": "vllm"
15+
}
16+
},
17+
{
18+
"test_name": "throughput_llama8B_tp4",
19+
"environment_variables": {
20+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
21+
"VLLM_CPU_KVCACHE_SPACE": 40
22+
},
23+
"parameters": {
24+
"model": "meta-llama/Llama-3.1-8B-Instruct",
25+
"tensor_parallel_size": 4,
26+
"load_format": "dummy",
27+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200,
29+
"backend": "vllm"
30+
}
31+
}
32+
]

0 commit comments

Comments
 (0)