Skip to content

Commit 0dd65e3

Browse files
Add Linux Aarch64 G3 runners to vLLM bms
1 parent 4bde5d3 commit 0dd65e3

File tree

7 files changed

+241
-7
lines changed

7 files changed

+241
-7
lines changed

.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"linux.aws.h100",
1919
"linux.rocm.gpu.gfx942.1",
2020
"linux.24xl.spr-metal",
21+
"linux.arm64.m7g.4xlarge",
2122
"linux.dgx.b200",
2223
],
2324
# NB: There is no 2xH100 runner at the moment, so let's use the next one
@@ -50,6 +51,7 @@
5051
"linux.rocm.gpu.gfx942.4": "rocm",
5152
"linux.rocm.gpu.gfx942.8": "rocm",
5253
"linux.24xl.spr-metal": "cpu",
54+
"linux.arm64.m7g.4xlarge": "cpu",
5355
}
5456

5557
# All the different names vLLM uses to refer to their benchmark configs
@@ -198,8 +200,8 @@ def generate_benchmark_matrix(
198200
) -> Dict[str, Any]:
199201
"""
200202
Parse all the JSON files in vLLM benchmark configs directory to get the
201-
model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
202-
"""
203+
model name and tensor parallel size (aka number of GPUs, CPU NUMA nodes - Intel
204+
or CPUs - ARM)"""
203205
benchmark_matrix: Dict[str, Any] = {
204206
"include": [],
205207
}

.github/scripts/test_generate_vllm_benchmark_matrix.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ def test_generate_benchmark_matrix():
2121
"""\
2222
{
2323
"include": [
24+
{
25+
"runner": "linux.arm64.m7g.4xlarge",
26+
"models": "meta-llama/meta-llama-3.1-8b-instruct"
27+
},
2428
{
2529
"runner": "linux.24xl.spr-metal",
2630
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -80,6 +84,10 @@ def test_generate_benchmark_matrix():
8084
"""\
8185
{
8286
"include": [
87+
{
88+
"runner": "linux.arm64.m7g.4xlarge",
89+
"models": "meta-llama/meta-llama-3.1-8b-instruct"
90+
},
8391
{
8492
"runner": "linux.24xl.spr-metal",
8593
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -110,6 +118,10 @@ def test_generate_benchmark_matrix():
110118
"""\
111119
{
112120
"include": [
121+
{
122+
"runner": "linux.arm64.m7g.4xlarge",
123+
"models": "meta-llama/meta-llama-3.1-8b-instruct"
124+
},
113125
{
114126
"runner": "linux.24xl.spr-metal",
115127
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -145,6 +157,10 @@ def test_generate_benchmark_matrix():
145157
"""\
146158
{
147159
"include": [
160+
{
161+
"runner": "linux.arm64.m7g.4xlarge",
162+
"models": "meta-llama/meta-llama-3.1-8b-instruct"
163+
},
148164
{
149165
"runner": "linux.24xl.spr-metal",
150166
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -172,6 +188,10 @@ def test_generate_benchmark_matrix():
172188
"""\
173189
{
174190
"include": [
191+
{
192+
"runner": "linux.arm64.m7g.4xlarge",
193+
"models": "meta-llama/meta-llama-3.1-8b-instruct"
194+
},
175195
{
176196
"runner": "linux.24xl.spr-metal",
177197
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -225,7 +245,7 @@ def test_generate_benchmark_matrix():
225245

226246
# Select multiple runners
227247
models = []
228-
runners = ["h100", "spr"]
248+
runners = ["h100", "spr", "m7g"]
229249
output = json.dumps(
230250
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
231251
)
@@ -234,6 +254,10 @@ def test_generate_benchmark_matrix():
234254
"""\
235255
{
236256
"include": [
257+
{
258+
"runner": "linux.arm64.m7g.4xlarge",
259+
"models": "meta-llama/meta-llama-3.1-8b-instruct"
260+
},
237261
{
238262
"runner": "linux.24xl.spr-metal",
239263
"models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -356,7 +380,7 @@ def test_generate_benchmark_matrix():
356380
"meta-llama/meta-llama-3.1-8b-instruct",
357381
"mistralai/mixtral-8x7b-instruct-v0.1",
358382
]
359-
runners = ["rocm", "spr"]
383+
runners = ["rocm", "spr", "m7g"]
360384
output = json.dumps(
361385
generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
362386
)
@@ -365,6 +389,10 @@ def test_generate_benchmark_matrix():
365389
"""\
366390
{
367391
"include": [
392+
{
393+
"runner": "linux.arm64.m7g.4xlarge",
394+
"models": "meta-llama/meta-llama-3.1-8b-instruct"
395+
},
368396
{
369397
"runner": "linux.24xl.spr-metal",
370398
"models": "meta-llama/meta-llama-3.1-8b-instruct"

.github/workflows/vllm-benchmark.yml

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ on:
2525
A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
2626
required: true
2727
type: string
28-
default: h100,rocm,spr,b200
28+
default: h100,rocm,spr,b200,m7g
2929
pull_request:
3030
paths:
3131
- .github/workflows/vllm-benchmark.yml
@@ -104,8 +104,17 @@ jobs:
104104
elif command -v rocm-smi; then
105105
DEVICE_NAME=rocm
106106
rocm-smi
107-
else
108-
DEVICE_NAME=cpu
107+
else
108+
arch=$(uname -m)
109+
110+
case "$arch" in
111+
aarch64|arm64)
112+
DEVICE_NAME=arm64-cpu
113+
;;
114+
*)
115+
DEVICE_NAME=cpu
116+
;;
117+
esac
109118
lscpu
110119
fi
111120
echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
@@ -122,6 +131,8 @@ jobs:
122131
DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
123132
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
124133
DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
134+
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
135+
DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
125136
fi
126137
echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
127138
@@ -157,6 +168,8 @@ jobs:
157168
DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
158169
elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
159170
DOCKER_IMAGE_SUFFIX=-cpu
171+
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
172+
DOCKER_IMAGE_SUFFIX=-arm64-cpu
160173
fi
161174
echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
162175
echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
@@ -266,8 +279,12 @@ jobs:
266279
run: |
267280
set -eux
268281
282+
ON_ARM64_CPU=0
283+
269284
if [[ "${DEVICE_NAME}" == "cpu" ]]; then
270285
ON_CPU=1
286+
elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
287+
ON_ARM64_CPU=1
271288
else
272289
ON_CPU=0
273290
fi
@@ -283,6 +300,7 @@ jobs:
283300
-e ENGINE_VERSION \
284301
-e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
285302
-e ON_CPU="${ON_CPU}" \
303+
-e ON_ARM64_CPU="${ON_ARM64_CPU}" \
286304
--ipc=host \
287305
--tty \
288306
--detach \

LICENSE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ MIT License
22

33
Copyright (c) Facebook, Inc. and its affiliates.
44

5+
All contributions by Arm:
6+
Copyright (c) 2025 Arm Limited and/or its affiliates
7+
58
Permission is hereby granted, free of charge, to any person obtaining a copy
69
of this software and associated documentation files (the "Software"), to deal
710
in the Software without restriction, including without limitation the rights
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
[
2+
{
3+
"test_name": "latency_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"num_iters_warmup": 5,
13+
"num_iters": 15
14+
}
15+
},
16+
{
17+
"test_name": "latency_llama8B_tp4",
18+
"environment_variables": {
19+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
20+
"VLLM_CPU_KVCACHE_SPACE": 40
21+
},
22+
"parameters": {
23+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
24+
"tensor_parallel_size": 4,
25+
"load_format": "dummy",
26+
"num_iters_warmup": 5,
27+
"num_iters": 15
28+
}
29+
}
30+
]
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
[
2+
{
3+
"test_name": "serving_llama8B_tp1_sharegpt",
4+
"qps_list": [1, 4, 16, "inf"],
5+
"server_environment_variables": {
6+
"VLLM_RPC_TIMEOUT": 100000,
7+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
8+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
9+
"VLLM_CPU_KVCACHE_SPACE": 40
10+
},
11+
"server_parameters": {
12+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
13+
"tensor_parallel_size": 1,
14+
"device": "cpu",
15+
"dtype": "bfloat16",
16+
"distributed_executor_backend": "mp",
17+
"block_size": 16,
18+
"trust_remote_code": "",
19+
"disable_log_stats": "",
20+
"disable_log_requests": "",
21+
"load_format": "dummy"
22+
},
23+
"client_parameters": {
24+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
25+
"backend": "vllm",
26+
"dataset_name": "sharegpt",
27+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200
29+
}
30+
},
31+
{
32+
"test_name": "serving_llama8B_tp2_sharegpt",
33+
"qps_list": [1, 4, 16, "inf"],
34+
"server_environment_variables": {
35+
"VLLM_RPC_TIMEOUT": 100000,
36+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
37+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
38+
"VLLM_CPU_KVCACHE_SPACE": 40
39+
},
40+
"server_parameters": {
41+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
42+
"tensor_parallel_size": 2,
43+
"device": "cpu",
44+
"dtype": "bfloat16",
45+
"distributed_executor_backend": "mp",
46+
"block_size": 16,
47+
"trust_remote_code": "",
48+
"disable_log_stats": "",
49+
"disable_log_requests": "",
50+
"load_format": "dummy"
51+
},
52+
"client_parameters": {
53+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
54+
"backend": "vllm",
55+
"dataset_name": "sharegpt",
56+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
57+
"num_prompts": 200
58+
}
59+
},
60+
{
61+
"test_name": "serving_llama8B_tp4_sharegpt",
62+
"qps_list": [1, 4, 16, "inf"],
63+
"server_environment_variables": {
64+
"VLLM_RPC_TIMEOUT": 100000,
65+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
66+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
67+
"VLLM_CPU_KVCACHE_SPACE": 40
68+
},
69+
"server_parameters": {
70+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
71+
"tensor_parallel_size": 4,
72+
"device": "cpu",
73+
"dtype": "bfloat16",
74+
"distributed_executor_backend": "mp",
75+
"block_size": 16,
76+
"trust_remote_code": "",
77+
"disable_log_stats": "",
78+
"disable_log_requests": "",
79+
"load_format": "dummy"
80+
},
81+
"client_parameters": {
82+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
83+
"backend": "vllm",
84+
"dataset_name": "sharegpt",
85+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
86+
"num_prompts": 200
87+
}
88+
},
89+
{
90+
"test_name": "serving_llama8B_tp4_random_1024_128",
91+
"qps_list": [1, 4, 16, "inf"],
92+
"server_environment_variables": {
93+
"VLLM_RPC_TIMEOUT": 100000,
94+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
95+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
96+
"VLLM_CPU_KVCACHE_SPACE": 40
97+
},
98+
"server_parameters": {
99+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
100+
"tensor_parallel_size": 4,
101+
"device": "cpu",
102+
"dtype": "bfloat16",
103+
"distributed_executor_backend": "mp",
104+
"block_size": 16,
105+
"trust_remote_code": "",
106+
"enable_chunked_prefill": "",
107+
"disable_log_stats": "",
108+
"disable_log_requests": "",
109+
"load_format": "dummy"
110+
},
111+
"client_parameters": {
112+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
113+
"backend": "vllm",
114+
"dataset_name": "random",
115+
"random-input-len": 1024,
116+
"random-output-len": 128,
117+
"ignore-eos": "",
118+
"num_prompts": 100
119+
}
120+
}
121+
]
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[
2+
{
3+
"test_name": "throughput_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
13+
"num_prompts": 200,
14+
"backend": "vllm"
15+
}
16+
},
17+
{
18+
"test_name": "throughput_llama8B_tp4",
19+
"environment_variables": {
20+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
21+
"VLLM_CPU_KVCACHE_SPACE": 40
22+
},
23+
"parameters": {
24+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
25+
"tensor_parallel_size": 4,
26+
"load_format": "dummy",
27+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200,
29+
"backend": "vllm"
30+
}
31+
}
32+
]

0 commit comments

Comments
 (0)