
Commit b66b1d3

Enable aarch64 CPU performance benchmarks
1 parent (e246ad6), commit b66b1d3

File tree (5 files changed: +229, -2 lines)

- .buildkite/nightly-benchmarks/README.md
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
- .buildkite/nightly-benchmarks/tests/latency-tests-arm64-cpu.json
- .buildkite/nightly-benchmarks/tests/serving-tests-arm64-cpu.json
- .buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json

.buildkite/nightly-benchmarks/README.md (4 additions, 2 deletions)
```diff
@@ -11,7 +11,8 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (support for the FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (support for the FP8 benchmark on H100 is coming!),
+Intel® Xeon® Processors and ARM® Neoverse®, with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -41,6 +42,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 Runtime environment variables:
 
 - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_ARM64_CPU`: set the value to '1' on ARM® Neoverse®. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
@@ -54,7 +56,7 @@ Nightly benchmark will be triggered when:
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
-> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. For ARM® Neoverse®, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
 >
 ### Latency test
 
```
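For reference, a minimal invocation on an aarch64 host might look like the sketch below. The script path comes from the README above; the `HF_TOKEN` requirement comes from the script's `check_hf_token` step, and the token value is a placeholder.

```bash
# Minimal sketch: run the nightly performance benchmarks on an aarch64 CPU host.
export HF_TOKEN=...        # valid Hugging Face token (checked by check_hf_token)
export ON_ARM64_CPU=1      # select the Arm CPU code path instead of GPU
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```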

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh (42 additions, 0 deletions)
```diff
@@ -45,6 +45,20 @@ check_cpus() {
   echo "GPU type is $gpu_type"
 }
 
+check_arm64_cpus() {
+  # check the number of CPUs and GPU type.
+  declare -g cpu_count=$(nproc)
+  if [[ $cpu_count -gt 0 ]]; then
+    echo "CPU found."
+    echo $cpu_count
+  else
+    echo "Need at least 1 CPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type="arm64-cpu"
+  echo "GPU type is $gpu_type"
+}
+
 check_hf_token() {
   # check if HF_TOKEN is available and valid
   if [[ -z "$HF_TOKEN" ]]; then
```
```diff
@@ -201,6 +215,14 @@ run_latency_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    else
+      if [ "$ON_ARM64_CPU" == "1" ]; then
+        pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+        world_size=$(($tp*$pp))
+        if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+          echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
+          continue
+        fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
```
```diff
@@ -270,6 +292,14 @@ run_throughput_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    else
+      if [ "$ON_ARM64_CPU" == "1" ]; then
+        pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+        world_size=$(($tp*$pp))
+        if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+          echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
+          continue
+        fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
```
```diff
@@ -350,6 +380,14 @@ run_serving_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    else
+      if [ "$ON_ARM64_CPU" == "1" ]; then
+        pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+        world_size=$(($tp*$pp))
+        if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+          echo "Required world-size $world_size but only $cpu_count CPU nodes found. Skip testcase $test_name."
+          continue
+        fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
```
```diff
@@ -449,6 +487,10 @@ main() {
   if [ "$ON_CPU" == "1" ];then
     check_cpus
     ARCH='-cpu'
+  else
+    if [ "$ON_ARM64_CPU" == "1" ];then
+      check_arm64_cpus
+      ARCH='-arm64-cpu'
   else
     check_gpus
   fi
```
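`main()` only sets an `ARCH` suffix; the per-suite runners then pick up the matching test files. The exact wiring is outside this diff, but assuming the naming convention from the README note, the selection plausibly reduces to:

```bash
# Sketch (wiring not shown in this diff): the ARCH suffix selects the test files,
# following the file-naming convention documented in the README.
ARCH='-arm64-cpu'
echo "tests/latency-tests${ARCH}.json"     # tests/latency-tests-arm64-cpu.json
echo "tests/throughput-tests${ARCH}.json"  # tests/throughput-tests-arm64-cpu.json
echo "tests/serving-tests${ARCH}.json"     # tests/serving-tests-arm64-cpu.json
```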
.buildkite/nightly-benchmarks/tests/latency-tests-arm64-cpu.json (new file, 30 additions)
```json
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
```
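This file is the default for `ON_ARM64_CPU=1` runs; per the README's runtime environment variables, it can be swapped out without editing the repo (the path below is a hypothetical example):

```bash
# Point the latency suite at a custom test file instead of the default above.
export ON_ARM64_CPU=1
export LATENCY_JSON=/path/to/custom-latency-tests.json   # hypothetical path
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```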
.buildkite/nightly-benchmarks/tests/serving-tests-arm64-cpu.json (new file, 121 additions)
```json
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama8B_tp4_random_1024_128",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "device": "cpu",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 16,
      "trust_remote_code": "",
      "enable_chunked_prefill": "",
      "disable_log_stats": "",
      "disable_log_requests": "",
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "random",
      "random-input-len": 1024,
      "random-output-len": 128,
      "ignore-eos": "",
      "num_prompts": 100
    }
  }
]
```
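Keys in `server_parameters` and `client_parameters` are converted into CLI flags, which is why empty-string values (such as `"trust_remote_code": ""`) act as boolean switches and why hyphenated keys (`random-input-len`) work alongside underscored ones. The repo's actual helper is not part of this diff; a minimal jq sketch of the idea:

```bash
# Minimal sketch of JSON-to-flags conversion (the benchmark runner uses a
# similar helper; this exact function is an illustration, not the repo's code).
json2args() {
  echo "$1" | jq -r '
    to_entries
    | map("--" + (.key | gsub("_"; "-"))
          + (if .value == "" then "" else " " + (.value | tostring) end))
    | join(" ")'
}

params='{"tensor_parallel_size": 2, "dtype": "bfloat16", "trust_remote_code": ""}'
json2args "$params"
# prints: --tensor-parallel-size 2 --dtype bfloat16 --trust-remote-code
```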
.buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json (new file, 32 additions)
```json
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  },
  {
    "test_name": "throughput_llama8B_tp4",
    "environment_variables": {
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
```
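Both the throughput and serving configs expect `./ShareGPT_V3_unfiltered_cleaned_split.json` in the working directory. The commit does not include a download step; the source below is the mirror commonly used in vLLM docs, and is an assumption worth verifying:

```bash
# Fetch the ShareGPT dataset the throughput/serving tests reference.
# URL is the mirror commonly used in vLLM docs; verify availability before use.
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```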
