diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index e6f5c8b60f45..b8ccfe9eb382 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -11,7 +11,8 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!),
+Intel® Xeon® Processors and Arm® Neoverse®, with different models.
 
 **Benchmarking Duration**: about 1hr.
 
@@ -41,6 +42,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
 Runtime environment variables:
 
 - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
+- `ON_ARM64_CPU`: set the value to '1' on Arm® Neoverse®. Default value is 0.
 - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
 - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
 - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
@@ -54,7 +56,7 @@ Nightly benchmark will be triggered when:
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 
-> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. For Arm® Neoverse®, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead.
 >
 ### Latency test
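For local runs, the new flag composes with the existing workflow as follows. A minimal sketch, assuming a vLLM checkout with the benchmark dependencies installed and `HF_TOKEN` exported; the explicit `*_JSON` overrides are optional, since `ON_ARM64_CPU=1` already selects the `-arm64-cpu` test files by default:

```bash
# Run the nightly performance benchmarks on an Arm® Neoverse® host,
# invoked from the vLLM repo root.
export ON_ARM64_CPU=1
# Optional: pin the arm64 test definitions explicitly; with ON_ARM64_CPU=1
# these files are already picked up by default via the '-arm64-cpu' suffix.
export LATENCY_JSON=.buildkite/nightly-benchmarks/tests/latency-tests-arm64-cpu.json
export THROUGHPUT_JSON=.buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json
export SERVING_JSON=.buildkite/nightly-benchmarks/tests/serving-tests-arm64-cpu.json
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```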
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index c64e5638029e..252f01a968e4 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -45,6 +45,20 @@ check_cpus() {
   echo "GPU type is $gpu_type"
 }
 
+check_arm64_cpus() {
+  # check the number of CPUs; gpu_type is a placeholder for Arm CPU runs.
+  declare -g cpu_count=$(nproc)
+  if [[ $cpu_count -gt 0 ]]; then
+    echo "CPU found."
+    echo "$cpu_count"
+  else
+    echo "Need at least 1 CPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type="arm64-cpu"
+  echo "GPU type is $gpu_type"
+}
+
 check_hf_token() {
   # check if HF_TOKEN is available and valid
   if [[ -z "$HF_TOKEN" ]]; then
@@ -201,6 +215,13 @@ run_latency_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    elif [ "$ON_ARM64_CPU" == "1" ]; then
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+      world_size=$(($tp*$pp))
+      if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $cpu_count CPUs found. Skip testcase $test_name."
+        continue
+      fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -270,6 +291,13 @@ run_throughput_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    elif [ "$ON_ARM64_CPU" == "1" ]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+      world_size=$(($tp*$pp))
+      if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $cpu_count CPUs found. Skip testcase $test_name."
+        continue
+      fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -350,6 +378,13 @@ run_serving_tests() {
         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
         continue
       fi
+    elif [ "$ON_ARM64_CPU" == "1" ]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+      world_size=$(($tp*$pp))
+      if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $cpu_count CPUs found. Skip testcase $test_name."
+        continue
+      fi
     else
       if [[ $gpu_count -lt $tp ]]; then
         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
@@ -449,6 +484,9 @@ main() {
   if [ "$ON_CPU" == "1" ];then
     check_cpus
     ARCH='-cpu'
+  elif [ "$ON_ARM64_CPU" == "1" ];then
+    check_arm64_cpus
+    ARCH='-arm64-cpu'
   else
     check_gpus
   fi
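The resource gate added to each `run_*_tests` function can be exercised in isolation. A standalone sketch follows (requires `jq`); the mock `params` object stands in for one test entry's parameters, with `pipeline_parallel_size` supplied explicitly since the arm64 test files below do not set it. When the key is absent, `jq -r` returns the string `null`, which bash arithmetic silently evaluates to 0.

```bash
#!/usr/bin/env bash
# Standalone sketch of the world-size gate used above (requires jq).
params='{"tensor_parallel_size": 4, "pipeline_parallel_size": 1}'
tp=$(echo "$params" | jq -r '.tensor_parallel_size')
pp=$(echo "$params" | jq -r '.pipeline_parallel_size')
world_size=$((tp * pp))
cpu_count=$(nproc)
# Skip the test case unless enough CPUs are present or a remote host is configured.
if [[ $cpu_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
  echo "Required world-size $world_size but only $cpu_count CPUs found. Skipping."
fi
```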
"./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 16, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 16, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 16, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 100 + } + } +] diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json new file mode 100644 index 000000000000..48c015aa8403 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-arm64-cpu.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + 
"num_prompts": 200, + "backend": "vllm" + } + } +]